From 999a8fb812df766c5cdfda3cea8e71548e89fc8b Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Wed, 16 Jul 2025 10:29:11 +0200 Subject: [PATCH 001/813] Revert "[libc][NFC] refactor Cortex `memcpy` code" (#149035) Reverts llvm/llvm-project#148204 `libc-arm32-qemu-debian-dbg` is failing, reverting and investigating --- libc/src/string/memory_utils/CMakeLists.txt | 1 - libc/src/string/memory_utils/arm/common.h | 52 ----- .../string/memory_utils/arm/inline_memcpy.h | 195 +++++++++--------- .../llvm-project-overlay/libc/BUILD.bazel | 1 - 4 files changed, 98 insertions(+), 151 deletions(-) delete mode 100644 libc/src/string/memory_utils/arm/common.h diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt index 633d9f12949d2..a967247db53f4 100644 --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -7,7 +7,6 @@ add_header_library( aarch64/inline_memcpy.h aarch64/inline_memmove.h aarch64/inline_memset.h - arm/common.h arm/inline_memcpy.h generic/aligned_access.h generic/byte_per_byte.h diff --git a/libc/src/string/memory_utils/arm/common.h b/libc/src/string/memory_utils/arm/common.h deleted file mode 100644 index 155bc3481709e..0000000000000 --- a/libc/src/string/memory_utils/arm/common.h +++ /dev/null @@ -1,52 +0,0 @@ -//===-- Common constants and defines for arm --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H - -#include "src/__support/macros/attributes.h" // LIBC_INLINE_VAR -#include "src/string/memory_utils/utils.h" // CPtr, Ptr, distance_to_align - -#include // size_t - -// https://libc.llvm.org/compiler_support.html -// Support for [[likely]] / [[unlikely]] -// [X] GCC 12.2 -// [X] Clang 12 -// [ ] Clang 11 -#define LIBC_ATTR_LIKELY [[likely]] -#define LIBC_ATTR_UNLIKELY [[unlikely]] - -#if defined(LIBC_COMPILER_IS_CLANG) -#if LIBC_COMPILER_CLANG_VER < 1200 -#undef LIBC_ATTR_LIKELY -#undef LIBC_ATTR_UNLIKELY -#define LIBC_ATTR_LIKELY -#define LIBC_ATTR_UNLIKELY -#endif -#endif - -namespace LIBC_NAMESPACE_DECL { - -LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t); - -enum class AssumeAccess { kUnknown, kAligned }; -enum class BlockOp { kFull, kByWord }; - -LIBC_INLINE auto misaligned(CPtr ptr) { - return distance_to_align_down(ptr); -} - -LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) { - return cpp::bit_cast(cpp::bit_cast(a) | - cpp::bit_cast(b)); -} - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H diff --git a/libc/src/string/memory_utils/arm/inline_memcpy.h b/libc/src/string/memory_utils/arm/inline_memcpy.h index 30b99d41e0967..61efebe29b485 100644 --- a/libc/src/string/memory_utils/arm/inline_memcpy.h +++ b/libc/src/string/memory_utils/arm/inline_memcpy.h @@ -5,56 +5,63 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// The functions defined in this file give approximate code size. 
These sizes -// assume the following configuration options: -// - LIBC_CONF_KEEP_FRAME_POINTER = false -// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false -// - LIBC_ADD_NULL_CHECKS = false #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL -#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY #include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align #include // size_t +// https://libc.llvm.org/compiler_support.html +// Support for [[likely]] / [[unlikely]] +// [X] GCC 12.2 +// [X] Clang 12 +// [ ] Clang 11 +#define LIBC_ATTR_LIKELY [[likely]] +#define LIBC_ATTR_UNLIKELY [[unlikely]] + +#if defined(LIBC_COMPILER_IS_CLANG) +#if LIBC_COMPILER_CLANG_VER < 1200 +#undef LIBC_ATTR_LIKELY +#undef LIBC_ATTR_UNLIKELY +#define LIBC_ATTR_LIKELY +#define LIBC_ATTR_UNLIKELY +#endif +#endif + namespace LIBC_NAMESPACE_DECL { namespace { -// Performs a copy of `bytes` byte from `src` to `dst`. This function has the -// semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is -// free to use whatever instruction is best for the size and assumed access. -template -LIBC_INLINE void copy(void *dst, const void *src) { - if constexpr (access == AssumeAccess::kAligned) { - constexpr size_t alignment = bytes > kWordSize ? 
kWordSize : bytes; - memcpy_inline(assume_aligned(dst), - assume_aligned(src)); - } else if constexpr (access == AssumeAccess::kUnknown) { - memcpy_inline(dst, src); - } else { - static_assert(false); - } -} +LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t); -template -LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) { - if constexpr (block_op == BlockOp::kFull) { - copy(dst, src); - } else if constexpr (block_op == BlockOp::kByWord) { +enum Strategy { + ForceWordLdStChain, + AssumeWordAligned, + AssumeUnaligned, +}; + +template +LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) { + if constexpr (strategy == AssumeUnaligned) { + memcpy_inline(assume_aligned<1>(dst), assume_aligned<1>(src)); + } else if constexpr (strategy == AssumeWordAligned) { + static_assert(bytes >= kWordSize); + memcpy_inline(assume_aligned(dst), + assume_aligned(src)); + } else if constexpr (strategy == ForceWordLdStChain) { // We restrict loads/stores to 4 byte to prevent the use of load/store - // multiple (LDM, STM) and load/store double (LDRD, STRD). + // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may + // fault (see notes below) and second, they use more registers which in turn + // adds push/pop instructions in the hot path. static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize)); LIBC_LOOP_UNROLL - for (size_t offset = 0; offset < bytes; offset += kWordSize) { - copy(dst + offset, src + offset); + for (size_t i = 0; i < bytes / kWordSize; ++i) { + const size_t offset = i * kWordSize; + memcpy_inline(dst + offset, src + offset); } - } else { - static_assert(false, "Invalid BlockOp"); } // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting // into the load/store instructions. 
@@ -65,27 +72,39 @@ LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) { src += bytes; } -template -LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) { +LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, + const size_t size) { LIBC_LOOP_NOUNROLL - for (size_t i = 0; i < size / bytes; ++i) - copy_block_and_bump_pointers(dst, src); - size %= bytes; + for (size_t i = 0; i < size; ++i) + *dst++ = *src++; } -[[maybe_unused]] LIBC_INLINE void -copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { +template +LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src, + size_t &size) { LIBC_LOOP_NOUNROLL - for (size_t i = 0; i < size; ++i) - *dst++ = *src++; + for (size_t i = 0; i < size / block_size; ++i) + copy_and_bump_pointers(dst, src); + // Update `size` once at the end instead of once per iteration. + size %= block_size; +} + +LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) { + return cpp::bit_cast(cpp::bit_cast(a) | + cpp::bit_cast(b)); +} + +LIBC_INLINE auto misaligned(CPtr a) { + return distance_to_align_down(a); } } // namespace -// Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned -// loads/stores. It compiles down to 208 bytes when used through `memcpy` that -// also needs to return the `dst` ptr. -// Note: +// Implementation for Cortex-M0, M0+, M1. +// Notes: +// - It compiles down to 196 bytes, but 220 bytes when used through `memcpy` +// that also needs to return the `dst` ptr. +// - These cores do not allow for unaligned loads/stores. // - When `src` and `dst` are coaligned, we start by aligning them and perform // bulk copies. We let the compiler know the pointers are aligned so it can // use load/store multiple (LDM, STM). This significantly increase throughput @@ -106,18 +125,9 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { if (src_alignment == 0) LIBC_ATTR_LIKELY { // Both `src` and `dst` are now word-aligned. 
- // We first copy by blocks of 64 bytes, the compiler will use 4 - // load/store multiple (LDM, STM), each of 4 words. This requires more - // registers so additional push/pop are needed but the speedup is worth - // it. - consume_by_block<64, BlockOp::kFull, AssumeAccess::kAligned>(dst, src, - size); - // Then we use blocks of 4 word load/store. - consume_by_block<16, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src, - size); - // Then we use word by word copy. - consume_by_block<4, BlockOp::kByWord, AssumeAccess::kAligned>(dst, src, - size); + copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size); + copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size); + copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size); } else { // `dst` is aligned but `src` is not. @@ -128,7 +138,7 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { src_alignment == 2 ? load_aligned(src) : load_aligned(src); - copy(dst, &value); + memcpy_inline(assume_aligned(dst), &value); dst += kWordSize; src += kWordSize; size -= kWordSize; @@ -141,8 +151,17 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { } // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware -// support for unaligned loads and stores. It compiles down to 272 bytes when -// used through `memcpy` that also needs to return the `dst` ptr. +// support for unaligned loads and stores. +// Notes: +// - It compiles down to 266 bytes. +// - `dst` and `src` are not `__restrict` to prevent the compiler from +// reordering loads/stores. +// - We keep state variables to a strict minimum to keep everything in the free +// registers and prevent costly push / pop. +// - If unaligned single loads/stores to normal memory are supported, unaligned +// accesses for load/store multiple (LDM, STM) and load/store double (LDRD, +// STRD) instructions are generally not supported and will still fault so we +// make sure to restrict unrolling to word loads/stores. 
[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src, size_t size) { if (misaligned(bitwise_or(src, dst))) @@ -150,60 +169,38 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { if (size < 8) LIBC_ATTR_UNLIKELY { if (size & 1) - copy_block_and_bump_pointers<1>(dst, src); + copy_and_bump_pointers<1>(dst, src); if (size & 2) - copy_block_and_bump_pointers<2>(dst, src); + copy_and_bump_pointers<2>(dst, src); if (size & 4) - copy_block_and_bump_pointers<4>(dst, src); + copy_and_bump_pointers<4>(dst, src); return; } if (misaligned(src)) LIBC_ATTR_UNLIKELY { const size_t offset = distance_to_align_up(dst); if (offset & 1) - copy_block_and_bump_pointers<1>(dst, src); + copy_and_bump_pointers<1>(dst, src); if (offset & 2) - copy_block_and_bump_pointers<2>(dst, src); + copy_and_bump_pointers<2>(dst, src); size -= offset; } } - // `dst` and `src` are not necessarily both aligned at that point but this - // implementation assumes hardware support for unaligned loads and stores so - // it is still fast to perform unrolled word by word copy. Note that wider - // accesses through the use of load/store multiple (LDM, STM) and load/store - // double (LDRD, STRD) instructions are generally not supported and can fault. - // By forcing decomposition of 64 bytes copy into word by word copy, the - // compiler can use the first load to prefetch memory: - // ldr r3, [r1, #64]! <- prefetch next cache line - // str r3, [r0] - // ldr r3, [r1, #0x4] - // str r3, [r0, #0x4] - // ... - // ldr r3, [r1, #0x3c] - // str r3, [r0, #0x3c] - // This is a bit detrimental for sizes between 64 and 256 (less than 10% - // penalty) but the prefetch yields better throughput for larger copies. 
- consume_by_block<64, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, - size); - consume_by_block<16, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, - size); - consume_by_block<4, BlockOp::kByWord, AssumeAccess::kUnknown>(dst, src, size); + copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size); + copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size); + copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size); if (size & 1) - copy_block_and_bump_pointers<1>(dst, src); + copy_and_bump_pointers<1>(dst, src); if (size & 2) - copy_block_and_bump_pointers<2>(dst, src); + LIBC_ATTR_UNLIKELY + copy_and_bump_pointers<2>(dst, src); } -[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src, +[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_, + const void *__restrict src_, size_t size) { - // The compiler performs alias analysis and is able to prove that `dst` and - // `src` do not alias by propagating the `__restrict` keyword from the - // `memcpy` prototype. This allows the compiler to merge consecutive - // load/store (LDR, STR) instructions generated in - // `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store - // double (LDRD, STRD) instructions, this is is undesirable so we prevent the - // compiler from inferring `__restrict` with the following line. 
- asm volatile("" : "+r"(dst), "+r"(src)); + Ptr dst = cpp::bit_cast(dst_); + CPtr src = cpp::bit_cast(src_); #ifdef __ARM_FEATURE_UNALIGNED return inline_memcpy_arm_mid_end(dst, src, size); #else @@ -213,4 +210,8 @@ copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { } // namespace LIBC_NAMESPACE_DECL +// Cleanup local macros +#undef LIBC_ATTR_LIKELY +#undef LIBC_ATTR_UNLIKELY + #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 9e288f7fec0a8..fe843d3207ceb 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -4448,7 +4448,6 @@ libc_support_library( "src/string/memory_utils/aarch64/inline_memcpy.h", "src/string/memory_utils/aarch64/inline_memmove.h", "src/string/memory_utils/aarch64/inline_memset.h", - "src/string/memory_utils/arm/common.h", "src/string/memory_utils/arm/inline_memcpy.h", "src/string/memory_utils/generic/aligned_access.h", "src/string/memory_utils/generic/byte_per_byte.h", From 50096134f869f4ccb5e5f650edd0dd512c5a751d Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 16 Jul 2025 10:33:21 +0200 Subject: [PATCH 002/813] [libc++][NFC] Remove some __tree internal accessor functions (#147266) --- libcxx/include/__tree | 100 ++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 53 deletions(-) diff --git a/libcxx/include/__tree b/libcxx/include/__tree index f29b691b73dda..f8bb4f01b1e29 100644 --- a/libcxx/include/__tree +++ b/libcxx/include/__tree @@ -855,17 +855,11 @@ public: private: _LIBCPP_HIDE_FROM_ABI const __node_allocator& __node_alloc() const _NOEXCEPT { return __node_alloc_; } - _LIBCPP_HIDE_FROM_ABI __end_node_pointer& __begin_node() _NOEXCEPT { return __begin_node_; } - _LIBCPP_HIDE_FROM_ABI const __end_node_pointer& __begin_node() const _NOEXCEPT { return __begin_node_; } public: 
_LIBCPP_HIDE_FROM_ABI allocator_type __alloc() const _NOEXCEPT { return allocator_type(__node_alloc()); } -private: - _LIBCPP_HIDE_FROM_ABI size_type& size() _NOEXCEPT { return __size_; } - -public: - _LIBCPP_HIDE_FROM_ABI const size_type& size() const _NOEXCEPT { return __size_; } + _LIBCPP_HIDE_FROM_ABI size_type size() const _NOEXCEPT { return __size_; } _LIBCPP_HIDE_FROM_ABI value_compare& value_comp() _NOEXCEPT { return __value_comp_; } _LIBCPP_HIDE_FROM_ABI const value_compare& value_comp() const _NOEXCEPT { return __value_comp_; } @@ -902,8 +896,8 @@ public: _LIBCPP_HIDE_FROM_ABI ~__tree(); - _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__begin_node()); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(__begin_node()); } + _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__begin_node_); } + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(__begin_node_); } _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(__end_node()); } _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(__end_node()); } @@ -1225,30 +1219,30 @@ template __tree<_Tp, _Compare, _Allocator>::__tree(const value_compare& __comp) _NOEXCEPT_( is_nothrow_default_constructible<__node_allocator>::value&& is_nothrow_copy_constructible::value) : __size_(0), __value_comp_(__comp) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } template __tree<_Tp, _Compare, _Allocator>::__tree(const allocator_type& __a) : __begin_node_(), __node_alloc_(__node_allocator(__a)), __size_(0) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } template __tree<_Tp, _Compare, _Allocator>::__tree(const value_compare& __comp, const allocator_type& __a) : __begin_node_(), __node_alloc_(__node_allocator(__a)), __size_(0), __value_comp_(__comp) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } -// Precondition: size() 
!= 0 +// Precondition: __size_ != 0 template typename __tree<_Tp, _Compare, _Allocator>::__node_pointer __tree<_Tp, _Compare, _Allocator>::_DetachedTreeCache::__detach_from_tree(__tree* __t) _NOEXCEPT { - __node_pointer __cache = static_cast<__node_pointer>(__t->__begin_node()); - __t->__begin_node() = __t->__end_node(); + __node_pointer __cache = static_cast<__node_pointer>(__t->__begin_node_); + __t->__begin_node_ = __t->__end_node(); __t->__end_node()->__left_->__parent_ = nullptr; __t->__end_node()->__left_ = nullptr; - __t->size() = 0; + __t->__size_ = 0; // __cache->__left_ == nullptr if (__cache->__right_ != nullptr) __cache = static_cast<__node_pointer>(__cache->__right_); @@ -1300,7 +1294,7 @@ void __tree<_Tp, _Compare, _Allocator>::__assign_unique(_ForwardIterator __first is_same<_ItValueType, value_type>::value, "__assign_unique may only be called with the containers value type"); static_assert( __has_forward_iterator_category<_ForwardIterator>::value, "__assign_unique requires a forward iterator"); - if (size() != 0) { + if (__size_ != 0) { _DetachedTreeCache __cache(this); for (; __cache.__get() != nullptr && __first != __last; ++__first) { if (__node_assign_unique(*__first, __cache.__get()).second) @@ -1318,7 +1312,7 @@ void __tree<_Tp, _Compare, _Allocator>::__assign_multi(_InputIterator __first, _ typedef typename _ITraits::value_type _ItValueType; static_assert( is_same<_ItValueType, value_type>::value, "__assign_multi may only be called with the containers value_type"); - if (size() != 0) { + if (__size_ != 0) { _DetachedTreeCache __cache(this); for (; __cache.__get() && __first != __last; ++__first) { __assign_value(__cache.__get()->__value_, *__first); @@ -1337,7 +1331,7 @@ __tree<_Tp, _Compare, _Allocator>::__tree(const __tree& __t) __node_alloc_(__node_traits::select_on_container_copy_construction(__t.__node_alloc())), __size_(0), __value_comp_(__t.value_comp()) { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } template @@ 
-1348,13 +1342,13 @@ __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t) _NOEXCEPT_( __node_alloc_(std::move(__t.__node_alloc_)), __size_(__t.__size_), __value_comp_(std::move(__t.__value_comp_)) { - if (size() == 0) - __begin_node() = __end_node(); + if (__size_ == 0) + __begin_node_ = __end_node(); else { __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - __t.__begin_node() = __t.__end_node(); + __t.__begin_node_ = __t.__end_node(); __t.__end_node()->__left_ = nullptr; - __t.size() = 0; + __t.__size_ = 0; } } @@ -1362,19 +1356,19 @@ template __tree<_Tp, _Compare, _Allocator>::__tree(__tree&& __t, const allocator_type& __a) : __node_alloc_(__node_allocator(__a)), __size_(0), __value_comp_(std::move(__t.value_comp())) { if (__a == __t.__alloc()) { - if (__t.size() == 0) - __begin_node() = __end_node(); + if (__t.__size_ == 0) + __begin_node_ = __end_node(); else { - __begin_node() = __t.__begin_node(); + __begin_node_ = __t.__begin_node_; __end_node()->__left_ = __t.__end_node()->__left_; __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - size() = __t.size(); - __t.__begin_node() = __t.__end_node(); + __size_ = __t.__size_; + __t.__begin_node_ = __t.__end_node(); __t.__end_node()->__left_ = nullptr; - __t.size() = 0; + __t.__size_ = 0; } } else { - __begin_node() = __end_node(); + __begin_node_ = __end_node(); } } @@ -1387,13 +1381,13 @@ void __tree<_Tp, _Compare, _Allocator>::__move_assign(__tree& __t, true_type) __move_assign_alloc(__t); __size_ = __t.__size_; __value_comp_ = std::move(__t.__value_comp_); - if (size() == 0) - __begin_node() = __end_node(); + if (__size_ == 0) + __begin_node_ = __end_node(); else { __end_node()->__left_->__parent_ = static_cast<__end_node_pointer>(__end_node()); - __t.__begin_node() = __t.__end_node(); + __t.__begin_node_ = __t.__end_node(); __t.__end_node()->__left_ = nullptr; - __t.size() = 0; + __t.__size_ = 0; } } @@ -1404,15 +1398,15 @@ void __tree<_Tp, 
_Compare, _Allocator>::__move_assign(__tree& __t, false_type) { else { value_comp() = std::move(__t.value_comp()); const_iterator __e = end(); - if (size() != 0) { + if (__size_ != 0) { _DetachedTreeCache __cache(this); - while (__cache.__get() != nullptr && __t.size() != 0) { + while (__cache.__get() != nullptr && __t.__size_ != 0) { __assign_value(__cache.__get()->__value_, std::move(__t.remove(__t.begin())->__value_)); __node_insert_multi(__cache.__get()); __cache.__advance(); } } - while (__t.size() != 0) { + while (__t.__size_ != 0) { __insert_multi_from_orphaned_node(__e, std::move(__t.remove(__t.begin())->__value_)); } } @@ -1460,12 +1454,12 @@ void __tree<_Tp, _Compare, _Allocator>::swap(__tree& __t) std::__swap_allocator(__node_alloc(), __t.__node_alloc()); swap(__size_, __t.__size_); swap(__value_comp_, __t.__value_comp_); - if (size() == 0) - __begin_node() = __end_node(); + if (__size_ == 0) + __begin_node_ = __end_node(); else __end_node()->__left_->__parent_ = __end_node(); - if (__t.size() == 0) - __t.__begin_node() = __t.__end_node(); + if (__t.__size_ == 0) + __t.__begin_node_ = __t.__end_node(); else __t.__end_node()->__left_->__parent_ = __t.__end_node(); } @@ -1473,8 +1467,8 @@ void __tree<_Tp, _Compare, _Allocator>::swap(__tree& __t) template void __tree<_Tp, _Compare, _Allocator>::clear() _NOEXCEPT { destroy(__root()); - size() = 0; - __begin_node() = __end_node(); + __size_ = 0; + __begin_node_ = __end_node(); __end_node()->__left_ = nullptr; } @@ -1664,10 +1658,10 @@ void __tree<_Tp, _Compare, _Allocator>::__insert_node_at( __new_node->__parent_ = __parent; // __new_node->__is_black_ is initialized in __tree_balance_after_insert __child = __new_node; - if (__begin_node()->__left_ != nullptr) - __begin_node() = static_cast<__end_node_pointer>(__begin_node()->__left_); + if (__begin_node_->__left_ != nullptr) + __begin_node_ = static_cast<__end_node_pointer>(__begin_node_->__left_); std::__tree_balance_after_insert(__end_node()->__left_, 
__child); - ++size(); + ++__size_; } template @@ -1811,9 +1805,9 @@ typename __tree<_Tp, _Compare, _Allocator>::iterator __tree<_Tp, _Compare, _Allocator>::__remove_node_pointer(__node_pointer __ptr) _NOEXCEPT { iterator __r(__ptr); ++__r; - if (__begin_node() == __ptr) - __begin_node() = __r.__ptr_; - --size(); + if (__begin_node_ == __ptr) + __begin_node_ = __r.__ptr_; + --__size_; std::__tree_remove(__end_node()->__left_, static_cast<__node_base_pointer>(__ptr)); return __r; } @@ -2177,13 +2171,13 @@ template typename __tree<_Tp, _Compare, _Allocator>::__node_holder __tree<_Tp, _Compare, _Allocator>::remove(const_iterator __p) _NOEXCEPT { __node_pointer __np = __p.__get_np(); - if (__begin_node() == __p.__ptr_) { + if (__begin_node_ == __p.__ptr_) { if (__np->__right_ != nullptr) - __begin_node() = static_cast<__end_node_pointer>(__np->__right_); + __begin_node_ = static_cast<__end_node_pointer>(__np->__right_); else - __begin_node() = static_cast<__end_node_pointer>(__np->__parent_); + __begin_node_ = static_cast<__end_node_pointer>(__np->__parent_); } - --size(); + --__size_; std::__tree_remove(__end_node()->__left_, static_cast<__node_base_pointer>(__np)); return __node_holder(__np, _Dp(__node_alloc(), true)); } From c364e6234e727cbf54ca666fa706693711a31119 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 16 Jul 2025 08:35:37 +0000 Subject: [PATCH 003/813] [gn build] Port 3e4153c97b54 --- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn index f21822f178655..4eab61b6e9ce2 100644 --- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn @@ -305,6 +305,7 @@ copy("Headers") { "riscv_bitmanip.h", "riscv_corev_alu.h", "riscv_crypto.h", + "riscv_nds.h", "riscv_ntlh.h", "rtmintrin.h", "s390intrin.h", From de4b458aa5e52812aa9c392f62a616b6c6c1716f Mon 
Sep 17 00:00:00 2001 From: Nashe Mncube Date: Wed, 16 Jul 2025 09:50:04 +0100 Subject: [PATCH 004/813] [AArch64][Codegen]Transform saturating smull to sqdmulh (#143671) This patch adds a pattern for recognizing saturating vector smull. Prior to this patch these were performed using a combination of smull+smull2+uzp+smin like the following ``` smull2 v5.2d, v1.4s, v2.4s smull v1.2d, v1.2s, v2.2s uzp2 v1.4s, v1.4s, v5.4s smin v1.4s, v1.4s, v0.4s ``` which now optimizes to ``` sqdmulh v0.4s, v1.4s, v0.4s ``` This only operates on vectors containing int32 and int16 types --- .../Target/AArch64/AArch64ISelLowering.cpp | 109 ++++++++- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 10 + .../CodeGen/AArch64/saturating-vec-smull.ll | 223 ++++++++++++++++++ 3 files changed, 340 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/saturating-vec-smull.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 235df9022c6fb..4f13a14d24649 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1143,6 +1143,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, ISD::SIGN_EXTEND_INREG, ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR, ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR}); + setTargetDAGCombine(ISD::SMIN); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::LOAD); @@ -2392,6 +2393,15 @@ static bool isIntImmediate(const SDNode *N, uint64_t &Imm) { return false; } +bool isVectorizedBinOp(unsigned Opcode) { + switch (Opcode) { + case AArch64ISD::SQDMULH: + return true; + default: + return false; + } +} + // isOpcWithIntImmediate - This method tests to see if the node is a specific // opcode and that it has a immediate integer right operand. // If so Imm will receive the value. 
@@ -20131,8 +20141,9 @@ static SDValue performConcatVectorsCombine(SDNode *N, // size, combine into an binop of two contacts of the source vectors. eg: // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d)) if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() && - DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() && - N1->hasOneUse()) { + (DAG.getTargetLoweringInfo().isBinOp(N0Opc) || + isVectorizedBinOp(N0Opc)) && + N0->hasOneUse() && N1->hasOneUse()) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); SDValue N10 = N1->getOperand(0); @@ -20991,6 +21002,98 @@ static SDValue performBuildVectorCombine(SDNode *N, return SDValue(); } +// A special combine for the sqdmulh family of instructions. +// smin( sra ( mul( sext v0, sext v1 ) ), SHIFT_AMOUNT ), +// SATURATING_VAL ) can be reduced to sqdmulh(...) +static SDValue trySQDMULHCombine(SDNode *N, SelectionDAG &DAG) { + + if (N->getOpcode() != ISD::SMIN) + return SDValue(); + + EVT DestVT = N->getValueType(0); + + if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 || + DestVT.isScalableVector()) + return SDValue(); + + ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1)); + + if (!Clamp) + return SDValue(); + + MVT ScalarType; + unsigned ShiftAmt = 0; + switch (Clamp->getSExtValue()) { + case (1ULL << 15) - 1: + ScalarType = MVT::i16; + ShiftAmt = 16; + break; + case (1ULL << 31) - 1: + ScalarType = MVT::i32; + ShiftAmt = 32; + break; + default: + return SDValue(); + } + + SDValue Sra = N->getOperand(0); + if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse()) + return SDValue(); + + ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1)); + if (!RightShiftVec) + return SDValue(); + unsigned SExtValue = RightShiftVec->getSExtValue(); + + if (SExtValue != (ShiftAmt - 1)) + return SDValue(); + + SDValue Mul = Sra.getOperand(0); + if (Mul.getOpcode() != ISD::MUL) + return SDValue(); + + SDValue SExt0 = Mul.getOperand(0); + 
SDValue SExt1 = Mul.getOperand(1); + + if (SExt0.getOpcode() != ISD::SIGN_EXTEND || + SExt1.getOpcode() != ISD::SIGN_EXTEND) + return SDValue(); + + EVT SExt0Type = SExt0.getOperand(0).getValueType(); + EVT SExt1Type = SExt1.getOperand(0).getValueType(); + + if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType || + SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() || + SExt0Type.getVectorNumElements() == 1) + return SDValue(); + + SDLoc DL(N); + SDValue V0 = SExt0.getOperand(0); + SDValue V1 = SExt1.getOperand(0); + + // Ensure input vectors are extended to legal types + if (SExt0Type.getFixedSizeInBits() < 64) { + unsigned VecNumElements = SExt0Type.getVectorNumElements(); + EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements), + VecNumElements); + V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0); + V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1); + } + + SDValue SQDMULH = + DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1); + + return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH); +} + +static SDValue performSMINCombine(SDNode *N, SelectionDAG &DAG) { + if (SDValue V = trySQDMULHCombine(N, DAG)) { + return V; + } + + return SDValue(); +} + static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); @@ -26742,6 +26845,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performAddSubCombine(N, DCI); case ISD::BUILD_VECTOR: return performBuildVectorCombine(N, DCI, DAG); + case ISD::SMIN: + return performSMINCombine(N, DAG); case ISD::TRUNCATE: return performTruncateCombine(N, DAG, DCI); case AArch64ISD::ANDS: diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ddc685fae5e9a..ce91b72fa24e5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1022,6 +1022,7 @@ def AArch64smull : 
SDNode<"AArch64ISD::SMULL", SDT_AArch64mull, [SDNPCommutative]>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull, [SDNPCommutative]>; +def AArch64sqdmulh : SDNode<"AArch64ISD::SQDMULH", SDT_AArch64mull>; // Reciprocal estimates and steps. def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>; @@ -9439,6 +9440,15 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)), (EXTRACT_SUBREG V128:$Rm, dsub)), (UMULLv4i32_v2i64 V128:$Rn, V128:$Rm))>; +def : Pat<(v4i16 (AArch64sqdmulh (v4i16 V64:$Rn), (v4i16 V64:$Rm))), + (SQDMULHv4i16 V64:$Rn, V64:$Rm)>; +def : Pat<(v2i32 (AArch64sqdmulh (v2i32 V64:$Rn), (v2i32 V64:$Rm))), + (SQDMULHv2i32 V64:$Rn, V64:$Rm)>; +def : Pat<(v8i16 (AArch64sqdmulh (v8i16 V128:$Rn), (v8i16 V128:$Rm))), + (SQDMULHv8i16 V128:$Rn, V128:$Rm)>; +def : Pat<(v4i32 (AArch64sqdmulh (v4i32 V128:$Rn), (v4i32 V128:$Rm))), + (SQDMULHv4i32 V128:$Rn, V128:$Rm)>; + // Conversions within AdvSIMD types in the same register size are free. // But because we need a consistent lane ordering, in big endian many // conversions require one or more REV instructions. 
diff --git a/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll new file mode 100644 index 0000000000000..b647daf72ca35 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/saturating-vec-smull.ll @@ -0,0 +1,223 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-none-elf < %s | FileCheck %s + + +define <2 x i16> @saturating_2xi16(<2 x i16> %a, <2 x i16> %b) { +; CHECK-LABEL: saturating_2xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret + %as = sext <2 x i16> %a to <2 x i32> + %bs = sext <2 x i16> %b to <2 x i32> + %m = mul <2 x i32> %bs, %as + %sh = ashr <2 x i32> %m, splat (i32 15) + %ma = tail call <2 x i32> @llvm.smin.v4i32(<2 x i32> %sh, <2 x i32> splat (i32 32767)) + %t = trunc <2 x i32> %ma to <2 x i16> + ret <2 x i16> %t +} + +define <4 x i16> @saturating_4xi16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: saturating_4xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v0.4h, v1.4h, v0.4h +; CHECK-NEXT: ret + %as = sext <4 x i16> %a to <4 x i32> + %bs = sext <4 x i16> %b to <4 x i32> + %m = mul <4 x i32> %bs, %as + %sh = ashr <4 x i32> %m, splat (i32 15) + %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 32767)) + %t = trunc <4 x i32> %ma to <4 x i16> + ret <4 x i16> %t +} + +define <8 x i16> @saturating_8xi16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: saturating_8xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v0.8h, v1.8h, v0.8h +; CHECK-NEXT: ret + %as = sext <8 x i16> %a to <8 x i32> + %bs = sext <8 x i16> %b to <8 x i32> + %m = mul <8 x i32> %bs, %as + %sh = ashr <8 x i32> %m, splat (i32 15) + %ma = tail call <8 x i32> @llvm.smin.v8i32(<8 x i32> %sh, <8 x i32> splat (i32 32767)) + %t = trunc <8 x i32> %ma to <8 x 
i16> + ret <8 x i16> %t +} + +define <2 x i32> @saturating_2xi32(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: saturating_2xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret + %as = sext <2 x i32> %a to <2 x i64> + %bs = sext <2 x i32> %b to <2 x i64> + %m = mul <2 x i64> %bs, %as + %sh = ashr <2 x i64> %m, splat (i64 31) + %ma = tail call <2 x i64> @llvm.smin.v8i64(<2 x i64> %sh, <2 x i64> splat (i64 2147483647)) + %t = trunc <2 x i64> %ma to <2 x i32> + ret <2 x i32> %t +} + +define <4 x i32> @saturating_4xi32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: saturating_4xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %as = sext <4 x i32> %a to <4 x i64> + %bs = sext <4 x i32> %b to <4 x i64> + %m = mul <4 x i64> %bs, %as + %sh = ashr <4 x i64> %m, splat (i64 31) + %ma = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %sh, <4 x i64> splat (i64 2147483647)) + %t = trunc <4 x i64> %ma to <4 x i32> + ret <4 x i32> %t +} + +define <8 x i32> @saturating_8xi32(<8 x i32> %a, <8 x i32> %b) { +; CHECK-LABEL: saturating_8xi32: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sqdmulh v0.4s, v2.4s, v0.4s +; CHECK-NEXT: ret + %as = sext <8 x i32> %a to <8 x i64> + %bs = sext <8 x i32> %b to <8 x i64> + %m = mul <8 x i64> %bs, %as + %sh = ashr <8 x i64> %m, splat (i64 31) + %ma = tail call <8 x i64> @llvm.smin.v8i64(<8 x i64> %sh, <8 x i64> splat (i64 2147483647)) + %t = trunc <8 x i64> %ma to <8 x i32> + ret <8 x i32> %t +} + +define <2 x i64> @saturating_2xi32_2xi64(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: saturating_2xi32_2xi64: +; CHECK: // %bb.0: +; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-NEXT: ret + %as = sext <2 x i32> %a to <2 x i64> + %bs = sext <2 x i32> %b to <2 x i64> + %m = mul <2 x i64> %bs, %as + %sh = ashr <2 x i64> %m, splat (i64 31) + %ma = tail call <2 x i64> @llvm.smin.v8i64(<2 x i64> %sh, <2 x i64> splat 
(i64 2147483647)) + ret <2 x i64> %ma +} + +define <6 x i16> @saturating_6xi16(<6 x i16> %a, <6 x i16> %b) { +; CHECK-LABEL: saturating_6xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: smull2 v3.4s, v1.8h, v0.8h +; CHECK-NEXT: movi v2.4s, #127, msl #8 +; CHECK-NEXT: sqdmulh v0.4h, v1.4h, v0.4h +; CHECK-NEXT: sshr v3.4s, v3.4s, #15 +; CHECK-NEXT: smin v2.4s, v3.4s, v2.4s +; CHECK-NEXT: xtn2 v0.8h, v2.4s +; CHECK-NEXT: ret + %as = sext <6 x i16> %a to <6 x i32> + %bs = sext <6 x i16> %b to <6 x i32> + %m = mul <6 x i32> %bs, %as + %sh = ashr <6 x i32> %m, splat (i32 15) + %ma = tail call <6 x i32> @llvm.smin.v6i32(<6 x i32> %sh, <6 x i32> splat (i32 32767)) + %t = trunc <6 x i32> %ma to <6 x i16> + ret <6 x i16> %t +} + +define <4 x i16> @unsupported_saturation_value_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: unsupported_saturation_value_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: movi v1.4s, #42 +; CHECK-NEXT: sshr v0.4s, v0.4s, #15 +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %as = sext <4 x i16> %a to <4 x i32> + %bs = sext <4 x i16> %b to <4 x i32> + %m = mul <4 x i32> %bs, %as + %sh = ashr <4 x i32> %m, splat (i32 15) + %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 42)) + %t = trunc <4 x i32> %ma to <4 x i16> + ret <4 x i16> %t +} + +define <4 x i16> @unsupported_shift_value_v4i16(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: unsupported_shift_value_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h +; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: sshr v0.4s, v0.4s, #3 +; CHECK-NEXT: smin v0.4s, v0.4s, v1.4s +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret + %as = sext <4 x i16> %a to <4 x i32> + %bs = sext <4 x i16> %b to <4 x i32> + %m = mul <4 x i32> %bs, %as + %sh = ashr <4 x i32> %m, splat (i32 3) + %ma = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %sh, <4 x i32> splat (i32 32767)) + %t = trunc <4 x i32> %ma 
to <4 x i16> + ret <4 x i16> %t +} + +define <2 x i16> @extend_to_illegal_type(<2 x i16> %a, <2 x i16> %b) { +; CHECK-LABEL: extend_to_illegal_type: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: sqdmulh v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret + %as = sext <2 x i16> %a to <2 x i48> + %bs = sext <2 x i16> %b to <2 x i48> + %m = mul <2 x i48> %bs, %as + %sh = ashr <2 x i48> %m, splat (i48 15) + %ma = tail call <2 x i48> @llvm.smin.v4i32(<2 x i48> %sh, <2 x i48> splat (i48 32767)) + %t = trunc <2 x i48> %ma to <2 x i16> + ret <2 x i16> %t +} + +define <2 x i11> @illegal_source(<2 x i11> %a, <2 x i11> %b) { +; CHECK-LABEL: illegal_source: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.2s, v0.2s, #21 +; CHECK-NEXT: shl v1.2s, v1.2s, #21 +; CHECK-NEXT: sshr v0.2s, v0.2s, #21 +; CHECK-NEXT: sshr v1.2s, v1.2s, #21 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: movi v1.2s, #127, msl #8 +; CHECK-NEXT: sshr v0.2s, v0.2s, #15 +; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %as = sext <2 x i11> %a to <2 x i32> + %bs = sext <2 x i11> %b to <2 x i32> + %m = mul <2 x i32> %bs, %as + %sh = ashr <2 x i32> %m, splat (i32 15) + %ma = tail call <2 x i32> @llvm.smin.v2i32(<2 x i32> %sh, <2 x i32> splat (i32 32767)) + %t = trunc <2 x i32> %ma to <2 x i11> + ret <2 x i11> %t +} +define <1 x i16> @saturating_1xi16(<1 x i16> %a, <1 x i16> %b) { +; CHECK-LABEL: saturating_1xi16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: zip1 v1.4h, v1.4h, v0.4h +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: shl v1.2s, v1.2s, #16 +; CHECK-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: movi v1.2s, #127, msl #8 +; CHECK-NEXT: sshr v0.2s, v0.2s, #15 +; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s +; CHECK-NEXT: uzp1 v0.4h, v0.4h, v0.4h +; CHECK-NEXT: ret + %as 
= sext <1 x i16> %a to <1 x i32> + %bs = sext <1 x i16> %b to <1 x i32> + %m = mul <1 x i32> %bs, %as + %sh = ashr <1 x i32> %m, splat (i32 15) + %ma = tail call <1 x i32> @llvm.smin.v1i32(<1 x i32> %sh, <1 x i32> splat (i32 32767)) + %t = trunc <1 x i32> %ma to <1 x i16> + ret <1 x i16> %t +} From 828a867ee010cdd832c43f7d844959adb4884b2e Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 16 Jul 2025 09:59:36 +0100 Subject: [PATCH 005/813] [AArch64] Reduce the costs of and/or/xor reductions (#148553) Since the costs were added the codegen for i8/i16 and/or/xor reductions has improved. This updates the cost model to produce the same costs in terms of number of instructions. --- .../AArch64/AArch64TargetTransformInfo.cpp | 56 +++++++++---------- .../Analysis/CostModel/AArch64/reduce-and.ll | 16 +++--- .../Analysis/CostModel/AArch64/reduce-or.ll | 16 +++--- .../Analysis/CostModel/AArch64/reduce-xor.ll | 16 +++--- .../PhaseOrdering/AArch64/quant_4x4.ll | 10 ++-- 5 files changed, 56 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index f339396f3a411..90d3d92d6bbf5 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5211,34 +5211,34 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll // AND: llvm/test/CodeGen/AArch64/reduce-and.ll static const CostTblEntry CostTblNoPairwise[]{ - {ISD::ADD, MVT::v8i8, 2}, - {ISD::ADD, MVT::v16i8, 2}, - {ISD::ADD, MVT::v4i16, 2}, - {ISD::ADD, MVT::v8i16, 2}, - {ISD::ADD, MVT::v2i32, 2}, - {ISD::ADD, MVT::v4i32, 2}, - {ISD::ADD, MVT::v2i64, 2}, - {ISD::OR, MVT::v8i8, 15}, - {ISD::OR, MVT::v16i8, 17}, - {ISD::OR, MVT::v4i16, 7}, - {ISD::OR, MVT::v8i16, 9}, - {ISD::OR, MVT::v2i32, 3}, - {ISD::OR, MVT::v4i32, 5}, - {ISD::OR, MVT::v2i64, 3}, - {ISD::XOR, MVT::v8i8, 15}, - 
{ISD::XOR, MVT::v16i8, 17}, - {ISD::XOR, MVT::v4i16, 7}, - {ISD::XOR, MVT::v8i16, 9}, - {ISD::XOR, MVT::v2i32, 3}, - {ISD::XOR, MVT::v4i32, 5}, - {ISD::XOR, MVT::v2i64, 3}, - {ISD::AND, MVT::v8i8, 15}, - {ISD::AND, MVT::v16i8, 17}, - {ISD::AND, MVT::v4i16, 7}, - {ISD::AND, MVT::v8i16, 9}, - {ISD::AND, MVT::v2i32, 3}, - {ISD::AND, MVT::v4i32, 5}, - {ISD::AND, MVT::v2i64, 3}, + {ISD::ADD, MVT::v8i8, 2}, + {ISD::ADD, MVT::v16i8, 2}, + {ISD::ADD, MVT::v4i16, 2}, + {ISD::ADD, MVT::v8i16, 2}, + {ISD::ADD, MVT::v2i32, 2}, + {ISD::ADD, MVT::v4i32, 2}, + {ISD::ADD, MVT::v2i64, 2}, + {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr + {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8 + {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr + {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16 + {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr + {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32 + {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov + {ISD::XOR, MVT::v8i8, 5}, // Same as above for or... + {ISD::XOR, MVT::v16i8, 7}, + {ISD::XOR, MVT::v4i16, 4}, + {ISD::XOR, MVT::v8i16, 6}, + {ISD::XOR, MVT::v2i32, 3}, + {ISD::XOR, MVT::v4i32, 5}, + {ISD::XOR, MVT::v2i64, 3}, + {ISD::AND, MVT::v8i8, 5}, // Same as above for or... 
+ {ISD::AND, MVT::v16i8, 7}, + {ISD::AND, MVT::v4i16, 4}, + {ISD::AND, MVT::v8i16, 6}, + {ISD::AND, MVT::v2i32, 3}, + {ISD::AND, MVT::v4i32, 5}, + {ISD::AND, MVT::v2i64, 3}, }; switch (ISD) { default: diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll index 21e0356fd7321..b221fc8a35ab3 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll @@ -15,14 +15,14 @@ define void @reduce() { ; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 18 for: %V32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 20 for: %V64i8 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 9 for: %V8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 10 for: %V16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 5 for: %V8i8 = call i8 
@llvm.vector.reduce.and.v8i8(<8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 8 for: %V32i8 = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 10 for: %V64i8 = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i16 = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i16 = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V2i32 = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V4i32 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i32 = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll index 27dd42297bfab..4bb59e3a09b7a 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll @@ -15,14 +15,14 @@ define void @reduce() { ; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) -; 
CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 18 for: %V32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 20 for: %V64i8 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 9 for: %V8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 10 for: %V16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 5 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 8 for: %V32i8 = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 10 for: %V64i8 = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i16 = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i16 = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V2i32 = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V4i32 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i32 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll 
b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll index 826605450a2d8..8e81aadbb9934 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll @@ -15,14 +15,14 @@ define void @reduce() { ; CHECK-NEXT: Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef) ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 18 for: %V32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 20 for: %V64i8 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) -; CHECK-NEXT: Cost Model: Found costs of 7 for: %V4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 9 for: %V8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) -; CHECK-NEXT: Cost Model: Found costs of 10 for: %V16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 5 for: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 8 for: %V32i8 = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> 
undef) +; CHECK-NEXT: Cost Model: Found costs of 10 for: %V64i8 = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> undef) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i16 = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> undef) +; CHECK-NEXT: Cost Model: Found costs of 7 for: %V16i16 = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> undef) ; CHECK-NEXT: Cost Model: Found costs of 3 for: %V2i32 = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 5 for: %V4i32 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> undef) ; CHECK-NEXT: Cost Model: Found costs of 6 for: %V8i32 = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> undef) diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll index 09a59de44c745..d55559d632019 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll @@ -62,12 +62,11 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: store <8 x i16> [[PREDPHI]], ptr [[DCT]], align 2, !alias.scope [[META0]], !noalias [[META3]] ; CHECK-NEXT: store <8 x i16> [[PREDPHI34]], ptr [[TMP0]], align 2, !alias.scope [[META0]], !noalias [[META3]] ; CHECK-NEXT: [[BIN_RDX35:%.*]] = or <8 x i16> [[PREDPHI34]], [[PREDPHI]] -; CHECK-NEXT: [[BIN_RDX:%.*]] = sext <8 x i16> [[BIN_RDX35]] to <8 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = tail call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> [[BIN_RDX]]) +; CHECK-NEXT: [[TMP29:%.*]] = tail call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> [[BIN_RDX35]]) ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[TMP29]], [[VECTOR_BODY]] ], [ [[OR_15:%.*]], [[IF_END_15:%.*]] ] -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 
[[OR_LCSSA]], 0 +; CHECK-NEXT: [[OR_LCSSA_IN:%.*]] = phi i16 [ [[TMP29]], [[VECTOR_BODY]] ], [ [[OR_1551:%.*]], [[IF_END_15:%.*]] ] +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i16 [[OR_LCSSA_IN]], 0 ; CHECK-NEXT: [[LNOT_EXT:%.*]] = zext i1 [[TOBOOL]] to i32 ; CHECK-NEXT: ret i32 [[LNOT_EXT]] ; CHECK: for.body: @@ -514,8 +513,7 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK: if.end.15: ; CHECK-NEXT: [[STOREMERGE_15:%.*]] = phi i16 [ [[CONV28_15]], [[IF_ELSE_15]] ], [ [[CONV12_15]], [[IF_THEN_15]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_15]], ptr [[ARRAYIDX_15]], align 2 -; CHECK-NEXT: [[OR_1551:%.*]] = or i16 [[OR_1450]], [[STOREMERGE_15]] -; CHECK-NEXT: [[OR_15]] = sext i16 [[OR_1551]] to i32 +; CHECK-NEXT: [[OR_1551]] = or i16 [[OR_1450]], [[STOREMERGE_15]] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; entry: From 905bb5bddb690765cab5416d55ab017d7c832eb3 Mon Sep 17 00:00:00 2001 From: Serge Pavlov Date: Wed, 16 Jul 2025 16:02:15 +0700 Subject: [PATCH 006/813] [RISCV][FPEnv] Lowering of fpmode intrinsics (#148569) The change implements custom lowering of `get_fpmode`, `set_fpmode` and `reset_fpmode` for RISCV target. The implementation is aligned with the functions `fegetmode` and `fesetmode` in GLIBC. 
--- .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 11 ++++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 57 +++++++++++++++++++ llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 + llvm/lib/Target/RISCV/RISCVInstrInfo.td | 54 ++++++++++++++++++ llvm/test/CodeGen/RISCV/fpenv-xlen.ll | 34 +++++++++++ 5 files changed, 159 insertions(+) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 4c8dcf376755b..7ad5d5f3118b6 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -494,6 +494,17 @@ inline static bool isValidRoundingMode(unsigned Mode) { } } // namespace RISCVVXRndMode +namespace RISCVExceptFlags { +enum ExceptionFlag { + NX = 0x01, // Inexact + UF = 0x02, // Underflow + OF = 0x04, // Overflow + DZ = 0x08, // Divide by zero + NV = 0x10, // Invalid operation + ALL = 0x1F // Mask for all accrued exception flags +}; +} + //===----------------------------------------------------------------------===// // Floating-point Immediates // diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 2754f6c3f8252..de830666d89b8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -655,6 +655,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::GET_FPENV, XLenVT, Custom); setOperationAction(ISD::SET_FPENV, XLenVT, Custom); setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom); + setOperationAction(ISD::GET_FPMODE, XLenVT, Custom); + setOperationAction(ISD::SET_FPMODE, XLenVT, Custom); + setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom); } setOperationAction({ISD::GlobalAddress, ISD::BlockAddress, ISD::ConstantPool, @@ -8225,6 +8228,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerSET_FPENV(Op, DAG); case ISD::RESET_FPENV: return lowerRESET_FPENV(Op, DAG); + case 
ISD::GET_FPMODE: + return lowerGET_FPMODE(Op, DAG); + case ISD::SET_FPMODE: + return lowerSET_FPMODE(Op, DAG); + case ISD::RESET_FPMODE: + return lowerRESET_FPMODE(Op, DAG); case ISD::EH_DWARF_CFA: return lowerEH_DWARF_CFA(Op, DAG); case ISD::VP_MERGE: @@ -14002,6 +14011,54 @@ SDValue RISCVTargetLowering::lowerRESET_FPENV(SDValue Op, EnvValue); } +const uint64_t ModeMask64 = ~RISCVExceptFlags::ALL; +const uint32_t ModeMask32 = ~RISCVExceptFlags::ALL; + +SDValue RISCVTargetLowering::lowerGET_FPMODE(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + const uint64_t ModeMaskValue = Subtarget.is64Bit() ? ModeMask64 : ModeMask32; + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + SDValue ModeMask = DAG.getConstant(ModeMaskValue, DL, XLenVT); + SDVTList VTs = DAG.getVTList(XLenVT, MVT::Other); + SDValue Result = DAG.getNode(RISCVISD::READ_CSR, DL, VTs, Chain, SysRegNo); + Chain = Result.getValue(1); + return DAG.getMergeValues({Result, Chain}, DL); +} + +SDValue RISCVTargetLowering::lowerSET_FPMODE(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + const uint64_t ModeMaskValue = Subtarget.is64Bit() ? 
ModeMask64 : ModeMask32; + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue EnvValue = Op->getOperand(1); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + SDValue ModeMask = DAG.getConstant(ModeMaskValue, DL, XLenVT); + + EnvValue = DAG.getNode(ISD::ZERO_EXTEND, DL, XLenVT, EnvValue); + EnvValue = DAG.getNode(ISD::AND, DL, XLenVT, EnvValue, ModeMask); + Chain = DAG.getNode(RISCVISD::CLEAR_CSR, DL, MVT::Other, Chain, SysRegNo, + ModeMask); + return DAG.getNode(RISCVISD::SET_CSR, DL, MVT::Other, Chain, SysRegNo, + EnvValue); +} + +SDValue RISCVTargetLowering::lowerRESET_FPMODE(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + const uint64_t ModeMaskValue = Subtarget.is64Bit() ? ModeMask64 : ModeMask32; + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + SDValue ModeMask = DAG.getConstant(ModeMaskValue, DL, XLenVT); + + return DAG.getNode(RISCVISD::CLEAR_CSR, DL, MVT::Other, Chain, SysRegNo, + ModeMask); +} + SDValue RISCVTargetLowering::lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 41bbf6b9dcf2e..3af729aaba2ae 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -563,6 +563,9 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerGET_FPMODE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSET_FPMODE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerRESET_FPMODE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue 
lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index f63531a0109b0..653607827282e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -120,6 +120,20 @@ def riscv_swap_csr : RVSDNode<"SWAP_CSR", SDTCisInt<2>]>, [SDNPHasChain]>; +// Clear bits of CSR. The first operand is the address of the required CSR, +// the second is the bitmask of cleared bits. +def riscv_clear_csr : RVSDNode<"CLEAR_CSR", + SDTypeProfile<0, 2, [SDTCisInt<0>, + SDTCisInt<1>]>, + [SDNPHasChain]>; + +// Set bits of CSR. The first operand is the address of the required CSR, +// the second is the bitmask of bits to set. +def riscv_set_csr : RVSDNode<"SET_CSR", + SDTypeProfile<0, 2, [SDTCisInt<0>, + SDTCisInt<1>]>, + [SDNPHasChain]>; + // A read of the 64-bit counter CSR on a 32-bit target (returns (Lo, Hi)). // It takes a chain operand and another two target constant operands (the // CSR numbers of the low and high parts of the counter). 
@@ -2038,6 +2052,42 @@ class SwapSysRegImm Regs> let Defs = Regs; } +class ClearSysReg Regs> + : Pseudo<(outs), (ins GPR:$val), + [(riscv_clear_csr (XLenVT SR.Encoding), (XLenVT GPR:$val))]>, + PseudoInstExpansion<(CSRRC X0, SR.Encoding, GPR:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + +class ClearSysRegImm Regs> + : Pseudo<(outs), (ins uimm5:$val), + [(riscv_clear_csr (XLenVT SR.Encoding), uimm5:$val)]>, + PseudoInstExpansion<(CSRRCI X0, SR.Encoding, uimm5:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + +class SetSysReg Regs> + : Pseudo<(outs), (ins GPR:$val), + [(riscv_set_csr (XLenVT SR.Encoding), (XLenVT GPR:$val))]>, + PseudoInstExpansion<(CSRRS X0, SR.Encoding, GPR:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + +class SetSysRegImm Regs> + : Pseudo<(outs), (ins uimm5:$val), + [(riscv_set_csr (XLenVT SR.Encoding), uimm5:$val)]>, + PseudoInstExpansion<(CSRRSI X0, SR.Encoding, uimm5:$val)> { + let hasSideEffects = 0; + let Uses = Regs; + let Defs = Regs; +} + def ReadFRM : ReadSysReg; let hasPostISelHook = 1 in { def WriteFRM : WriteSysReg; @@ -2056,6 +2106,10 @@ let hasPostISelHook = 1 in { def ReadFCSR : ReadSysReg; def WriteFCSR : WriteSysReg; def WriteFCSRImm : WriteSysRegImm; +def ClearFCSR : ClearSysReg; +def ClearFCSRImm : ClearSysRegImm; +def SetFCSR : SetSysReg; +def SetFCSRImm : SetSysRegImm; } /// Other pseudo-instructions diff --git a/llvm/test/CodeGen/RISCV/fpenv-xlen.ll b/llvm/test/CodeGen/RISCV/fpenv-xlen.ll index 148186b21c125..255c120434f34 100644 --- a/llvm/test/CodeGen/RISCV/fpenv-xlen.ll +++ b/llvm/test/CodeGen/RISCV/fpenv-xlen.ll @@ -35,3 +35,37 @@ entry: call void @llvm.reset.fpenv() ret void } + +define iXLen @func_get_fpmode() { +; CHECK-LABEL: func_get_fpmode: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: frcsr a0 +; CHECK-NEXT: ret +entry: + %fpenv = call iXLen @llvm.get.fpmode.iXLen() + ret iXLen %fpenv +} + +define void @func_set_fpmode(iXLen 
%fpmode) { +; CHECK-LABEL: func_set_fpmode: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a1, -32 +; CHECK-NEXT: csrc fcsr, a1 +; CHECK-NEXT: andi a0, a0, -32 +; CHECK-NEXT: csrs fcsr, a0 +; CHECK-NEXT: ret +entry: + call void @llvm.set.fpmode.iXLen(iXLen %fpmode) + ret void +} + +define void @func_reset_fpmode() { +; CHECK-LABEL: func_reset_fpmode: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a0, -32 +; CHECK-NEXT: csrc fcsr, a0 +; CHECK-NEXT: ret +entry: + call void @llvm.reset.fpmode() + ret void +} From 5ae49393957c6b4d47ace7aaf5fc1e0388d3a695 Mon Sep 17 00:00:00 2001 From: nerix Date: Wed, 16 Jul 2025 11:02:50 +0200 Subject: [PATCH 007/813] [LLDB] Add formatters for MSVC STL std::vector (#147538) This adds synthetic child providers for `std::vector<T>` and `std::vector<bool>` for MSVC's STL. The structure of a `std::vector<T>` is relatively similar to libc++'s implementation that uses `__begin` and `__end`. `std::vector<bool>` is different. It's a `std::vector<unsigned int>` wrapper instead of `std::vector<bool>`. This makes the calculation slightly less simple. I put a comment in the `GetChildAtIndex` to make this clear. - [NatVis for `std::vector<bool>`](https://github.com/microsoft/STL/blob/313964b78a8fd5a52e7965e13781f735bcce13c5/stl/debugger/STL.natvis#L1193-L1205) - [NatVis for `std::vector<T>`](https://github.com/microsoft/STL/blob/313964b78a8fd5a52e7965e13781f735bcce13c5/stl/debugger/STL.natvis#L1167-L1179) Towards #24834. 
--- .../Plugins/Language/CPlusPlus/CMakeLists.txt | 1 + .../Language/CPlusPlus/CPlusPlusLanguage.cpp | 30 +- .../Plugins/Language/CPlusPlus/MsvcStl.h | 5 + .../Language/CPlusPlus/MsvcStlVector.cpp | 305 ++++++++++++++++++ .../vbool/TestDataFormatterStdVBool.py | 24 +- .../data-formatter-stl/generic/vbool/main.cpp | 39 ++- .../vector/TestDataFormatterStdVector.py | 11 + 7 files changed, 407 insertions(+), 8 deletions(-) create mode 100644 lldb/source/Plugins/Language/CPlusPlus/MsvcStlVector.cpp diff --git a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt index ea86b6b4327be..8ee6e2a246c55 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt @@ -36,6 +36,7 @@ add_lldb_library(lldbPluginCPlusPlusLanguage PLUGIN MsvcStl.cpp MsvcStlSmartPointer.cpp MsvcStlTuple.cpp + MsvcStlVector.cpp MSVCUndecoratedNameParser.cpp LINK_COMPONENTS diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index bf4139119a76b..8724e829835c1 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -1404,7 +1404,7 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { stl_deref_flags.SetFrontEndWantsDereference(); cpp_category_sp->AddTypeSynthetic( - "^std::(__debug::)?vector<.+>(( )?&)?$", eFormatterMatchRegex, + "^std::__debug::vector<.+>(( )?&)?$", eFormatterMatchRegex, SyntheticChildrenSP(new ScriptedSyntheticChildren( stl_synth_flags, "lldb.formatters.cpp.gnu_libstdcpp.StdVectorSynthProvider"))); @@ -1465,10 +1465,10 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { "libstdc++ std::bitset summary provider", "^std::(__debug::)?bitset<.+>(( )?&)?$", stl_summary_flags, true); - AddCXXSummary( - cpp_category_sp, 
lldb_private::formatters::ContainerSizeSummaryProvider, - "libstdc++ std::vector summary provider", - "^std::(__debug::)?vector<.+>(( )?&)?$", stl_summary_flags, true); + AddCXXSummary(cpp_category_sp, + lldb_private::formatters::ContainerSizeSummaryProvider, + "libstdc++ std::__debug::vector summary provider", + "^std::__debug::vector<.+>(( )?&)?$", stl_summary_flags, true); AddCXXSummary( cpp_category_sp, lldb_private::formatters::ContainerSizeSummaryProvider, @@ -1615,6 +1615,20 @@ GenericTupleSyntheticFrontEndCreator(CXXSyntheticChildren *children, return LibStdcppTupleSyntheticFrontEndCreator(children, valobj_sp); } +static SyntheticChildrenFrontEnd * +GenericVectorSyntheticFrontEndCreator(CXXSyntheticChildren *children, + lldb::ValueObjectSP valobj_sp) { + if (!valobj_sp) + return nullptr; + + // checks for vector and vector + if (auto *msvc = MsvcStlVectorSyntheticFrontEndCreator(valobj_sp)) + return msvc; + + return new ScriptedSyntheticChildren::FrontEnd( + "lldb.formatters.cpp.gnu_libstdcpp.StdVectorSynthProvider", *valobj_sp); +} + /// Load formatters that are formatting types from more than one STL static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { if (!cpp_category_sp) @@ -1686,6 +1700,12 @@ static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { AddCXXSummary(cpp_category_sp, ContainerSizeSummaryProvider, "MSVC STL/libstdc++ std::tuple summary provider", "^std::tuple<.*>(( )?&)?$", stl_summary_flags, true); + AddCXXSummary(cpp_category_sp, ContainerSizeSummaryProvider, + "MSVC/libstdc++ std::vector summary provider", + "^std::vector<.+>(( )?&)?$", stl_summary_flags, true); + AddCXXSynthetic(cpp_category_sp, GenericVectorSyntheticFrontEndCreator, + "MSVC/libstdc++ std::vector synthetic provider", + "^std::vector<.+>(( )?&)?$", stl_synth_flags, true); } static void LoadMsvcStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h 
b/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h index bad47701904bb..81397851b6010 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h +++ b/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h @@ -51,6 +51,11 @@ SyntheticChildrenFrontEnd * MsvcStlTupleSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp); +// MSVC STL std::vector<> +bool IsMsvcStlVector(ValueObject &valobj); +lldb_private::SyntheticChildrenFrontEnd * +MsvcStlVectorSyntheticFrontEndCreator(lldb::ValueObjectSP valobj_sp); + } // namespace formatters } // namespace lldb_private diff --git a/lldb/source/Plugins/Language/CPlusPlus/MsvcStlVector.cpp b/lldb/source/Plugins/Language/CPlusPlus/MsvcStlVector.cpp new file mode 100644 index 0000000000000..cfc98d27f56d6 --- /dev/null +++ b/lldb/source/Plugins/Language/CPlusPlus/MsvcStlVector.cpp @@ -0,0 +1,305 @@ +//===-- MsvcStlVector.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MsvcStl.h" + +#include "lldb/DataFormatters/FormattersHelpers.h" +#include "lldb/DataFormatters/TypeSynthetic.h" + +using namespace lldb; + +namespace lldb_private { +namespace formatters { + +class MsvcStlVectorSyntheticFrontEnd : public SyntheticChildrenFrontEnd { +public: + MsvcStlVectorSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp); + + llvm::Expected CalculateNumChildren() override; + + lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override; + + lldb::ChildCacheState Update() override; + + llvm::Expected GetIndexOfChildWithName(ConstString name) override; + +private: + ValueObject *m_start = nullptr; + ValueObject *m_finish = nullptr; + CompilerType m_element_type; + uint32_t m_element_size = 0; +}; + +class MsvcStlVectorBoolSyntheticFrontEnd : public SyntheticChildrenFrontEnd { +public: + MsvcStlVectorBoolSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp); + + llvm::Expected CalculateNumChildren() override; + + lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override; + + lldb::ChildCacheState Update() override; + + llvm::Expected GetIndexOfChildWithName(ConstString name) override; + +private: + CompilerType m_bool_type; + ExecutionContextRef m_exe_ctx_ref; + uint64_t m_count = 0; + uint64_t m_element_bit_size = 0; + lldb::addr_t m_base_data_address = 0; + std::map m_children; +}; + +} // namespace formatters +} // namespace lldb_private + +lldb_private::formatters::MsvcStlVectorSyntheticFrontEnd:: + MsvcStlVectorSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp) + : SyntheticChildrenFrontEnd(*valobj_sp), m_element_type() { + if (valobj_sp) + Update(); +} + +llvm::Expected lldb_private::formatters:: + MsvcStlVectorSyntheticFrontEnd::CalculateNumChildren() { + if (!m_start || !m_finish) + return llvm::createStringError( + "Failed to determine start/end of vector data."); + + uint64_t start_val = 
m_start->GetValueAsUnsigned(0); + uint64_t finish_val = m_finish->GetValueAsUnsigned(0); + + // A default-initialized empty vector. + if (start_val == 0 && finish_val == 0) + return 0; + + if (start_val == 0) + return llvm::createStringError("Invalid value for start of vector."); + + if (finish_val == 0) + return llvm::createStringError("Invalid value for end of vector."); + + if (start_val > finish_val) + return llvm::createStringError( + "Start of vector data begins after end pointer."); + + size_t num_children = (finish_val - start_val); + if (num_children % m_element_size) + return llvm::createStringError("Size not multiple of element size."); + + return num_children / m_element_size; +} + +lldb::ValueObjectSP +lldb_private::formatters::MsvcStlVectorSyntheticFrontEnd::GetChildAtIndex( + uint32_t idx) { + if (!m_start || !m_finish) + return lldb::ValueObjectSP(); + + uint64_t offset = idx * m_element_size; + offset = offset + m_start->GetValueAsUnsigned(0); + StreamString name; + name.Printf("[%" PRIu64 "]", (uint64_t)idx); + return CreateValueObjectFromAddress(name.GetString(), offset, + m_backend.GetExecutionContextRef(), + m_element_type); +} + +lldb::ChildCacheState +lldb_private::formatters::MsvcStlVectorSyntheticFrontEnd::Update() { + m_start = m_finish = nullptr; + ValueObjectSP data_sp(m_backend.GetChildAtNamePath({"_Mypair", "_Myval2"})); + + if (!data_sp) + return lldb::ChildCacheState::eRefetch; + + m_start = data_sp->GetChildMemberWithName("_Myfirst").get(); + m_finish = data_sp->GetChildMemberWithName("_Mylast").get(); + if (!m_start || !m_finish) + return lldb::ChildCacheState::eRefetch; + + m_element_type = m_start->GetCompilerType().GetPointeeType(); + llvm::Expected size_or_err = m_element_type.GetByteSize(nullptr); + if (size_or_err) + m_element_size = *size_or_err; + else + LLDB_LOG_ERRORV(GetLog(LLDBLog::DataFormatters), size_or_err.takeError(), + "{0}"); + + return lldb::ChildCacheState::eRefetch; +} + +llvm::Expected 
lldb_private::formatters:: + MsvcStlVectorSyntheticFrontEnd::GetIndexOfChildWithName(ConstString name) { + if (!m_start || !m_finish) + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + auto optional_idx = ExtractIndexFromString(name.GetCString()); + if (!optional_idx) { + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + } + return *optional_idx; +} + +lldb_private::formatters::MsvcStlVectorBoolSyntheticFrontEnd:: + MsvcStlVectorBoolSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp) + : SyntheticChildrenFrontEnd(*valobj_sp), m_bool_type(), m_exe_ctx_ref(), + m_children() { + if (valobj_sp) { + Update(); + m_bool_type = + valobj_sp->GetCompilerType().GetBasicTypeFromAST(lldb::eBasicTypeBool); + } +} + +llvm::Expected lldb_private::formatters:: + MsvcStlVectorBoolSyntheticFrontEnd::CalculateNumChildren() { + return m_count; +} + +lldb::ValueObjectSP +lldb_private::formatters::MsvcStlVectorBoolSyntheticFrontEnd::GetChildAtIndex( + uint32_t idx) { + auto iter = m_children.find(idx), end = m_children.end(); + if (iter != end) + return iter->second; + if (idx >= m_count) + return {}; + if (m_base_data_address == 0 || m_count == 0) + return {}; + if (!m_bool_type) + return {}; + + // The vector is represented as a sequence of `int`s. + // The size of an `int` is in `m_element_bit_size` (most often 32b). 
+ // To access the element at index `i`: + // (bool)((data_address[i / bit_size] >> (i % bit_size)) & 1) + + // int *byte_location = &data_address[i / bit_size] + size_t byte_idx = (idx / m_element_bit_size) * (m_element_bit_size / 8); + lldb::addr_t byte_location = m_base_data_address + byte_idx; + + ProcessSP process_sp(m_exe_ctx_ref.GetProcessSP()); + if (!process_sp) + return {}; + Status err; + Scalar scalar; + size_t bytes_read = process_sp->ReadScalarIntegerFromMemory( + byte_location, m_element_bit_size / 8, false, scalar, err); + if (err.Fail() || bytes_read == 0 || !scalar.IsValid()) + return {}; + + size_t bit_index = idx % m_element_bit_size; + bool bit_set = scalar.GetAPSInt()[bit_index]; + std::optional size = + llvm::expectedToOptional(m_bool_type.GetByteSize(nullptr)); + if (!size) + return {}; + WritableDataBufferSP buffer_sp(new DataBufferHeap(*size, 0)); + if (bit_set && buffer_sp && buffer_sp->GetBytes()) { + // regardless of endianness, anything non-zero is true + *(buffer_sp->GetBytes()) = 1; + } + StreamString name; + name.Printf("[%" PRIu64 "]", (uint64_t)idx); + ValueObjectSP retval_sp(CreateValueObjectFromData( + name.GetString(), + DataExtractor(buffer_sp, process_sp->GetByteOrder(), + process_sp->GetAddressByteSize()), + m_exe_ctx_ref, m_bool_type)); + if (retval_sp) + m_children[idx] = retval_sp; + return retval_sp; +} + +lldb::ChildCacheState +lldb_private::formatters::MsvcStlVectorBoolSyntheticFrontEnd::Update() { + m_exe_ctx_ref.Clear(); + m_count = 0; + m_element_bit_size = 0; + m_base_data_address = 0; + m_children.clear(); + + ValueObjectSP valobj_sp = m_backend.GetSP(); + if (!valobj_sp) + return lldb::ChildCacheState::eRefetch; + auto exe_ctx_ref = valobj_sp->GetExecutionContextRef(); + + ValueObjectSP size_sp = valobj_sp->GetChildMemberWithName("_Mysize"); + if (!size_sp) + return lldb::ChildCacheState::eRefetch; + uint64_t count = size_sp->GetValueAsUnsigned(0); + if (count == 0) + return lldb::ChildCacheState::eReuse; + + 
ValueObjectSP begin_sp(valobj_sp->GetChildAtNamePath( + {"_Myvec", "_Mypair", "_Myval2", "_Myfirst"})); + if (!begin_sp) + return lldb::ChildCacheState::eRefetch; + + // FIXME: the STL exposes _EEN_VBITS as a constant - it should be used instead + CompilerType begin_ty = begin_sp->GetCompilerType().GetPointeeType(); + if (!begin_ty.IsValid()) + return lldb::ChildCacheState::eRefetch; + llvm::Expected element_bit_size = begin_ty.GetBitSize(nullptr); + if (!element_bit_size) + return lldb::ChildCacheState::eRefetch; + + uint64_t base_data_address = begin_sp->GetValueAsUnsigned(0); + if (!base_data_address) + return lldb::ChildCacheState::eRefetch; + + m_exe_ctx_ref = exe_ctx_ref; + m_count = count; + m_element_bit_size = *element_bit_size; + m_base_data_address = base_data_address; + return lldb::ChildCacheState::eRefetch; +} + +llvm::Expected +lldb_private::formatters::MsvcStlVectorBoolSyntheticFrontEnd:: + GetIndexOfChildWithName(ConstString name) { + if (!m_count || !m_base_data_address) + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + auto optional_idx = ExtractIndexFromString(name.AsCString()); + if (!optional_idx) { + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + } + uint32_t idx = *optional_idx; + if (idx >= CalculateNumChildrenIgnoringErrors()) + return llvm::createStringError("Type has no child named '%s'", + name.AsCString()); + return idx; +} + +lldb_private::SyntheticChildrenFrontEnd * +lldb_private::formatters::MsvcStlVectorSyntheticFrontEndCreator( + lldb::ValueObjectSP valobj_sp) { + if (!valobj_sp) + return nullptr; + + valobj_sp = valobj_sp->GetNonSyntheticValue(); + if (!valobj_sp) + return nullptr; + + // We can't check the template parameter here, because PDB doesn't include + // this information. 
+ + // vector + if (valobj_sp->GetChildMemberWithName("_Mypair") != nullptr) + return new MsvcStlVectorSyntheticFrontEnd(valobj_sp); + // vector + if (valobj_sp->GetChildMemberWithName("_Myvec") != nullptr) + return new MsvcStlVectorBoolSyntheticFrontEnd(valobj_sp); + + return nullptr; +} diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py index 56c86d1edde25..dd142d2be193b 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/TestDataFormatterStdVBool.py @@ -47,7 +47,7 @@ def cleanup(): self.expect( "frame variable -A vBool", substrs=[ - "size=49", + "size=73", "[0] = false", "[1] = true", "[18] = false", @@ -55,13 +55,20 @@ def cleanup(): "[36] = false", "[47] = true", "[48] = true", + "[49] = true", + "[50] = false", + "[56] = false", + "[65] = true", + "[70] = false", + "[71] = true", + "[72] = true", ], ) self.expect( "expr -A -- vBool", substrs=[ - "size=49", + "size=73", "[0] = false", "[1] = true", "[18] = false", @@ -69,6 +76,13 @@ def cleanup(): "[36] = false", "[47] = true", "[48] = true", + "[49] = true", + "[50] = false", + "[56] = false", + "[65] = true", + "[70] = false", + "[71] = true", + "[72] = true", ], ) @@ -88,3 +102,9 @@ def test_libstdcxx_debug(self): dictionary={"USE_LIBSTDCPP": 1, "CXXFLAGS_EXTRAS": "-D_GLIBCXX_DEBUG"} ) self.do_test() + + @add_test_categories(["msvcstl"]) + def test_libstdcxx(self): + # No flags, because the "msvcstl" category checks that the MSVC STL is used by default. 
+ self.build() + self.do_test() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/main.cpp index 22fc6c89ca8a2..2c54166ace7cc 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vbool/main.cpp @@ -1,10 +1,10 @@ #include -#include #include int main() { std::vector vBool; + // 0..=7 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -14,6 +14,7 @@ int main() { vBool.push_back(false); vBool.push_back(true); + // 8..=15 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -23,6 +24,7 @@ int main() { vBool.push_back(false); vBool.push_back(true); + // 16..=23 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -32,6 +34,7 @@ int main() { vBool.push_back(false); vBool.push_back(true); + // 24..=31 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -41,6 +44,7 @@ int main() { vBool.push_back(false); vBool.push_back(true); + // 32..=39 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -50,6 +54,7 @@ int main() { vBool.push_back(false); vBool.push_back(true); + // 40..=47 vBool.push_back(false); vBool.push_back(true); vBool.push_back(false); @@ -58,6 +63,38 @@ int main() { vBool.push_back(true); vBool.push_back(false); vBool.push_back(true); + + // 48..=55 + vBool.push_back(true); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + + // 56..=63 + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + + // 64..=71 + vBool.push_back(false); + 
vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + vBool.push_back(true); + vBool.push_back(true); + vBool.push_back(false); + vBool.push_back(true); + + // 72 vBool.push_back(true); std::puts("// Set break point at this line."); diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vector/TestDataFormatterStdVector.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vector/TestDataFormatterStdVector.py index ba8b10450f4fc..d4da60f86a315 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vector/TestDataFormatterStdVector.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/vector/TestDataFormatterStdVector.py @@ -184,6 +184,12 @@ def test_libcxx(self): self.build(dictionary={"USE_LIBCPP": 1}) self.do_test() + @add_test_categories(["msvcstl"]) + def test_msvcstl(self): + # No flags, because the "msvcstl" category checks that the MSVC STL is used by default. + self.build() + self.do_test() + def do_test_ref_and_ptr(self): """Test that that file and class static variables display correctly.""" (self.target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( @@ -215,3 +221,8 @@ def test_ref_and_ptr_libstdcxx_debug(self): def test_ref_and_ptr_libcxx(self): self.build(dictionary={"USE_LIBCPP": 1}) self.do_test_ref_and_ptr() + + @add_test_categories(["msvcstl"]) + def test_ref_and_ptr_msvcstl(self): + self.build() + self.do_test_ref_and_ptr() From f98cf07b7dc7af7716e1da9a1257b8c6416f9a46 Mon Sep 17 00:00:00 2001 From: nerix Date: Wed, 16 Jul 2025 11:07:45 +0200 Subject: [PATCH 008/813] [LLDB] Convert libstdc++ std::variant summary to C++ (#148929) This PR converts the `std::variant` summary from Python to C++. Split from #148554. MSVC's STL and libstdc++ use the same type name for `std::variant`, thus they need one "dispatcher" function that checks the type and calls the appropriate summary. 
For summaries, both need to be implemented in C++. This is mostly a 1:1 translation. The main difference is that in C++, the summary returns `false` if it can't inspect the object properly (e.g. a member could not be found). In Python, this wasn't possible. --- lldb/examples/synthetic/gnu_libstdcpp.py | 32 ------------- .../Language/CPlusPlus/CPlusPlusLanguage.cpp | 8 ++-- .../Plugins/Language/CPlusPlus/LibStdcpp.cpp | 46 +++++++++++++++++++ .../Plugins/Language/CPlusPlus/LibStdcpp.h | 4 ++ 4 files changed, 53 insertions(+), 37 deletions(-) diff --git a/lldb/examples/synthetic/gnu_libstdcpp.py b/lldb/examples/synthetic/gnu_libstdcpp.py index 20b9488af5597..f42a009c21f48 100644 --- a/lldb/examples/synthetic/gnu_libstdcpp.py +++ b/lldb/examples/synthetic/gnu_libstdcpp.py @@ -882,38 +882,6 @@ def update(self): return False -def VariantSummaryProvider(valobj, dict): - raw_obj = valobj.GetNonSyntheticValue() - index_obj = raw_obj.GetChildMemberWithName("_M_index") - data_obj = raw_obj.GetChildMemberWithName("_M_u") - if not (index_obj and index_obj.IsValid() and data_obj and data_obj.IsValid()): - return "" - - def get_variant_npos_value(index_byte_size): - if index_byte_size == 1: - return 0xFF - elif index_byte_size == 2: - return 0xFFFF - else: - return 0xFFFFFFFF - - npos_value = get_variant_npos_value(index_obj.GetByteSize()) - index = index_obj.GetValueAsUnsigned(0) - if index == npos_value: - return " No Value" - - # Strip references and typedefs. - variant_type = raw_obj.GetType().GetCanonicalType().GetDereferencedType() - template_arg_count = variant_type.GetNumberOfTemplateArguments() - - # Invalid index can happen when the variant is not initialized yet. 
- if index >= template_arg_count: - return " " - - active_type = variant_type.GetTemplateArgumentType(index) - return f" Active Type = {active_type.GetDisplayTypeName()} " - - class VariantSynthProvider: def __init__(self, valobj, dict): self.raw_obj = valobj.GetNonSyntheticValue() diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 8724e829835c1..4a3fdede84d32 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -1513,11 +1513,9 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { TypeSummaryImplSP(new ScriptSummaryFormat( stl_summary_flags, "lldb.formatters.cpp.gnu_libstdcpp.ForwardListSummaryProvider"))); - cpp_category_sp->AddTypeSummary( - "^std::variant<.+>$", eFormatterMatchRegex, - TypeSummaryImplSP(new ScriptSummaryFormat( - stl_summary_flags, - "lldb.formatters.cpp.gnu_libstdcpp.VariantSummaryProvider"))); + AddCXXSummary(cpp_category_sp, LibStdcppVariantSummaryProvider, + "libstdc++ std::variant summary provider", "^std::variant<.+>$", + stl_summary_flags, true); AddCXXSynthetic( cpp_category_sp, diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp index c80a52d0f9ed6..595e835b37df9 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp @@ -366,3 +366,49 @@ bool lldb_private::formatters::LibStdcppSmartPointerSummaryProvider( return true; } + +static uint64_t LibStdcppVariantNposValue(size_t index_byte_size) { + switch (index_byte_size) { + case 1: + return 0xff; + case 2: + return 0xffff; + default: + return 0xffff'ffff; + } +} + +bool formatters::LibStdcppVariantSummaryProvider( + ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { + ValueObjectSP valobj_sp = 
valobj.GetNonSyntheticValue(); + if (!valobj_sp) + return false; + + ValueObjectSP index_obj = valobj_sp->GetChildMemberWithName("_M_index"); + ValueObjectSP data_obj = valobj_sp->GetChildMemberWithName("_M_u"); + if (!index_obj || !data_obj) + return false; + + auto index_bytes = index_obj->GetByteSize(); + if (!index_bytes) + return false; + auto npos_value = LibStdcppVariantNposValue(*index_bytes); + auto index = index_obj->GetValueAsUnsigned(0); + if (index == npos_value) { + stream.Printf(" No Value"); + return true; + } + + auto variant_type = + valobj_sp->GetCompilerType().GetCanonicalType().GetNonReferenceType(); + if (!variant_type) + return false; + if (index >= variant_type.GetNumTemplateArguments(true)) { + stream.Printf(" "); + return true; + } + + auto active_type = variant_type.GetTypeTemplateArgument(index, true); + stream << " Active Type = " << active_type.GetDisplayTypeName() << " "; + return true; +} diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.h b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.h index 8d4d777edee88..8d2025e940ead 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.h +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.h @@ -29,6 +29,10 @@ bool LibStdcppUniquePointerSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options); // libstdc++ std::unique_ptr<> +bool LibStdcppVariantSummaryProvider( + ValueObject &valobj, Stream &stream, + const TypeSummaryOptions &options); // libstdc++ std::variant<> + SyntheticChildrenFrontEnd * LibstdcppMapIteratorSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP); From 3483ac50e396f395008cd9456ca575891fb2d24b Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 16 Jul 2025 09:14:14 +0000 Subject: [PATCH 009/813] [gn build] Port 5ae49393957c --- .../gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git 
a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn index fc256c8d14063..0c8e3aa664e38 100644 --- a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn @@ -64,5 +64,6 @@ static_library("CPlusPlus") { "MsvcStl.cpp", "MsvcStlSmartPointer.cpp", "MsvcStlTuple.cpp", + "MsvcStlVector.cpp", ] } From 46b4bd2882447c46138f7272bc915ce6f74bf7bc Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 16 Jul 2025 11:24:17 +0200 Subject: [PATCH 010/813] [libc++] Implement _LIBCPP_SUPPRESS_DEPRECATED macros in terms of _LIBCPP_DIAGNOSTIC macros (#143857) This makes the code a bit more consistent, since we use the `_LIBCPP_DIAGNOSTIC` macros everywhere else. --- libcxx/include/__config | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index 1d547eac30952..ee06abfba7a08 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -416,6 +416,12 @@ typedef __char32_t char32_t; # define _LIBCPP_GCC_DIAGNOSTIC_IGNORED(str) # endif +// Macros to enter and leave a state where deprecation warnings are suppressed. +# define _LIBCPP_SUPPRESS_DEPRECATED_PUSH \ + _LIBCPP_DIAGNOSTIC_PUSH _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated") \ + _LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations") +# define _LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_DIAGNOSTIC_POP + # if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_FAST # define _LIBCPP_HARDENING_SIG f # elif _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_EXTENSIVE @@ -713,17 +719,6 @@ typedef __char32_t char32_t; # define _LIBCPP_DEPRECATED_WITH_CHAR8_T # endif -// Macros to enter and leave a state where deprecation warnings are suppressed. 
-# if defined(_LIBCPP_COMPILER_CLANG_BASED) || defined(_LIBCPP_COMPILER_GCC) -# define _LIBCPP_SUPPRESS_DEPRECATED_PUSH \ - _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wdeprecated\"") \ - _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -# define _LIBCPP_SUPPRESS_DEPRECATED_POP _Pragma("GCC diagnostic pop") -# else -# define _LIBCPP_SUPPRESS_DEPRECATED_PUSH -# define _LIBCPP_SUPPRESS_DEPRECATED_POP -# endif - # if _LIBCPP_STD_VER <= 11 # define _LIBCPP_EXPLICIT_SINCE_CXX14 # else From bbfbe7d789f7b6ade715f3656db0bdd657eeef68 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 16 Jul 2025 11:25:19 +0200 Subject: [PATCH 011/813] [clang][bytecode][NFC] Remove unused function prototypes (#149031) --- clang/lib/AST/ByteCode/Interp.cpp | 8 ++++---- clang/lib/AST/ByteCode/Interp.h | 14 -------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 98fb8c8fcded5..edb1866b5265c 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -815,7 +815,7 @@ bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { return true; } -bool CheckInvoke(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { +static bool CheckInvoke(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!CheckLive(S, OpPC, Ptr, AK_MemberCall)) return false; if (!Ptr.isDummy()) { @@ -937,7 +937,7 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) { return false; } -bool CheckCallDepth(InterpState &S, CodePtr OpPC) { +static bool CheckCallDepth(InterpState &S, CodePtr OpPC) { if ((S.Current->getDepth() + 1) > S.getLangOpts().ConstexprCallDepth) { S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_depth_limit_exceeded) @@ -1092,8 +1092,8 @@ bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return false; } -bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, - const CallExpr *CE, 
unsigned ArgSize) { +static bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, + const CallExpr *CE, unsigned ArgSize) { auto Args = ArrayRef(CE->getArgs(), CE->getNumArgs()); auto NonNullArgs = collectNonNullArgs(F->getDecl(), Args); unsigned Offset = 0; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index 6be68e4a978b5..ce0ebdd8321b7 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -98,26 +98,12 @@ bool CheckGlobalInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr); /// Checks if a value can be stored in a block. bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr); -/// Checks if a method can be invoked on an object. -bool CheckInvoke(InterpState &S, CodePtr OpPC, const Pointer &Ptr); - /// Checks if a value can be initialized. bool CheckInit(InterpState &S, CodePtr OpPC, const Pointer &Ptr); -/// Checks if a method can be called. -bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F); - -/// Checks if calling the currently active function would exceed -/// the allowed call depth. -bool CheckCallDepth(InterpState &S, CodePtr OpPC); - /// Checks the 'this' pointer. bool CheckThis(InterpState &S, CodePtr OpPC, const Pointer &This); -/// Checks if all the arguments annotated as 'nonnull' are in fact not null. -bool CheckNonNullArgs(InterpState &S, CodePtr OpPC, const Function *F, - const CallExpr *CE, unsigned ArgSize); - /// Checks if dynamic memory allocation is available in the current /// language mode. bool CheckDynamicMemoryAllocation(InterpState &S, CodePtr OpPC); From 60579ec3059b2b6cc9dad90eaac1ed363fc395a7 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Wed, 16 Jul 2025 04:31:48 -0500 Subject: [PATCH 012/813] [Support][BLAKE3] Prefix more blake3 methods (#149007) Added by #147948, blake3_xof_many and blake3_compress_subtree_wide cause conflicts when linking llvm and blake3 statically into the same binary. Similar to #148607. 
--- llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h b/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h index d5be360815add..d24657465dd8f 100644 --- a/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h +++ b/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h @@ -10,7 +10,9 @@ #define blake3_hasher llvm_blake3_hasher #define blake3_chunk_state llvm_blake3_chunk_state #define blake3_compress_in_place llvm_blake3_compress_in_place +#define blake3_compress_subtree_wide llvm_blake3_compress_subtree_wide #define blake3_compress_xof llvm_blake3_compress_xof +#define blake3_xof_many llvm_blake3_xof_many #define blake3_hash_many llvm_blake3_hash_many #define blake3_simd_degree llvm_blake3_simd_degree #define blake3_compress_in_place_portable llvm_blake3_compress_in_place_portable From 653872f782e1faaabc1da23769e6b35b10e74bde Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Wed, 16 Jul 2025 10:43:09 +0100 Subject: [PATCH 013/813] [KeyInstr] Fix verifier check (#149043) The verifier check was in the wrong place, meaning it wasn't actually checking many instructions. Fixing that causes a test failure (coro-dwarf-key-instrs.cpp) because coros turn off the feature but still annotate instructions with the metadata (which is a supported situation, but the verifier doesn't like it, and it's hard to teach the verifier to like it). Fix that by avoiding emitting any key instruction metadata if the DISubprogram has opted out of key instructions. 
--- clang/lib/CodeGen/CGDebugInfo.cpp | 4 ++++ llvm/lib/IR/Verifier.cpp | 15 +++++++++------ .../DebugInfo/KeyInstructions/Generic/verify.ll | 3 +++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index b985db7a9494b..75ee08a2bcfa6 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -170,6 +170,10 @@ void CGDebugInfo::addInstToSpecificSourceAtom(llvm::Instruction *KeyInstruction, if (!Group || !CGM.getCodeGenOpts().DebugKeyInstructions) return; + llvm::DISubprogram *SP = KeyInstruction->getFunction()->getSubprogram(); + if (!SP || !SP->getKeyInstructionsEnabled()) + return; + addInstSourceAtomMetadata(KeyInstruction, Group, /*Rank=*/1); llvm::Instruction *BackupI = diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index dc5373e172f28..48688453b6986 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -3185,12 +3185,6 @@ void Verifier::visitFunction(const Function &F) { CheckDI(SP->describes(&F), "!dbg attachment points at wrong subprogram for function", N, &F, &I, DL, Scope, SP); - - if (DL->getAtomGroup()) - CheckDI(DL->getScope()->getSubprogram()->getKeyInstructionsEnabled(), - "DbgLoc uses atomGroup but DISubprogram doesn't have Key " - "Instructions enabled", - DL, DL->getScope()->getSubprogram()); }; for (auto &BB : F) for (auto &I : BB) { @@ -5492,6 +5486,15 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *N = I.getDebugLoc().getAsMDNode()) { CheckDI(isa(N), "invalid !dbg metadata attachment", &I, N); visitMDNode(*N, AreDebugLocsAllowed::Yes); + + if (auto *DL = dyn_cast(N)) { + if (DL->getAtomGroup()) { + CheckDI(DL->getScope()->getSubprogram()->getKeyInstructionsEnabled(), + "DbgLoc uses atomGroup but DISubprogram doesn't have Key " + "Instructions enabled", + DL, DL->getScope()->getSubprogram()); + } + } } if (auto *DII = dyn_cast(&I)) { diff --git 
a/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll b/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll index 0f8f505c51a58..5d73b2669ccda 100644 --- a/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll +++ b/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll @@ -7,6 +7,8 @@ define dso_local void @f() !dbg !10 { entry: +; Include non-key location to check verifier is checking the whole function. + %0 = add i32 0, 0, !dbg !14 ret void, !dbg !13 } @@ -20,3 +22,4 @@ entry: !11 = !DISubroutineType(types: !12) !12 = !{null} !13 = !DILocation(line: 1, column: 11, scope: !10, atomGroup: 1, atomRank: 1) +!14 = !DILocation(line: 1, column: 11, scope: !10) From 38be53aa04de8c6d494de8074328ac8907f3f631 Mon Sep 17 00:00:00 2001 From: Artemiy Bulavin Date: Wed, 16 Jul 2025 11:11:38 +0100 Subject: [PATCH 014/813] [MLIR] Fix use-after-frees when accessing DistinctAttr storage (#148666) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR fixes a use-after-free error that happens when `DistinctAttr` instances are created within a `PassManager` running with crash recovery enabled. The root cause is that `DistinctAttr` storage is allocated in a thread_local allocator, which is destroyed when the crash recovery thread joins, invalidating the storage. Moreover, even without crash reproduction disabling multithreading on the context will destroy the context's thread pool, and in turn delete the threadlocal storage. This means a call to `ctx->disableMulthithreading()` breaks the IR. This PR replaces the thread local allocator with a synchronised allocator that's shared between threads. This persists the lifetime of allocated DistinctAttr storage instances to the lifetime of the context. ### Problem Details: The `DistinctAttributeAllocator` uses a `ThreadLocalCache` for lock-free allocation of `DistinctAttr` storage in a multithreaded context. 
The issue occurs when a `PassManager` is run with crash recovery (`runWithCrashRecovery`), the pass pipeline is executed on a temporary thread spawned by `llvm::CrashRecoveryContext`. Any `DistinctAttr`s created during this execution have their storage allocated in the thread_local cache of this temporary thread. When the thread joins, the thread_local storage is destroyed, freeing the `DistinctAttr`s' memory. If this attribute is accessed later, e.g. when printing, it results in a use-after-free. As mentioned previously, this is also seen after creating some `DistinctAttr`s and then calling `ctx->disableMulthithreading()`. ### Solution `DistinctAttrStorageAllocator` uses a synchronised, shared allocator instead of one wrapped in a `ThreadLocalCache`. The former is what stores the allocator in transient thread_local storage. ### Testing: A C++ unit test has been added to validate this fix. (I was previously reproducing this failure with `mlir-opt` but I can no longer do so and I am unsure why.) ----- Note: This is a 2nd attempt at my previous PR https://github.com/llvm/llvm-project/pull/128566 that was reverted in https://github.com/llvm/llvm-project/pull/133000. I believe I've addressed the TSAN and race condition concerns. 
--- mlir/lib/IR/AttributeDetail.h | 29 ++++++------ mlir/lib/IR/MLIRContext.cpp | 1 + mlir/lib/Pass/PassCrashRecovery.cpp | 9 +++- mlir/unittests/IR/CMakeLists.txt | 1 + .../IR/DistinctAttributeAllocatorTest.cpp | 45 +++++++++++++++++++ 5 files changed, 69 insertions(+), 16 deletions(-) create mode 100644 mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp diff --git a/mlir/lib/IR/AttributeDetail.h b/mlir/lib/IR/AttributeDetail.h index 26d40ac3a38f6..cb9d21bf3e611 100644 --- a/mlir/lib/IR/AttributeDetail.h +++ b/mlir/lib/IR/AttributeDetail.h @@ -19,11 +19,9 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/IntegerSet.h" #include "mlir/IR/MLIRContext.h" -#include "mlir/Support/StorageUniquer.h" -#include "mlir/Support/ThreadLocalCache.h" #include "llvm/ADT/APFloat.h" -#include "llvm/ADT/PointerIntPair.h" -#include "llvm/Support/TrailingObjects.h" +#include "llvm/Support/Allocator.h" +#include namespace mlir { namespace detail { @@ -396,27 +394,30 @@ class DistinctAttributeUniquer { Attribute referencedAttr); }; -/// An allocator for distinct attribute storage instances. It uses thread local -/// bump pointer allocators stored in a thread local cache to ensure the storage -/// is freed after the destruction of the distinct attribute allocator. -class DistinctAttributeAllocator { +/// An allocator for distinct attribute storage instances. Uses a synchronized +/// BumpPtrAllocator to ensure thread-safety. The allocated storage is deleted +/// when the DistinctAttributeAllocator is destroyed. +class DistinctAttributeAllocator final { public: DistinctAttributeAllocator() = default; - DistinctAttributeAllocator(DistinctAttributeAllocator &&) = delete; DistinctAttributeAllocator(const DistinctAttributeAllocator &) = delete; DistinctAttributeAllocator & operator=(const DistinctAttributeAllocator &) = delete; - /// Allocates a distinct attribute storage using a thread local bump pointer - /// allocator to enable synchronization free parallel allocations. 
DistinctAttrStorage *allocate(Attribute referencedAttr) { - return new (allocatorCache.get().Allocate()) + std::scoped_lock guard(allocatorMutex); + return new (allocator.Allocate()) DistinctAttrStorage(referencedAttr); - } + }; private: - ThreadLocalCache allocatorCache; + /// Used to allocate distict attribute storages. The managed memory is freed + /// automatically when the allocator instance is destroyed. + llvm::BumpPtrAllocator allocator; + + /// Used to lock access to the allocator. + std::mutex allocatorMutex; }; } // namespace detail } // namespace mlir diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp index 716d9c85a377d..06ec1c85fb4d5 100644 --- a/mlir/lib/IR/MLIRContext.cpp +++ b/mlir/lib/IR/MLIRContext.cpp @@ -31,6 +31,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/RWMutex.h" #include "llvm/Support/ThreadPool.h" diff --git a/mlir/lib/Pass/PassCrashRecovery.cpp b/mlir/lib/Pass/PassCrashRecovery.cpp index 08f5114ae6eb2..3c9735f910094 100644 --- a/mlir/lib/Pass/PassCrashRecovery.cpp +++ b/mlir/lib/Pass/PassCrashRecovery.cpp @@ -411,14 +411,19 @@ struct FileReproducerStream : public mlir::ReproducerStream { LogicalResult PassManager::runWithCrashRecovery(Operation *op, AnalysisManager am) { + const bool threadingEnabled = getContext()->isMultithreadingEnabled(); crashReproGenerator->initialize(getPasses(), op, verifyPasses); // Safely invoke the passes within a recovery context. 
LogicalResult passManagerResult = failure(); llvm::CrashRecoveryContext recoveryContext; - recoveryContext.RunSafelyOnThread( - [&] { passManagerResult = runPasses(op, am); }); + const auto runPassesFn = [&] { passManagerResult = runPasses(op, am); }; + if (threadingEnabled) + recoveryContext.RunSafelyOnThread(runPassesFn); + else + recoveryContext.RunSafely(runPassesFn); crashReproGenerator->finalize(op, passManagerResult); + return passManagerResult; } diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt index d22afb3003e76..a46e64718dab9 100644 --- a/mlir/unittests/IR/CMakeLists.txt +++ b/mlir/unittests/IR/CMakeLists.txt @@ -6,6 +6,7 @@ add_mlir_unittest(MLIRIRTests AttrTypeReplacerTest.cpp Diagnostic.cpp DialectTest.cpp + DistinctAttributeAllocatorTest.cpp InterfaceTest.cpp IRMapping.cpp InterfaceAttachmentTest.cpp diff --git a/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp b/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp new file mode 100644 index 0000000000000..99067d09f7bed --- /dev/null +++ b/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp @@ -0,0 +1,45 @@ +//=== DistinctAttributeAllocatorTest.cpp - DistinctAttr storage alloc test ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "gtest/gtest.h" + +#include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/MLIRContext.h" +#include "llvm/Support/CrashRecoveryContext.h" +#include + +using namespace mlir; + +// +// Test that a DistinctAttr that is created on a separate thread does +// not have its storage deleted when the thread joins. 
+// +TEST(DistinctAttributeAllocatorTest, TestAttributeWellFormedAfterThreadJoin) { + MLIRContext ctx; + OpBuilder builder(&ctx); + DistinctAttr attr; + + std::thread t([&ctx, &attr]() { + attr = DistinctAttr::create(UnitAttr::get(&ctx)); + ASSERT_TRUE(attr); + }); + t.join(); + + // If the attribute storage got deleted after the thread joins (which we don't + // want) then trying to access it triggers an assert in Debug mode, and a + // crash otherwise. Run this in a CrashRecoveryContext to avoid bringing down + // the whole test suite if this test fails. Additionally, MSAN and/or TSAN + // should raise failures here if the attribute storage was deleted. + llvm::CrashRecoveryContext crc; + EXPECT_TRUE(crc.RunSafely([attr]() { (void)attr.getAbstractAttribute(); })); + EXPECT_TRUE( + crc.RunSafely([attr]() { (void)*cast(attr).getImpl(); })); + + ASSERT_TRUE(attr); +} From 5b8c15c6e7f3ac17383c12483f466a721b1040ba Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 16 Jul 2025 11:41:32 +0100 Subject: [PATCH 015/813] [DebugInfo] Remove getPrevNonDebugInstruction (#148859) With the advent of intrinsic-less debug-info, we no longer need to scatter calls to getPrevNonDebugInstruction around the codebase. Remove most of them -- there are one or two that have the "SkipPseudoOp" flag turned on, however they don't seem to be in positions where skipping anything would be reasonable. 
--- llvm/include/llvm/IR/Instruction.h | 11 ----------- .../llvm/Transforms/Utils/LockstepReverseIterator.h | 4 ++-- llvm/lib/CodeGen/CodeGenPrepare.cpp | 4 ++-- llvm/lib/CodeGen/StackProtector.cpp | 2 +- llvm/lib/IR/Instruction.cpp | 8 -------- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 2 +- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 2 +- .../Transforms/InstCombine/InstructionCombining.cpp | 2 +- .../Transforms/Instrumentation/AddressSanitizer.cpp | 2 +- llvm/lib/Transforms/Scalar/GVN.cpp | 2 +- llvm/lib/Transforms/Scalar/GVNSink.cpp | 2 +- llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp | 3 +-- llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp | 8 ++++---- 13 files changed, 16 insertions(+), 36 deletions(-) diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index c317a06753970..5d25804a684ac 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -898,17 +898,6 @@ class Instruction : public User, /// Return true if the instruction is a DbgInfoIntrinsic or PseudoProbeInst. LLVM_ABI bool isDebugOrPseudoInst() const LLVM_READONLY; - /// Return a pointer to the previous non-debug instruction in the same basic - /// block as 'this', or nullptr if no such instruction exists. Skip any pseudo - /// operations if \c SkipPseudoOp is true. 
- LLVM_ABI const Instruction * - getPrevNonDebugInstruction(bool SkipPseudoOp = false) const; - Instruction *getPrevNonDebugInstruction(bool SkipPseudoOp = false) { - return const_cast( - static_cast(this)->getPrevNonDebugInstruction( - SkipPseudoOp)); - } - /// Create a copy of 'this' instruction that is identical in all ways except /// the following: /// * The instruction has no parent diff --git a/llvm/include/llvm/Transforms/Utils/LockstepReverseIterator.h b/llvm/include/llvm/Transforms/Utils/LockstepReverseIterator.h index cd525a9710103..5b92b33a10ea0 100644 --- a/llvm/include/llvm/Transforms/Utils/LockstepReverseIterator.h +++ b/llvm/include/llvm/Transforms/Utils/LockstepReverseIterator.h @@ -61,7 +61,7 @@ class LockstepReverseIterator } Insts.clear(); for (BasicBlock *BB : Blocks) { - Instruction *Prev = BB->getTerminator()->getPrevNonDebugInstruction(); + Instruction *Prev = BB->getTerminator()->getPrevNode(); if (!Prev) { // Block wasn't big enough - only contained a terminator. 
if constexpr (EarlyFailure) { @@ -108,7 +108,7 @@ class LockstepReverseIterator return *this; SmallVector NewInsts; for (Instruction *Inst : Insts) { - Instruction *Prev = Inst->getPrevNonDebugInstruction(); + Instruction *Prev = Inst->getPrevNode(); if (!Prev) { if constexpr (!EarlyFailure) { this->ActiveBlocks.remove(Inst->getParent()); diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 70a9788c76e1f..d9d41f1d72e35 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -3015,7 +3015,7 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, // %phi = phi ptr [ %0, %bb0 ], [ %2, %entry ] if (PredBB && PredBB->getSingleSuccessor() == BB) CI = dyn_cast_or_null( - PredBB->getTerminator()->getPrevNonDebugInstruction(true)); + PredBB->getTerminator()->getPrevNode()); if (CI && CI->use_empty() && isIntrinsicOrLFToBeTailCalled(TLInfo, CI) && @@ -3032,7 +3032,7 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, for (BasicBlock *Pred : predecessors(BB)) { if (!VisitedBBs.insert(Pred).second) continue; - if (Instruction *I = Pred->rbegin()->getPrevNonDebugInstruction(true)) { + if (Instruction *I = Pred->rbegin()->getPrevNode()) { CallInst *CI = dyn_cast(I); if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) && attributesPermitTailCall(F, CI, RetI, *TLI)) { diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 3ec70083b7043..9cc9af88c5e4f 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -626,7 +626,7 @@ bool InsertStackProtectors(const TargetMachine *TM, Function *F, // If we're instrumenting a block with a tail call, the check has to be // inserted before the call rather than between it and the return. 
- Instruction *Prev = CheckLoc->getPrevNonDebugInstruction(); + Instruction *Prev = CheckLoc->getPrevNode(); if (auto *CI = dyn_cast_if_present(Prev)) if (CI->isTailCall() && isInTailCallPosition(*CI, *TM)) CheckLoc = Prev; diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index c6dca727e0e89..763cc1832b794 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -1235,14 +1235,6 @@ bool Instruction::isDebugOrPseudoInst() const { return isa(this) || isa(this); } -const Instruction * -Instruction::getPrevNonDebugInstruction(bool SkipPseudoOp) const { - for (const Instruction *I = getPrevNode(); I; I = I->getPrevNode()) - if (!isa(I) && !(SkipPseudoOp && isa(I))) - return I; - return nullptr; -} - const DebugLoc &Instruction::getStableDebugLoc() const { return getDebugLoc(); } diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index 5de2285c2d2e3..5e2247f2a88d0 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -2875,7 +2875,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain { if (It->getSecond().IsReachedFromAlignedBarrierOnly) break; return false; - } while ((CurI = CurI->getPrevNonDebugInstruction())); + } while ((CurI = CurI->getPrevNode())); // Delayed decision on the forward pass to allow aligned barrier detection // in the backwards traversal. 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 73293bb5f4a0e..3321435a6fecb 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3933,7 +3933,7 @@ Instruction *InstCombinerImpl::visitFenceInst(FenceInst &FI) { if (NFI && isIdenticalOrStrongerFence(NFI, &FI)) return eraseInstFromFunction(FI); - if (auto *PFI = dyn_cast_or_null(FI.getPrevNonDebugInstruction())) + if (auto *PFI = dyn_cast_or_null(FI.getPrevNode())) if (isIdenticalOrStrongerFence(PFI, &FI)) return eraseInstFromFunction(FI); return nullptr; diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 91a1b61ddc483..b587d76465803 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -3890,7 +3890,7 @@ bool InstCombinerImpl::removeInstructionsBeforeUnreachable(Instruction &I) { // This includes instructions like stores and "llvm.assume" that may not get // removed by simple dead code elimination. bool Changed = false; - while (Instruction *Prev = I.getPrevNonDebugInstruction()) { + while (Instruction *Prev = I.getPrevNode()) { // While we theoretically can erase EH, that would result in a block that // used to start with an EH no longer starting with EH, which is invalid. // To make it valid, we'd need to fixup predecessors to no longer refer to diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index dfbe4f8172066..5957940add577 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -3424,7 +3424,7 @@ static void findStoresToUninstrumentedArgAllocas( isa(cast(Val)->getOperand(0)) && // Check that the cast appears directly before the store. 
Otherwise // moving the cast before InsBefore may break the IR. - Val == It->getPrevNonDebugInstruction(); + Val == It->getPrevNode(); bool IsArgInit = IsDirectArgInit || IsArgInitViaCast; if (!IsArgInit) continue; diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index d9d05c3e8cc49..8bff458f88bb9 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1310,7 +1310,7 @@ static Value *findDominatingValue(const MemoryLocation &Loc, Type *LoadTy, BatchAAResults BatchAA(*AA); for (BasicBlock *BB = FromBB; BB; BB = BB->getSinglePredecessor()) for (auto *Inst = BB == FromBB ? From : BB->getTerminator(); - Inst != nullptr; Inst = Inst->getPrevNonDebugInstruction()) { + Inst != nullptr; Inst = Inst->getPrevNode()) { // Stop the search if limit is reached. if (++NumVisitedInsts > MaxNumVisitedInsts) return nullptr; diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index 2058df33ea331..a5fc0b4c6904d 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -799,7 +799,7 @@ void GVNSink::sinkLastInstruction(ArrayRef Blocks, BasicBlock *BBEnd) { SmallVector Insts; for (BasicBlock *BB : Blocks) - Insts.push_back(BB->getTerminator()->getPrevNonDebugInstruction()); + Insts.push_back(BB->getTerminator()->getPrevNode()); Instruction *I0 = Insts.front(); SmallVector NewOperands; diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index a09303bb4469f..60e5df08c6efd 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -194,8 +194,7 @@ static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F, // Calls to experimental_deoptimize must be followed by a return // of the value computed by experimental_deoptimize. // I.e., we can not change `ret` to `br` for this block. 
- if (auto *CI = - dyn_cast_or_null(Term->getPrevNonDebugInstruction())) { + if (auto *CI = dyn_cast_or_null(Term->getPrevNode())) { if (Function *F = CI->getCalledFunction()) if (Intrinsic::ID ID = F->getIntrinsicID()) if (ID == Intrinsic::experimental_deoptimize) diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 55dc8a79fd9ab..d6b578aa8ffd1 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -2736,8 +2736,8 @@ TEST_P(OpenMPIRBuilderTestWithParams, DynamicWorkShareLoop) { EXPECT_EQ(OrigUpperBound->getValue(), 21); EXPECT_EQ(OrigStride->getValue(), 1); - CallInst *FiniCall = dyn_cast( - &*(LatchBlock->getTerminator()->getPrevNonDebugInstruction(true))); + CallInst *FiniCall = + dyn_cast(&*(LatchBlock->getTerminator()->getPrevNode())); EXPECT_EQ(FiniCall, nullptr); // The original loop iterator should only be used in the condition, in the @@ -2840,8 +2840,8 @@ TEST_F(OpenMPIRBuilderTest, DynamicWorkShareLoopOrdered) { EXPECT_EQ(SchedVal->getValue(), static_cast(OMPScheduleType::OrderedStaticChunked)); - CallInst *FiniCall = dyn_cast( - &*(LatchBlock->getTerminator()->getPrevNonDebugInstruction(true))); + CallInst *FiniCall = + dyn_cast(&*(LatchBlock->getTerminator()->getPrevNode())); ASSERT_NE(FiniCall, nullptr); EXPECT_EQ(FiniCall->getCalledFunction()->getName(), "__kmpc_dispatch_fini_4u"); From e333d6019dafd84f9aef919894d8c82389cde001 Mon Sep 17 00:00:00 2001 From: Julian Nagele Date: Wed, 16 Jul 2025 12:41:59 +0200 Subject: [PATCH 016/813] [AArch64] Replace expensive move from wzr by two moves via floating point immediate (#146538) We've noticed that inserting 0 into a known vector lane is implemented via a move from wzr, i.e., moving between register banks. We think it will be cheaper (and have seen improvements on our benchmarks) to materialize 0 into a floating point register and insert from there. 
PR: https://github.com/llvm/llvm-project/pull/146538 --- llvm/lib/Target/AArch64/AArch64Features.td | 7 ++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 ++ llvm/lib/Target/AArch64/AArch64Processors.td | 18 +++-- .../CodeGen/AArch64/arm64-vector-insertion.ll | 71 +++++++++++++++++-- llvm/test/CodeGen/AArch64/vecreduce-fadd.ll | 3 +- 5 files changed, 89 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 9973df865ea17..c1c1f0a1024d0 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -840,6 +840,13 @@ def FeatureDisableFastIncVL : SubtargetFeature<"disable-fast-inc-vl", "HasDisableFastIncVL", "true", "Do not prefer INC/DEC, ALL, { 1, 2, 4 } over ADDVL">; +// On most processors we want to avoid moving from WZR to vector registers +// (relying on materializing 0 to a FPR and moving from there instead), +// but on some (in-order) cores it's preferable to avoid the extra instruction instead. +def FeatureUseWzrToVecMove : SubtargetFeature<"use-wzr-to-vec-move", + "UseWzrToVecMove", "true", + "Move from WZR to insert 0 into vector registers">; + //===----------------------------------------------------------------------===// // Architectures. // diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ce91b72fa24e5..6c46b18d506c5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -419,6 +419,8 @@ def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", def AllowMisalignedMemAccesses : Predicate<"!Subtarget->requiresStrictAlign()">; +def UseWzrToVecMove : Predicate<"Subtarget->useWzrToVecMove()">; + //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. 
@@ -7377,6 +7379,7 @@ def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), (i64 0)), dsub)>; +let Predicates = [UseWzrToVecMove] in { def : Pat<(vector_insert (v8f16 V128:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), (INSvi16gpr V128:$Rn, VectorIndexH:$imm, WZR)>; def : Pat<(vector_insert (v4f16 V64:$Rn), (f16 fpimm0), (i64 VectorIndexH:$imm)), @@ -7387,6 +7390,7 @@ def : Pat<(vector_insert (v2f32 V64:$Rn), (f32 fpimm0), (i64 VectorIndexS:$imm)) (EXTRACT_SUBREG (INSvi32gpr (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, WZR), dsub)>; def : Pat<(vector_insert v2f64:$Rn, (f64 fpimm0), (i64 VectorIndexD:$imm)), (INSvi64gpr V128:$Rn, VectorIndexS:$imm, XZR)>; +} def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 5379305bc7a7f..adc984ad795af 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -21,40 +21,46 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320", "Cortex-A320 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53", "Cortex-A53 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureBalanceFPOps, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA55 : SubtargetFeature<"a55", "ARMProcFamily", "CortexA55", "Cortex-A55 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeaturePostRAScheduler, - FeatureFuseAddress]>; + FeatureFuseAddress, + FeatureUseWzrToVecMove]>; def TuneA510 : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510", "Cortex-A510 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler + FeaturePostRAScheduler, + FeatureUseWzrToVecMove ]>; def TuneA520 : 
SubtargetFeature<"a520", "ARMProcFamily", "CortexA520", "Cortex-A520 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA520AE : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520", "Cortex-A520AE ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeaturePostRAScheduler]>; + FeaturePostRAScheduler, + FeatureUseWzrToVecMove]>; def TuneA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57", "Cortex-A57 ARM processors", [ diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll index ff28c7817d143..bae254bbd2104 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll @@ -172,8 +172,9 @@ define <8 x half> @test_insert_v8f16_insert_1(half %a) { ; CHECK-LABEL: test_insert_v8f16_insert_1: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: dup.8h v0, v0[0] -; CHECK-NEXT: mov.h v0[7], wzr +; CHECK-NEXT: mov.h v0[7], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <8 x half> , half %a, i32 0 %v.1 = insertelement <8 x half> %v.0, half %a, i32 1 @@ -278,8 +279,9 @@ define <4 x float> @test_insert_3_f32_undef_zero_vector(float %a) { ; CHECK-LABEL: test_insert_3_f32_undef_zero_vector: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: dup.4s v0, v0[0] -; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> , float %a, i32 0 %v.1 = insertelement <4 x float> %v.0, float %a, i32 1 @@ -347,12 +349,12 @@ define <8 x i16> @test_insert_v8i16_i16_zero(<8 x i16> %a) { ret <8 x i16> %v.0 } -; TODO: This should jsut be a mov.s v0[3], wzr define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) { ; CHECK-LABEL: test_insert_v4f16_f16_zero: ; CHECK: // 
%bb.0: +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov.h v0[0], wzr +; CHECK-NEXT: mov.h v0[0], v1[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0 @@ -362,7 +364,8 @@ define <4 x half> @test_insert_v4f16_f16_zero(<4 x half> %a) { define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) { ; CHECK-LABEL: test_insert_v8f16_f16_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov.h v0[6], wzr +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov.h v0[6], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6 ret <8 x half> %v.0 @@ -371,8 +374,9 @@ define <8 x half> @test_insert_v8f16_f16_zero(<8 x half> %a) { define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) { ; CHECK-LABEL: test_insert_v2f32_f32_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d1, #0000000000000000 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: mov.s v0[0], wzr +; CHECK-NEXT: mov.s v0[0], v1[0] ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0 @@ -382,7 +386,8 @@ define <2 x float> @test_insert_v2f32_f32_zero(<2 x float> %a) { define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) { ; CHECK-LABEL: test_insert_v4f32_f32_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov.s v0[3], v1[0] ; CHECK-NEXT: ret %v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3 ret <4 x float> %v.0 @@ -391,8 +396,60 @@ define <4 x float> @test_insert_v4f32_f32_zero(<4 x float> %a) { define <2 x double> @test_insert_v2f64_f64_zero(<2 x double> %a) { ; CHECK-LABEL: test_insert_v2f64_f64_zero: ; CHECK: // %bb.0: +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: mov.d v0[1], v1[0] +; CHECK-NEXT: ret + %v.0 = insertelement <2 x double> %a, double 
0.000000e+00, i32 1 + ret <2 x double> %v.0 +} + +define <4 x half> @test_insert_v4f16_f16_zero_wzr(<4 x half> %a) #1 { +; CHECK-LABEL: test_insert_v4f16_f16_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov.h v0[0], wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %v.0 = insertelement <4 x half> %a, half 0.000000e+00, i32 0 + ret <4 x half> %v.0 +} + +define <8 x half> @test_insert_v8f16_f16_zero_wzr(<8 x half> %a) #1 { +; CHECK-LABEL: test_insert_v8f16_f16_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: mov.h v0[6], wzr +; CHECK-NEXT: ret + %v.0 = insertelement <8 x half> %a, half 0.000000e+00, i32 6 + ret <8 x half> %v.0 +} + +define <2 x float> @test_insert_v2f32_f32_zero_wzr(<2 x float> %a) #1 { +; CHECK-LABEL: test_insert_v2f32_f32_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: mov.s v0[0], wzr +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ret + %v.0 = insertelement <2 x float> %a, float 0.000000e+00, i32 0 + ret <2 x float> %v.0 +} + +define <4 x float> @test_insert_v4f32_f32_zero_wzr(<4 x float> %a) #1 { +; CHECK-LABEL: test_insert_v4f32_f32_zero_wzr: +; CHECK: // %bb.0: +; CHECK-NEXT: mov.s v0[3], wzr +; CHECK-NEXT: ret + %v.0 = insertelement <4 x float> %a, float 0.000000e+00, i32 3 + ret <4 x float> %v.0 +} + +define <2 x double> @test_insert_v2f64_f64_zero_xzr(<2 x double> %a) #1 { +; CHECK-LABEL: test_insert_v2f64_f64_zero_xzr: +; CHECK: // %bb.0: ; CHECK-NEXT: mov.d v0[1], xzr ; CHECK-NEXT: ret %v.0 = insertelement <2 x double> %a, double 0.000000e+00, i32 1 ret <2 x double> %v.0 } + +attributes #1 = {"tune-cpu"="cortex-a55"} diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll index 8a84d3ca2328c..59dfcf9850a49 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll @@ -63,8 +63,9 @@ define half 
@add_v3HalfH(<3 x half> %bin.rdx) { ; ; CHECK-SD-FP16-LABEL: add_v3HalfH: ; CHECK-SD-FP16: // %bb.0: +; CHECK-SD-FP16-NEXT: movi d1, #0000000000000000 ; CHECK-SD-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-FP16-NEXT: mov v0.h[3], wzr +; CHECK-SD-FP16-NEXT: mov v0.h[3], v1.h[0] ; CHECK-SD-FP16-NEXT: faddp v0.4h, v0.4h, v0.4h ; CHECK-SD-FP16-NEXT: faddp h0, v0.2h ; CHECK-SD-FP16-NEXT: ret From b117ccf40df16bcd0b24e2a91d8f749ddb7933f3 Mon Sep 17 00:00:00 2001 From: Vikram Hegde <115221833+vikramRH@users.noreply.github.com> Date: Wed, 16 Jul 2025 16:30:58 +0530 Subject: [PATCH 017/813] [CodeGen][NPM] Account inserted passes for -start/stop options (#148111) same as https://github.com/llvm/llvm-project/pull/138830 This partly solves the issue https://github.com/llvm/llvm-project/issues/138831 for -enable-new-pm. https://github.com/llvm/llvm-project/pull/137290 will not have this problem, but this needs to be added until we migrate to the new pass builder structure. Even with this, there is no way to -start-after an inserted pass right now. 
Co-authored-by : Oke, Akshat <[Akshat.Oke@amd.com](mailto:Akshat.Oke@amd.com)> --- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 4 +++- llvm/test/tools/llc/new-pm/start-stop-inserted.ll | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llc/new-pm/start-stop-inserted.ll diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index dc5f3f80f547e..a8176ebb776cf 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -579,8 +579,10 @@ template class CodeGenPassBuilder { void insertPass(InsertedPassT &&Pass) const { AfterCallbacks.emplace_back( [&](StringRef Name, MachineFunctionPassManager &MFPM) mutable { - if (Name == TargetPassT::name()) + if (Name == TargetPassT::name() && + runBeforeAdding(InsertedPassT::name())) { MFPM.addPass(std::forward(Pass)); + } }); } diff --git a/llvm/test/tools/llc/new-pm/start-stop-inserted.ll b/llvm/test/tools/llc/new-pm/start-stop-inserted.ll new file mode 100644 index 0000000000000..ce5ad2d9e5065 --- /dev/null +++ b/llvm/test/tools/llc/new-pm/start-stop-inserted.ll @@ -0,0 +1,15 @@ +; REQUIRES: amdgpu-registered-target + +; AMDGPU inserts the fourth instance of dead-mi-elimination pass after detect-dead-lanes +; This checks that the pipeline stops before that. + +; RUN: llc -mtriple=amdgcn-amd-amdhsa -O3 -enable-new-pm -stop-before=dead-mi-elimination,4 --print-pipeline-passes -filetype=null %s | FileCheck %s + +; There is no way to -start-after an inserted pass right now. 
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -O3 -enable-new-pm -start-after=dead-mi-elimination,4 --print-pipeline-passes -filetype=null %s + + +; CHECK: dead-mi-elimination +; CHECK: dead-mi-elimination +; CHECK: dead-mi-elimination +; CHECK-NOT: dead-mi-elimination From 73630d5e20c8f29aec426954f448079665caf3ab Mon Sep 17 00:00:00 2001 From: Marina Taylor Date: Wed, 16 Jul 2025 12:15:23 +0100 Subject: [PATCH 018/813] [Support] Error if SocketPath is too long (#148903) If the path is longer than sockaddr_un's buffer, it will be truncated, at which point it may become indistinguishable from similar truncated paths. This will cause `bind` to fail with an "Address already in use" error. There is some existing code that checks `fs::exists` to catch these errors, but since `fs::exists` compares the full un-truncated paths, a too-long path will prevent those checks from working. rdar://154397133 --- llvm/lib/Support/raw_socket_stream.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp index fd1c681672138..cf51de889deaf 100644 --- a/llvm/lib/Support/raw_socket_stream.cpp +++ b/llvm/lib/Support/raw_socket_stream.cpp @@ -119,6 +119,14 @@ ListeningSocket::ListeningSocket(ListeningSocket &&LS) Expected ListeningSocket::createUnix(StringRef SocketPath, int MaxBacklog) { + // If SocketPath is too long, the path will be truncated, and there may be + // collisions with other truncated addresses that the fs::exists check below + // will be unable to detect. 
+ if (SocketPath.size() >= sizeof(sockaddr_un::sun_path)) + return llvm::make_error( + std::make_error_code(std::errc::filename_too_long), + "SocketPath too long"); + // Handle instances where the target socket address already exists and // differentiate between a preexisting file with and without a bound socket // From 3b8a18c27a1e70895feac15d48b3a6122e6b377f Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Wed, 16 Jul 2025 13:26:10 +0200 Subject: [PATCH 019/813] [clang][bytecode] Fix contains check using llvm::find (#149050) We need to compare to the end() interator. --- clang/lib/AST/ByteCode/Interp.cpp | 2 +- clang/test/AST/ByteCode/placement-new.cpp | 8 ++++++++ clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index edb1866b5265c..e8b519478c026 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -574,7 +574,7 @@ bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { // The This pointer is writable in constructors and destructors, // even if isConst() returns true. 
- if (llvm::find(S.InitializingBlocks, Ptr.block())) + if (llvm::is_contained(S.InitializingBlocks, Ptr.block())) return true; const QualType Ty = Ptr.getType(); diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp index 670def2d5870e..b587cd6eaf89c 100644 --- a/clang/test/AST/ByteCode/placement-new.cpp +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -486,3 +486,11 @@ namespace bitcast { } static_assert(foo() == 0); } + +constexpr int modify_const_variable() { + const int a = 10; + new ((int *)&a) int(12); // both-note {{modification of object of const-qualified type 'const int' is not allowed in a constant expression}} + return a; +} +static_assert(modify_const_variable()); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} diff --git a/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp b/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp index 6f6f9b04aa392..4cf0e9ffe1d64 100644 --- a/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp +++ b/clang/test/SemaCXX/cxx2c-constexpr-placement-new.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -std=c++2c -verify %s +// RUN: %clang_cc1 -std=c++2c -verify %s -fexperimental-new-constant-interpreter namespace std { From 402b989693a0d5d17be6bf996bce52cf3ca73886 Mon Sep 17 00:00:00 2001 From: macurtis-amd Date: Wed, 16 Jul 2025 06:37:08 -0500 Subject: [PATCH 020/813] AMDGPU: Fix assert when multi operands to update after folding imm (#148205) In the original motivating test case, [FoldList](https://github.com/llvm/llvm-project/blob/d8a2141ff98ee35cd1886f536ccc3548b012820b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp#L1764) had entries: ``` #0: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc UseOpNo: 1 #1: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc UseOpNo: 2 ``` After calling 
[updateOperand(#0)](https://github.com/llvm/llvm-project/blob/d8a2141ff98ee35cd1886f536ccc3548b012820b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp#L1773), [tryConstantFoldOp(#0.UseMI)](https://github.com/llvm/llvm-project/blob/d8a2141ff98ee35cd1886f536ccc3548b012820b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp#L1786) removed operand 1, and entry #​1.UseOpNo was no longer valid, resulting in an [assert](https://github.com/llvm/llvm-project/blob/4a35214bddbb67f9597a500d48ab8c4fb25af150/llvm/include/llvm/ADT/ArrayRef.h#L452). This change defers constant folding until all operands have been updated so that UseOpNo values remain stable. --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14 ++++++++++---- .../bug-multi-operands-to-update-after-fold.mir | 15 +++++++++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 0ed06c37507af..e172c0b63189b 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1761,6 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); + SetVector ConstantFoldCandidates; for (FoldCandidate &Fold : FoldList) { assert(!Fold.isReg() || Fold.Def.OpToFold); if (Fold.isReg() && Fold.getReg().isVirtual()) { @@ -1783,16 +1784,21 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, << static_cast(Fold.UseOpNo) << " of " << *Fold.UseMI); - if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) { - LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI); - Changed = true; - } + if (Fold.isImm()) + ConstantFoldCandidates.insert(Fold.UseMI); } else if (Fold.Commuted) { // Restoring instruction's original operand order if fold has failed. 
TII->commuteInstruction(*Fold.UseMI, false); } } + + for (MachineInstr *MI : ConstantFoldCandidates) { + if (tryConstantFoldOp(MI)) { + LLVM_DEBUG(dbgs() << "Constant folded " << *MI); + Changed = true; + } + } return true; } diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir new file mode 100644 index 0000000000000..d0c9740c6954e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir @@ -0,0 +1,15 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s +--- +name: snork +body: | + bb.0: + ; CHECK-LABEL: name: snork + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: SI_RETURN + %0:sreg_32 = S_MOV_B32 0 + %1:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 + %2:sreg_32 = S_OR_B32 %1.sub0, %1.sub3, implicit-def dead $scc + SI_RETURN +... From 62bd778fd46e647bf3c32540651413f848448f19 Mon Sep 17 00:00:00 2001 From: enh-google Date: Wed, 16 Jul 2025 07:39:17 -0400 Subject: [PATCH 021/813] Remove workarounds for NDK versions before 2017's r16. 
(#148879) --- libcxx/include/CMakeLists.txt | 1 - libcxx/include/__locale_dir/locale_base_api.h | 2 - .../__locale_dir/locale_base_api/android.h | 45 ------------------- libcxx/include/module.modulemap.in | 1 - .../gn/secondary/libcxx/include/BUILD.gn | 1 - 5 files changed, 50 deletions(-) delete mode 100644 libcxx/include/__locale_dir/locale_base_api/android.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 4f2a8dddad92c..d729fa81e2b2f 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -514,7 +514,6 @@ set(files __locale_dir/check_grouping.h __locale_dir/get_c_locale.h __locale_dir/locale_base_api.h - __locale_dir/locale_base_api/android.h __locale_dir/locale_base_api/bsd_locale_fallbacks.h __locale_dir/locale_base_api/ibm.h __locale_dir/locale_base_api/musl.h diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h index 8dbc28e839839..9f3ce02a3af20 100644 --- a/libcxx/include/__locale_dir/locale_base_api.h +++ b/libcxx/include/__locale_dir/locale_base_api.h @@ -129,8 +129,6 @@ // will define those directly. # if defined(_AIX) || defined(__MVS__) # include <__locale_dir/locale_base_api/ibm.h> -# elif defined(__ANDROID__) -# include <__locale_dir/locale_base_api/android.h> # elif defined(__OpenBSD__) # include <__locale_dir/locale_base_api/openbsd.h> # elif defined(__wasi__) || _LIBCPP_HAS_MUSL_LIBC diff --git a/libcxx/include/__locale_dir/locale_base_api/android.h b/libcxx/include/__locale_dir/locale_base_api/android.h deleted file mode 100644 index 36b8d93e1b228..0000000000000 --- a/libcxx/include/__locale_dir/locale_base_api/android.h +++ /dev/null @@ -1,45 +0,0 @@ -// -*- C++ -*- -//===-----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H -#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H - -#include - -// FIXME: Is this actually required? -extern "C" { -#include -} - -#include - -// If we do not have this header, we are in a platform build rather than an NDK -// build, which will always be at least as new as the ToT NDK, in which case we -// don't need any of the inlines below since libc provides them. -#if __has_include() -# include -// In NDK versions later than 16, locale-aware functions are provided by -// legacy_stdlib_inlines.h -# if __NDK_MAJOR__ <= 16 -# if __ANDROID_API__ < 26 - -inline _LIBCPP_HIDE_FROM_ABI float strtof_l(const char* __nptr, char** __endptr, locale_t) { - return ::strtof(__nptr, __endptr); -} - -inline _LIBCPP_HIDE_FROM_ABI double strtod_l(const char* __nptr, char** __endptr, locale_t) { - return ::strtod(__nptr, __endptr); -} - -# endif // __ANDROID_API__ < 26 - -# endif // __NDK_MAJOR__ <= 16 -#endif // __has_include() - -#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 61ba1c381b2b3..602e72bbf5b01 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -1590,7 +1590,6 @@ module std [system] { } module locale_base_api { - textual header "__locale_dir/locale_base_api/android.h" textual header "__locale_dir/locale_base_api/bsd_locale_fallbacks.h" textual header "__locale_dir/locale_base_api/ibm.h" textual header "__locale_dir/locale_base_api/musl.h" diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index d594141e358a3..9a34f6b27d026 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1172,7 +1172,6 @@ if 
(current_toolchain == default_toolchain) { "__locale_dir/check_grouping.h", "__locale_dir/get_c_locale.h", "__locale_dir/locale_base_api.h", - "__locale_dir/locale_base_api/android.h", "__locale_dir/locale_base_api/bsd_locale_fallbacks.h", "__locale_dir/locale_base_api/ibm.h", "__locale_dir/locale_base_api/musl.h", From 949103b45c2f7f1fc28106b5db435660e466b804 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Wed, 16 Jul 2025 14:08:41 +0200 Subject: [PATCH 022/813] [SLP][NFC] Use range-based `for` in `matchAssociativeReduction` (#149029) --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 87de28044b2ae..5ec7b49e37079 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -22140,7 +22140,7 @@ class HorizontalReduction { // Try to regroup reduced values so that it gets more profitable to try to // reduce them. Values are grouped by their value ids, instructions - by // instruction op id and/or alternate op id, plus do extra analysis for - // loads (grouping them by the distabce between pointers) and cmp + // loads (grouping them by the distance between pointers) and cmp // instructions (grouping them by the predicate). 
SmallMapVector< size_t, SmallMapVector, 2>, @@ -22207,10 +22207,9 @@ class HorizontalReduction { for (auto &PossibleReducedVals : PossibleReducedValsVect) { auto PossibleRedVals = PossibleReducedVals.second.takeVector(); SmallVector> PossibleRedValsVect; - for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end(); - It != E; ++It) { + for (auto &Slice : PossibleRedVals) { PossibleRedValsVect.emplace_back(); - auto RedValsVect = It->second.takeVector(); + auto RedValsVect = Slice.second.takeVector(); stable_sort(RedValsVect, llvm::less_second()); for (const std::pair &Data : RedValsVect) PossibleRedValsVect.back().append(Data.second, Data.first); @@ -22370,8 +22369,8 @@ class HorizontalReduction { SmallVector Candidates; Candidates.reserve(2 * OrigReducedVals.size()); DenseMap TrackedToOrig(2 * OrigReducedVals.size()); - for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) { - Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]); + for (Value *ReducedVal : OrigReducedVals) { + Value *RdxVal = TrackedVals.at(ReducedVal); // Check if the reduction value was not overriden by the extractelement // instruction because of the vectorization and exclude it, if it is not // compatible with other values. @@ -22382,7 +22381,7 @@ class HorizontalReduction { (S && !Inst)) continue; Candidates.push_back(RdxVal); - TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]); + TrackedToOrig.try_emplace(RdxVal, ReducedVal); } bool ShuffledExtracts = false; // Try to handle shuffled extractelements. 
From 7674566c9666e69d274b145ea4744c6ac13c1adb Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Wed, 16 Jul 2025 14:09:09 +0200 Subject: [PATCH 023/813] [SLP][NFC] Simplify `count_if` to `count` (#149072) --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5ec7b49e37079..6b307f7bac4f5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5540,8 +5540,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE, return std::max(Entries[I].front()->getVectorFactor(), Entries[I].back()->getVectorFactor()); }); - unsigned NumUndefs = - count_if(CurrentOrder, [&](unsigned Idx) { return Idx == NumScalars; }); + unsigned NumUndefs = count(CurrentOrder, NumScalars); if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2)) return std::nullopt; return std::move(CurrentOrder); From ade2f1023d716bbf1473aaaf46d10faac73b014f Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Wed, 16 Jul 2025 14:09:27 +0200 Subject: [PATCH 024/813] [SLP][NFCI] Don't trim indexes, reuse a variable (#149074) --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 6b307f7bac4f5..da6af353c709f 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -8622,11 +8622,10 @@ void BoUpSLP::tryToVectorizeGatheredLoads( State == LoadsState::CompressVectorize) return false; ConsecutiveNodesSize += VL.size(); - unsigned Start = std::distance(Slice.begin(), It); - unsigned Sz = Slice.size() - Start; + size_t Start = std::distance(Slice.begin(), It); + size_t Sz = Slice.size() - Start; return Sz < VL.size() || - 
Slice.slice(std::distance(Slice.begin(), It), - VL.size()) != VL; + Slice.slice(Start, VL.size()) != VL; })) continue; // Try to build long masked gather loads. From 5abdce4083ed37203e4893f8f7ef3890586b008a Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 16 Jul 2025 13:19:55 +0100 Subject: [PATCH 025/813] [LLVM][AArch64InstrInfo] Prevent fill folding when DstReg is SP. (#148885) We can remove subreg COPY instructions by filling directly into the COPY's destination register. However, this is only valid when the copy and fill have compatible register classes. Fixes https://github.com/llvm/llvm-project/issues/148659 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 6 +++--- llvm/test/CodeGen/AArch64/spill-fold.mir | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5420545cc3cec..cdb224d0cd09f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -6288,13 +6288,13 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( // LDRWui %0:sub_32, %stack.0 // if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) { - const TargetRegisterClass *FillRC; + const TargetRegisterClass *FillRC = nullptr; switch (DstMO.getSubReg()) { default: - FillRC = nullptr; break; case AArch64::sub_32: - FillRC = &AArch64::GPR32RegClass; + if (AArch64::GPR64RegClass.hasSubClassEq(getRegClass(DstReg))) + FillRC = &AArch64::GPR32RegClass; break; case AArch64::ssub: FillRC = &AArch64::FPR32RegClass; diff --git a/llvm/test/CodeGen/AArch64/spill-fold.mir b/llvm/test/CodeGen/AArch64/spill-fold.mir index 0149e4504bed2..9ea9ce53b68a8 100644 --- a/llvm/test/CodeGen/AArch64/spill-fold.mir +++ b/llvm/test/CodeGen/AArch64/spill-fold.mir @@ -10,6 +10,7 @@ define i64 @test_subreg_fill_fold() { ret i64 0 } define double @test_subreg_fill_fold2() { ret double 0.0 } define <4 x float> @test_subreg_fill_fold3() { 
ret <4 x float> undef } + define i64 @test_subreg_fill_fold4() { ret i64 0 } define i64 @test_nzcv_spill_fold() { ret i64 0 } ... --- @@ -121,6 +122,24 @@ body: | RET_ReallyLR implicit $s0 ... --- +# CHECK-LABEL: name: test_subreg_fill_fold4 +# Ensure the COPY is maintained when its result register class is not compatible +# with the fill load's. +name: test_subreg_fill_fold4 +registers: + - { id: 0, class: gpr32 } + - { id: 1, class: gpr64sp } +body: | + bb.0: + %0 = COPY $wzr + INLINEASM &nop, 1, 12, implicit-def dead $x0, 12, implicit-def dead $x1, 12, implicit-def dead $x2, 12, implicit-def dead $x3, 12, implicit-def dead $x4, 12, implicit-def dead $x5, 12, implicit-def dead $x6, 12, implicit-def dead $x7, 12, implicit-def dead $x8, 12, implicit-def dead $x9, 12, implicit-def dead $x10, 12, implicit-def dead $x11, 12, implicit-def dead $x12, 12, implicit-def dead $x13, 12, implicit-def dead $x14, 12, implicit-def dead $x15, 12, implicit-def dead $x16, 12, implicit-def dead $x17, 12, implicit-def dead $x18, 12, implicit-def dead $x19, 12, implicit-def dead $x20, 12, implicit-def dead $x21, 12, implicit-def dead $x22, 12, implicit-def dead $x23, 12, implicit-def dead $x24, 12, implicit-def dead $x25, 12, implicit-def dead $x26, 12, implicit-def dead $x27, 12, implicit-def dead $x28, 12, implicit-def dead $fp, 12, implicit-def dead $lr, 12, implicit-def $sp + ; CHECK: %2:gpr32 = LDRWui %stack.0, 0 :: (load (s32) from %stack.0) + ; CHECK: undef %1.sub_32:gpr64sp = COPY %2 + undef %1.sub_32:gpr64sp = COPY %0 + $x0 = COPY %1 + RET_ReallyLR implicit $x0 +... +--- # CHECK-LABEL: name: test_nzcv_spill_fold # Ensure that nzcv COPY cannot be folded. 
name: test_nzcv_spill_fold From 9e5470e7d6ea1ad4fe25a9416706d769e41a03c1 Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Wed, 16 Jul 2025 15:25:24 +0300 Subject: [PATCH 026/813] [Clang] Diagnose forming references to nullptr (#143667) Per [decl.ref], > Because a null pointer value or a pointer past the end of an object does not point to an object, a reference in a well-defined program cannot refer to such things. Note this does not fix the new bytecode interpreter. Fixes #48665 --- clang/docs/ReleaseNotes.rst | 3 +- .../include/clang/Basic/DiagnosticASTKinds.td | 12 +- clang/lib/AST/ByteCode/State.h | 1 + clang/lib/AST/ExprConstant.cpp | 104 +++++++++++++--- clang/test/AST/ByteCode/complex.cpp | 6 +- clang/test/AST/ByteCode/const-eval.c | 2 + clang/test/AST/ByteCode/cxx11.cpp | 4 +- clang/test/AST/ByteCode/records.cpp | 10 +- clang/test/CXX/drs/cwg14xx.cpp | 2 + clang/test/CXX/expr/expr.const/p2-0x.cpp | 10 +- clang/test/Sema/const-eval.c | 5 +- .../SemaCXX/constant-expression-cxx11.cpp | 4 +- .../SemaCXX/constant-expression-cxx14.cpp | 111 +++++++++++++++++- .../SemaCXX/constant-expression-cxx2a.cpp | 2 +- .../SemaCXX/constexpr-backtrace-limit.cpp | 4 +- .../range.zip/iterator/increment.pass.cpp | 4 +- 16 files changed, 238 insertions(+), 46 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 1eb3e369a302e..2f1705ba7db06 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -674,7 +674,7 @@ Improvements to Clang's diagnostics #GH142457, #GH139913, #GH138850, #GH137867, #GH137860, #GH107840, #GH93308, #GH69470, #GH59391, #GH58172, #GH46215, #GH45915, #GH45891, #GH44490, #GH36703, #GH32903, #GH23312, #GH69874. 
- + - Clang no longer emits a spurious -Wdangling-gsl warning in C++23 when iterating over an element of a temporary container in a range-based for loop.(#GH109793, #GH145164) @@ -970,6 +970,7 @@ Bug Fixes to C++ Support - Fixed a crash involving list-initialization of an empty class with a non-empty initializer list. (#GH147949) - Fixed constant evaluation of equality comparisons of constexpr-unknown references. (#GH147663) +- Diagnose binding a reference to ``*nullptr`` during constant evaluation. (#GH48665) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index a67b9995d3b54..071a38f513911 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -174,10 +174,11 @@ def note_constexpr_heap_alloc_limit_exceeded : Note< def note_constexpr_this : Note< "%select{|implicit }0use of 'this' pointer is only allowed within the " "evaluation of a call to a 'constexpr' member function">; -def access_kind : TextSubstitution< - "%select{read of|read of|assignment to|increment of|decrement of|" - "member call on|dynamic_cast of|typeid applied to|construction of|" - "destruction of|read of}0">; +def access_kind + : TextSubstitution< + "%select{read of|read of|assignment to|increment of|decrement of|" + "member call on|dynamic_cast of|typeid applied to|construction of|" + "destruction of|read of|read of}0">; def access_kind_subobject : TextSubstitution< "%select{read of|read of|assignment to|increment of|decrement of|" "member call on|dynamic_cast of|typeid applied to|" @@ -222,6 +223,9 @@ def note_constexpr_ltor_incomplete_type : Note< def note_constexpr_access_null : Note< "%sub{access_kind}0 " "dereferenced null pointer is not allowed in a constant expression">; +def note_constexpr_dereferencing_null + : Note<"dereferencing a null pointer is not allowed in a constant " + "expression">; def 
note_constexpr_access_past_end : Note< "%sub{access_kind}0 dereferenced one-past-the-end pointer " "is not allowed in a constant expression">; diff --git a/clang/lib/AST/ByteCode/State.h b/clang/lib/AST/ByteCode/State.h index 9a81fa6b7d220..6fc33222ac956 100644 --- a/clang/lib/AST/ByteCode/State.h +++ b/clang/lib/AST/ByteCode/State.h @@ -35,6 +35,7 @@ enum AccessKinds { AK_Construct, AK_Destroy, AK_IsWithinLifetime, + AK_Dereference }; /// The order of this enum is important for diagnostics. diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 1b33b6706e204..767cc4c3b19eb 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -1529,7 +1529,7 @@ CallStackFrame::~CallStackFrame() { static bool isRead(AccessKinds AK) { return AK == AK_Read || AK == AK_ReadObjectRepresentation || - AK == AK_IsWithinLifetime; + AK == AK_IsWithinLifetime || AK == AK_Dereference; } static bool isModification(AccessKinds AK) { @@ -1540,6 +1540,7 @@ static bool isModification(AccessKinds AK) { case AK_DynamicCast: case AK_TypeId: case AK_IsWithinLifetime: + case AK_Dereference: return false; case AK_Assign: case AK_Increment: @@ -1558,15 +1559,16 @@ static bool isAnyAccess(AccessKinds AK) { /// Is this an access per the C++ definition? static bool isFormalAccess(AccessKinds AK) { return isAnyAccess(AK) && AK != AK_Construct && AK != AK_Destroy && - AK != AK_IsWithinLifetime; + AK != AK_IsWithinLifetime && AK != AK_Dereference; } -/// Is this kind of axcess valid on an indeterminate object value? +/// Is this kind of access valid on an indeterminate object value? static bool isValidIndeterminateAccess(AccessKinds AK) { switch (AK) { case AK_Read: case AK_Increment: case AK_Decrement: + case AK_Dereference: // These need the object's value. 
return false; @@ -1733,7 +1735,10 @@ namespace { bool checkNullPointerForFoldAccess(EvalInfo &Info, const Expr *E, AccessKinds AK) { return checkNullPointerDiagnosingWith([&Info, E, AK] { - Info.FFDiag(E, diag::note_constexpr_access_null) << AK; + if (AK == AccessKinds::AK_Dereference) + Info.FFDiag(E, diag::note_constexpr_dereferencing_null); + else + Info.FFDiag(E, diag::note_constexpr_access_null) << AK; }); } @@ -4305,7 +4310,10 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, } if (!LVal.Base) { - Info.FFDiag(E, diag::note_constexpr_access_null) << AK; + if (AK == AccessKinds::AK_Dereference) + Info.FFDiag(E, diag::note_constexpr_dereferencing_null); + else + Info.FFDiag(E, diag::note_constexpr_access_null) << AK; return CompleteObject(); } @@ -4407,8 +4415,9 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, ConstexprVar = VD->isConstexpr(); // Unless we're looking at a local variable or argument in a constexpr call, - // the variable we're reading must be const. - if (!Frame) { + // the variable we're reading must be const (unless we are binding to a + // reference). + if (AK != clang::AK_Dereference && !Frame) { if (IsAccess && isa(VD)) { // Access of a parameter that's not associated with a frame isn't going // to work out, but we can leave it to evaluateVarDeclInit to provide a @@ -4472,12 +4481,16 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, } } - if (!evaluateVarDeclInit(Info, E, VD, Frame, LVal.getLValueVersion(), BaseVal)) + // When binding to a reference, the variable does not need to be constexpr + // or have constant initialization. + if (AK != clang::AK_Dereference && + !evaluateVarDeclInit(Info, E, VD, Frame, LVal.getLValueVersion(), + BaseVal)) return CompleteObject(); // If evaluateVarDeclInit sees a constexpr-unknown variable, it returns // a null BaseVal. Any constexpr-unknown variable seen here is an error: // we can't access a constexpr-unknown object. 
- if (!BaseVal) { + if (AK != clang::AK_Dereference && !BaseVal) { Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1) << AK << VD; Info.Note(VD->getLocation(), diag::note_declared_at); @@ -4491,7 +4504,10 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, } return CompleteObject(LVal.Base, &(*Alloc)->Value, LVal.Base.getDynamicAllocType()); - } else { + } + // When binding to a reference, the variable does not need to be + // within its lifetime. + else if (AK != clang::AK_Dereference) { const Expr *Base = LVal.Base.dyn_cast(); if (!Frame) { @@ -4572,7 +4588,7 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, NoteLValueLocation(Info, LVal.Base); return CompleteObject(); } - } else { + } else if (AK != clang::AK_Dereference) { BaseVal = Frame->getTemporary(Base, LVal.Base.getVersion()); assert(BaseVal && "missing value for temporary"); } @@ -5200,6 +5216,29 @@ enum EvalStmtResult { ESR_CaseNotFound }; } +/// Evaluates the initializer of a reference. +static bool EvaluateInitForDeclOfReferenceType(EvalInfo &Info, + const ValueDecl *D, + const Expr *Init, LValue &Result, + APValue &Val) { + assert(Init->isGLValue() && D->getType()->isReferenceType()); + // A reference is an lvalue. + if (!EvaluateLValue(Init, Result, Info)) + return false; + // [C++26][decl.ref] + // The object designated by such a glvalue can be outside its lifetime + // Because a null pointer value or a pointer past the end of an object + // does not point to an object, a reference in a well-defined program cannot + // refer to such things; + if (!Result.Designator.Invalid && Result.Designator.isOnePastTheEnd()) { + Info.FFDiag(Init, diag::note_constexpr_access_past_end) << AK_Dereference; + return false; + } + + // Save the result. 
+ Result.moveInto(Val); + return true; +} static bool EvaluateVarDecl(EvalInfo &Info, const VarDecl *VD) { if (VD->isInvalidDecl()) @@ -5221,7 +5260,11 @@ static bool EvaluateVarDecl(EvalInfo &Info, const VarDecl *VD) { if (InitE->isValueDependent()) return false; - if (!EvaluateInPlace(Val, Info, Result, InitE)) { + // For references to objects, check they do not designate a one-past-the-end + // object. + if (VD->getType()->isReferenceType()) { + return EvaluateInitForDeclOfReferenceType(Info, VD, InitE, Result, Val); + } else if (!EvaluateInPlace(Val, Info, Result, InitE)) { // Wipe out any partially-computed value, to allow tracking that this // evaluation failed. Val = APValue(); @@ -6851,9 +6894,18 @@ static bool HandleConstructorCall(const Expr *E, const LValue &This, ThisOverrideRAII ThisOverride(*Info.CurrentCall, &SubobjectParent, isa(Init)); FullExpressionRAII InitScope(Info); - if (!EvaluateInPlace(*Value, Info, Subobject, Init) || - (FD && FD->isBitField() && - !truncateBitfieldValue(Info, Init, *Value, FD))) { + if (FD && FD->getType()->isReferenceType() && + !FD->getType()->isFunctionReferenceType()) { + LValue Result; + if (!EvaluateInitForDeclOfReferenceType(Info, FD, Init, Result, + *Value)) { + if (!Info.noteFailure()) + return false; + Success = false; + } + } else if (!EvaluateInPlace(*Value, Info, Subobject, Init) || + (FD && FD->isBitField() && + !truncateBitfieldValue(Info, Init, *Value, FD))) { // If we're checking for a potential constant expression, evaluate all // initializers even if some of them fail. 
if (!Info.noteFailure()) @@ -9287,7 +9339,13 @@ bool LValueExprEvaluator::VisitArraySubscriptExpr(const ArraySubscriptExpr *E) { } bool LValueExprEvaluator::VisitUnaryDeref(const UnaryOperator *E) { - return evaluatePointer(E->getSubExpr(), Result); + bool Success = evaluatePointer(E->getSubExpr(), Result); + // [C++26][expr.unary.op] + // If the operand points to an object or function, the result + // denotes that object or function; otherwise, the behavior is undefined. + return Success && + (!E->getType().getNonReferenceType()->isObjectType() || + findCompleteObject(Info, E, AK_Dereference, Result, E->getType())); } bool LValueExprEvaluator::VisitUnaryReal(const UnaryOperator *E) { @@ -10906,9 +10964,17 @@ bool RecordExprEvaluator::VisitCXXParenListOrInitListExpr( isa(Init)); APValue &FieldVal = Result.getStructField(Field->getFieldIndex()); - if (!EvaluateInPlace(FieldVal, Info, Subobject, Init) || - (Field->isBitField() && !truncateBitfieldValue(Info, Init, - FieldVal, Field))) { + if (Field->getType()->isReferenceType()) { + LValue Result; + if (!EvaluateInitForDeclOfReferenceType(Info, Field, Init, Result, + FieldVal)) { + if (!Info.noteFailure()) + return false; + Success = false; + } + } else if (!EvaluateInPlace(FieldVal, Info, Subobject, Init) || + (Field->isBitField() && + !truncateBitfieldValue(Info, Init, FieldVal, Field))) { if (!Info.noteFailure()) return false; Success = false; diff --git a/clang/test/AST/ByteCode/complex.cpp b/clang/test/AST/ByteCode/complex.cpp index 2c0111c53d3bf..959d759005ef4 100644 --- a/clang/test/AST/ByteCode/complex.cpp +++ b/clang/test/AST/ByteCode/complex.cpp @@ -396,10 +396,10 @@ namespace ComplexConstexpr { // both-note {{cannot refer to element 3 of array of 2 elements}} constexpr _Complex float *p = 0; constexpr float pr = __real *p; // both-error {{constant expr}} \ - // ref-note {{cannot access real component of null}} \ - // expected-note {{read of dereferenced null pointer}} + // expected-note {{read of 
dereferenced null pointer}} \ + // ref-note {{dereferencing a null pointer}} constexpr float pi = __imag *p; // both-error {{constant expr}} \ - // ref-note {{cannot access imaginary component of null}} + // ref-note {{dereferencing a null pointer}} constexpr const _Complex double *q = &test3 + 1; constexpr double qr = __real *q; // ref-error {{constant expr}} \ // ref-note {{cannot access real component of pointer past the end}} diff --git a/clang/test/AST/ByteCode/const-eval.c b/clang/test/AST/ByteCode/const-eval.c index eab14c08ec809..c8651a744f969 100644 --- a/clang/test/AST/ByteCode/const-eval.c +++ b/clang/test/AST/ByteCode/const-eval.c @@ -51,6 +51,8 @@ struct s { }; EVAL_EXPR(19, ((int)&*(char*)10 == 10 ? 1 : -1)); +// ref-error@-1 {{expression is not an integer constant expression}} \ +// ref-note@-1 {{dereferencing a null pointer}} #ifndef NEW_INTERP EVAL_EXPR(20, __builtin_constant_p(*((int*) 10))); diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index b34e7823220e2..55554220b0a8a 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -39,7 +39,9 @@ struct S { constexpr S s = { 5 }; constexpr const int *p = &s.m + 1; -constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // ok +constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; +// ref-error@-1 {{constexpr variable 'np2' must be initialized by a constant expression}} \ +// ref-note@-1 {{dereferencing a null pointer is not allowed in a constant expression}} constexpr int preDec(int x) { // both-error {{never produces a constant expression}} return --x; // both-note {{subexpression}} diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index d369c64bc3904..774fed6189d64 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -413,7 +413,7 @@ namespace DeriveFailures { constexpr Derived(int i) : OtherVal(i) {} // ref-error {{never produces a constant expression}} \ 
// both-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} \ - // ref-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} + // ref-note {{non-constexpr constructor 'Base' cannot be used in a constant expression}} }; constexpr Derived D(12); // both-error {{must be initialized by a constant expression}} \ @@ -1660,9 +1660,11 @@ namespace NullptrCast { constexpr A *na = nullptr; constexpr B *nb = nullptr; constexpr A &ra = *nb; // both-error {{constant expression}} \ - // both-note {{cannot access base class of null pointer}} + // ref-note {{dereferencing a null pointer}} \ + // expected-note {{cannot access base class of null pointer}} constexpr B &rb = (B&)*na; // both-error {{constant expression}} \ - // both-note {{cannot access derived class of null pointer}} + // ref-note {{dereferencing a null pointer}} \ + // expected-note {{cannot access derived class of null pointer}} constexpr bool test() { auto a = (A*)(B*)nullptr; @@ -1740,7 +1742,7 @@ namespace CtorOfInvalidClass { #if __cplusplus >= 202002L template concept ReferenceOf = Q; - /// This calls a valid and constexpr copy constructor of InvalidCtor, + /// This calls a valid and constexpr copy constructor of InvalidCtor, /// but should still be rejected. 
template auto R, typename Rep> int F; // both-error {{non-type template argument is not a constant expression}} #endif diff --git a/clang/test/CXX/drs/cwg14xx.cpp b/clang/test/CXX/drs/cwg14xx.cpp index 17d5c2fc2e210..8d39018d8926c 100644 --- a/clang/test/CXX/drs/cwg14xx.cpp +++ b/clang/test/CXX/drs/cwg14xx.cpp @@ -107,6 +107,8 @@ void f() { constexpr int p = &*a; // since-cxx11-error@-1 {{cannot initialize a variable of type 'const int' with an rvalue of type 'A *'}} constexpr A *p2 = &*a; + // since-cxx11-error@-1 {{constexpr variable 'p2' must be initialized by a constant expression}} + // since-cxx11-note@-2 {{dereferencing a null pointer}} } struct A { diff --git a/clang/test/CXX/expr/expr.const/p2-0x.cpp b/clang/test/CXX/expr/expr.const/p2-0x.cpp index c6c3381be5523..910c8635f7353 100644 --- a/clang/test/CXX/expr/expr.const/p2-0x.cpp +++ b/clang/test/CXX/expr/expr.const/p2-0x.cpp @@ -199,15 +199,15 @@ namespace UndefinedBehavior { constexpr A *na = nullptr; constexpr B *nb = nullptr; - constexpr A &ra = *nb; // expected-error {{constant expression}} expected-note {{cannot access base class of null pointer}} - constexpr B &rb = (B&)*na; // expected-error {{constant expression}} expected-note {{cannot access derived class of null pointer}} + constexpr A &ra = *nb; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} + constexpr B &rb = (B&)*na; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} static_assert((A*)nb == 0, ""); static_assert((B*)na == 0, ""); constexpr const int &nf = nb->n; // expected-error {{constant expression}} expected-note {{cannot access field of null pointer}} constexpr const int &mf = nb->m; // expected-error {{constant expression}} expected-note {{cannot access field of null pointer}} constexpr const int *np1 = (int*)nullptr + 0; // ok - constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // ok - constexpr const int *np3 = &(*(int(*)[4])nullptr)[2]; // 
expected-error {{constant expression}} expected-note {{cannot perform pointer arithmetic on null pointer}} + constexpr const int *np2 = &(*(int(*)[4])nullptr)[0]; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} + constexpr const int *np3 = &(*(int(*)[4])nullptr)[2]; // expected-error {{constant expression}} expected-note {{dereferencing a null pointer}} struct C { constexpr int f() const { return 0; } @@ -485,7 +485,7 @@ namespace std { namespace TypeId { struct S { virtual void f(); }; constexpr S *p = 0; - constexpr const std::type_info &ti1 = typeid(*p); // expected-error {{must be initialized by a constant expression}} cxx11-note {{typeid applied to expression of polymorphic type 'S'}} cxx20-note {{dereferenced null pointer}} + constexpr const std::type_info &ti1 = typeid(*p); // expected-error {{must be initialized by a constant expression}} cxx11-note {{typeid applied to expression of polymorphic type 'S'}} cxx20-note {{dereferencing a null pointer}} struct T {} t; constexpr const std::type_info &ti2 = typeid(t); diff --git a/clang/test/Sema/const-eval.c b/clang/test/Sema/const-eval.c index e358aceaad5a4..87c21120e7c5d 100644 --- a/clang/test/Sema/const-eval.c +++ b/clang/test/Sema/const-eval.c @@ -32,7 +32,7 @@ void f(void) _Complex float g16 = (1.0f + 1.0fi); // ?: in constant expressions. -int g17[(3?:1) - 2]; +int g17[(3?:1) - 2]; EVAL_EXPR(18, ((int)((void*)10 + 10)) == 20 ? 1 : -1); @@ -41,6 +41,9 @@ struct s { }; EVAL_EXPR(19, ((int)&*(char*)10 == 10 ? 
1 : -1)); +// expected-error@-1 {{not an integer constant expression}} \ +// expected-note@-1 {{dereferencing a null pointer is not allowed in a constant expression}} + EVAL_EXPR(20, __builtin_constant_p(*((int*) 10))); diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp index c390fee1c38d9..5ecb8c607f59a 100644 --- a/clang/test/SemaCXX/constant-expression-cxx11.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp @@ -1413,8 +1413,8 @@ namespace ComplexConstexpr { static_assert(t2p[2] == 0.0, ""); // expected-error {{constant expr}} expected-note {{one-past-the-end pointer}} static_assert(t2p[3] == 0.0, ""); // expected-error {{constant expr}} expected-note {{cannot refer to element 3 of array of 2 elements}} constexpr _Complex float *p = 0; // expected-warning {{'_Complex' is a C99 extension}} - constexpr float pr = __real *p; // expected-error {{constant expr}} expected-note {{cannot access real component of null}} - constexpr float pi = __imag *p; // expected-error {{constant expr}} expected-note {{cannot access imaginary component of null}} + constexpr float pr = __real *p; // expected-error {{constant expr}} expected-note {{dereferencing a null pointer}} + constexpr float pi = __imag *p; // expected-error {{constant expr}} expected-note {{dereferencing a null pointer}} constexpr const _Complex double *q = &test3 + 1; // expected-warning {{'_Complex' is a C99 extension}} constexpr double qr = __real *q; // expected-error {{constant expr}} expected-note {{cannot access real component of pointer past the end}} constexpr double qi = __imag *q; // expected-error {{constant expr}} expected-note {{cannot access imaginary component of pointer past the end}} diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp index e93b98c185a82..182c0d01141ff 100644 --- a/clang/test/SemaCXX/constant-expression-cxx14.cpp +++ 
b/clang/test/SemaCXX/constant-expression-cxx14.cpp @@ -265,7 +265,7 @@ namespace const_modify { namespace null { constexpr int test(int *p) { - return *p = 123; // expected-note {{assignment to dereferenced null pointer}} + return *p = 123; // expected-note {{dereferencing a null pointer}} } static_assert(test(0), ""); // expected-error {{constant expression}} expected-note {{in call}} } @@ -1335,4 +1335,113 @@ namespace comparison_dead_variable { } // FIXME: This should fail. static_assert(f(),""); + +} +namespace GH48665 { +constexpr bool foo(int *i) { + int &j = *i; + // expected-note@-1 {{dereferencing a null pointer}} + return true; +} + +static_assert(foo(nullptr), ""); // expected-note {{in call to 'foo(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + +constexpr bool foo_rvalue(int *i) { + int &&j = (int&&)*i; + // expected-note@-1 {{dereferencing a null pointer}} + return true; +} +static_assert(foo_rvalue(nullptr), ""); // expected-note {{in call to 'foo_rvalue(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + +int arr[3]; // expected-note {{declared here}} +constexpr bool f() { // cxx14_20-error {{constexpr function never produces a constant expression}} + int &r = arr[3]; // expected-note {{read of dereferenced one-past-the-end pointer}} \ + // cxx14_20-note {{read of dereferenced one-past-the-end pointer}} \ + // expected-warning {{array index 3 is past the end of the array}} + return true; +} +static_assert(f(), ""); // expected-note {{in call to 'f()'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + + +struct Aggregate { + int &r; +}; +constexpr bool test_agg(int *i) { + Aggregate a{*i}; //expected-note {{dereferencing a null pointer}} + return true; +} +static_assert(test_agg(nullptr), ""); // expected-note {{in call to 'test_agg(nullptr)'}} +// expected-error@-1 {{static assertion 
expression is not an integral constant expression}} + +struct B { + constexpr B(int *p) : r{*p} {} // expected-note {{dereferencing a null pointer}} + int &r; +}; + +constexpr bool test_ctr(int *i) { + B b(i); // expected-note {{in call to 'B(nullptr)'}} + return true; +} + +static_assert(test_ctr(nullptr), ""); // expected-note {{in call to 'test_ctr(nullptr)'}} +// expected-error@-1 {{static assertion expression is not an integral constant expression}} + + +// verify that we can dereference function pointers +namespace functions { + +constexpr int f() {return 0;} +constexpr int(*f_ptr)() = &f; +constexpr int(*null_ptr)() = nullptr; + +constexpr int(&f_ref)() = f; +constexpr int test = (*f_ptr)(); +constexpr int test2 = (*f_ref)(); +constexpr int test3 = (*f_ref)(); +constexpr int test4 = (*null_ptr)(); +//expected-error@-1 {{constexpr variable 'test4' must be initialized by a constant expression}} \ +//expected-note@-1 {{'(*null_ptr)' evaluates to a null function pointer}} + +constexpr int(*f_ptr_arr[1])() = {&f}; +constexpr int test_array_ok = (f_ptr_arr[0])(); +constexpr int test_array_err = (f_ptr_arr[1])(); +// expected-error@-1 {{constexpr variable 'test_array_err' must be initialized by a constant expression}} \ +// expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} + +struct S { + int(*f_ptr)() = &f; + int(*f_ptr_arr[1])() = {&f}; + int(&f_ref)() = f; + int(*null_ptr)() = nullptr; +}; + +constexpr int test_member() { + S s {}; + (*s.f_ptr)(); + (*s.f_ref)(); + (s.f_ref)(); + (s.f_ptr_arr[0])(); + (s.f_ptr_arr[1])(); + // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} + return 0; +} +constexpr int test_member_null() { // cxx14_20-error {{never produces a constant expression}} + S s {}; + (*s.null_ptr)(); // expected-note {{'(*s.null_ptr)' evaluates to a null function pointer}} \ + // cxx14_20-note {{'(*s.null_ptr)' evaluates to a null function 
pointer}} + return 0; +} + +static_assert(test_member(), ""); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} \ +// expected-note@-1 {{in call to 'test_member()'}} + +static_assert(test_member_null(), ""); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} \ +// expected-note@-1 {{in call to 'test_member_null()'}} + +} } diff --git a/clang/test/SemaCXX/constant-expression-cxx2a.cpp b/clang/test/SemaCXX/constant-expression-cxx2a.cpp index 85720606fe9de..ffb7e633c2919 100644 --- a/clang/test/SemaCXX/constant-expression-cxx2a.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx2a.cpp @@ -927,7 +927,7 @@ namespace dynamic_alloc { constexpr void use_after_free() { // expected-error {{never produces a constant expression}} int *p = new int; delete p; - *p = 1; // expected-note {{assignment to heap allocated object that has been deleted}} + *p = 1; // expected-note {{read of heap allocated object that has been deleted}} } constexpr void use_after_free_2() { // expected-error {{never produces a constant expression}} struct X { constexpr void f() {} }; diff --git a/clang/test/SemaCXX/constexpr-backtrace-limit.cpp b/clang/test/SemaCXX/constexpr-backtrace-limit.cpp index e867afdff5c3c..f0c1206a4b8d3 100644 --- a/clang/test/SemaCXX/constexpr-backtrace-limit.cpp +++ b/clang/test/SemaCXX/constexpr-backtrace-limit.cpp @@ -15,14 +15,14 @@ // RUN: not %clang_cc1 -std=c++11 -fsyntax-only %s -fconstexpr-backtrace-limit=2 -fconstexpr-depth=8 -fno-caret-diagnostics 2>&1 | FileCheck %s -check-prefix=TEST3 // TEST3: constant expression -// TEST3-NEXT: reinterpret_cast +// TEST3-NEXT: dereferencing a null pointer // TEST3-NEXT: in call to 'recurse(0)' // TEST3-NEXT: skipping 4 calls // TEST3-NEXT: in call to 'recurse(5)' // RUN: not %clang_cc1 -std=c++11 -fsyntax-only %s -fconstexpr-backtrace-limit=8 -fconstexpr-depth=8 -fno-caret-diagnostics 2>&1 | FileCheck %s -check-prefix=TEST4 // TEST4: constant 
expression -// TEST4-NEXT: reinterpret_cast +// TEST4-NEXT: dereferencing a null pointer // TEST4-NEXT: in call to 'recurse(0)' // TEST4-NEXT: in call to 'recurse(1)' // TEST4-NEXT: in call to 'recurse(2)' diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/increment.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/increment.pass.cpp index 0ca8d92800feb..94d2bd47e9806 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/increment.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/increment.pass.cpp @@ -59,7 +59,7 @@ constexpr bool test() { { // bidi - int buffer[2] = {1, 2}; + int buffer[3] = {1, 2, 3}; std::ranges::zip_view v(BidiCommonView{buffer}); auto it = v.begin(); @@ -81,7 +81,7 @@ constexpr bool test() { { // forward - int buffer[2] = {1, 2}; + int buffer[3] = {1, 2, 3}; std::ranges::zip_view v(ForwardSizedView{buffer}); auto it = v.begin(); From 7392a546bb34c6fe569e30341138bb70115ca905 Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Wed, 16 Jul 2025 13:35:40 +0100 Subject: [PATCH 027/813] [InstCombine] Treat identical operands as one in pushFreezeToPreventPoisonFromPropagating (#145348) To push a freeze through an instruction, only one operand may produce poison. However, this currently fails for identical operands which are treated as separate. This patch fixes this by treating them as a single operand. 
--- .../InstCombine/InstructionCombining.cpp | 17 +++++++++-------- llvm/test/Transforms/InstCombine/freeze.ll | 11 +++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index b587d76465803..6de5422aeb084 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -4899,13 +4899,14 @@ InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) { // If operand is guaranteed not to be poison, there is no need to add freeze // to the operand. So we first find the operand that is not guaranteed to be // poison. - Use *MaybePoisonOperand = nullptr; - for (Use &U : OrigOpInst->operands()) { - if (isa(U.get()) || - isGuaranteedNotToBeUndefOrPoison(U.get())) + Value *MaybePoisonOperand = nullptr; + for (Value *V : OrigOpInst->operands()) { + if (isa(V) || isGuaranteedNotToBeUndefOrPoison(V) || + // Treat identical operands as a single operand. 
+ (MaybePoisonOperand && MaybePoisonOperand == V)) continue; if (!MaybePoisonOperand) - MaybePoisonOperand = &U; + MaybePoisonOperand = V; else return nullptr; } @@ -4917,10 +4918,10 @@ InstCombinerImpl::pushFreezeToPreventPoisonFromPropagating(FreezeInst &OrigFI) { return OrigOp; Builder.SetInsertPoint(OrigOpInst); - auto *FrozenMaybePoisonOperand = Builder.CreateFreeze( - MaybePoisonOperand->get(), MaybePoisonOperand->get()->getName() + ".fr"); + Value *FrozenMaybePoisonOperand = Builder.CreateFreeze( + MaybePoisonOperand, MaybePoisonOperand->getName() + ".fr"); - replaceUse(*MaybePoisonOperand, FrozenMaybePoisonOperand); + OrigOpInst->replaceUsesOfWith(MaybePoisonOperand, FrozenMaybePoisonOperand); return OrigOp; } diff --git a/llvm/test/Transforms/InstCombine/freeze.ll b/llvm/test/Transforms/InstCombine/freeze.ll index 9733f1b732c3f..3fedead2feab8 100644 --- a/llvm/test/Transforms/InstCombine/freeze.ll +++ b/llvm/test/Transforms/InstCombine/freeze.ll @@ -142,6 +142,17 @@ define i32 @early_freeze_test3(i32 %v1) { ret i32 %v4.fr } +define i32 @early_freeze_test4(i32 %v1) { +; CHECK-LABEL: @early_freeze_test4( +; CHECK-NEXT: [[V2_FR:%.*]] = freeze i32 [[V2:%.*]] +; CHECK-NEXT: [[V3:%.*]] = mul i32 [[V2_FR]], [[V2_FR]] +; CHECK-NEXT: ret i32 [[V3]] +; + %v2 = mul i32 %v1, %v1 + %v2.fr = freeze i32 %v2 + ret i32 %v2.fr +} + ; If replace all dominated uses of v to freeze(v). 
define void @freeze_dominated_uses_test1(i32 %v) { From 4cc9af219fbeece77e0d874d9eb4b479b6e475d1 Mon Sep 17 00:00:00 2001 From: Chaitanya Koparkar Date: Wed, 16 Jul 2025 08:37:12 -0400 Subject: [PATCH 028/813] [mlir][bufferization] Fix a typo in to_tensor op's summary field (#149082) Fixes #149081 --- mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td index f175b15c8770f..271b42025e0af 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td @@ -401,7 +401,7 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [ SameOperandsAndResultElementType, Bufferization_TensorAndBufferMatch<"result", "buffer"> ]> { - let summary = "create a buffer-like type from a tensor-like type"; + let summary = "create a tensor-like type from a buffer-like type"; let description = [{ An operation that creates a tensor from a buffer. The result value is a tensor-like type that must match the corresponding buffer-like operand as From c71b92d09f584d41ed8536791e6dac4fa079eedc Mon Sep 17 00:00:00 2001 From: Serge Pavlov Date: Wed, 16 Jul 2025 19:56:31 +0700 Subject: [PATCH 029/813] [RISCV][FPE] Remove unused variable (#149054) It was added by me in 905bb5bddb690765cab5416d55ab017d7c832eb3, which committed PR https://github.com/llvm/llvm-project/pull/148569. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index de830666d89b8..0cee5c87e999d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -14021,7 +14021,6 @@ SDValue RISCVTargetLowering::lowerGET_FPMODE(SDValue Op, SDLoc DL(Op); SDValue Chain = Op->getOperand(0); SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); - SDValue ModeMask = DAG.getConstant(ModeMaskValue, DL, XLenVT); SDVTList VTs = DAG.getVTList(XLenVT, MVT::Other); SDValue Result = DAG.getNode(RISCVISD::READ_CSR, DL, VTs, Chain, SysRegNo); Chain = Result.getValue(1); From 1742966c0df03d1f00a177a8d8a2a2afaec6938d Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Wed, 16 Jul 2025 13:59:20 +0100 Subject: [PATCH 030/813] [Flang] Force lowering to Complex for AMDGPU (#144927) --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 7 +++++-- flang/test/Lower/amdgcn-complex.f90 | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 flang/test/Lower/amdgcn-complex.f90 diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index d32c1fde59f27..8d0a511744e25 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1231,8 +1231,11 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc, llvm::StringRef mathLibFuncName = mathOp.runtimeFunc; if (!mathLibFuncName.empty()) { // If we enabled MLIR complex or can use approximate operations, we should - // NOT use libm. - if (!forceMlirComplex && !canUseApprox) { + // NOT use libm. Avoid libm when targeting AMDGPU as those symbols are not + // available on the device and we rely on MLIR complex operations to + // later map to OCML calls. 
+ bool isAMDGPU = fir::getTargetTriple(builder.getModule()).isAMDGCN(); + if (!forceMlirComplex && !canUseApprox && !isAMDGPU) { result = genLibCall(builder, loc, mathOp, mathLibFuncType, args); LLVM_DEBUG(result.dump(); llvm::dbgs() << "\n"); return result; diff --git a/flang/test/Lower/amdgcn-complex.f90 b/flang/test/Lower/amdgcn-complex.f90 new file mode 100644 index 0000000000000..f15c7db2b7316 --- /dev/null +++ b/flang/test/Lower/amdgcn-complex.f90 @@ -0,0 +1,21 @@ +! REQUIRES: amdgpu-registered-target +! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s + +subroutine cabsf_test(a, b) + complex :: a + real :: b + b = abs(a) +end subroutine + +! CHECK-LABEL: func @_QPcabsf_test( +! CHECK: complex.abs +! CHECK-NOT: fir.call @cabsf + +subroutine cexpf_test(a, b) + complex :: a, b + b = exp(a) +end subroutine + +! CHECK-LABEL: func @_QPcexpf_test( +! CHECK: complex.exp +! CHECK-NOT: fir.call @cexpf From fc114e4d931ae25f74a15e42371dbead1387ad51 Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Wed, 16 Jul 2025 13:59:41 +0100 Subject: [PATCH 031/813] [MLIR] Add ComplexTOROCDLLibraryCalls pass (#144926) --- flang/lib/Optimizer/CodeGen/CMakeLists.txt | 1 + flang/lib/Optimizer/CodeGen/CodeGen.cpp | 17 ++-- .../ComplexToROCDLLibraryCalls.h | 27 ++++++ mlir/include/mlir/Conversion/Passes.h | 1 + mlir/include/mlir/Conversion/Passes.td | 12 +++ mlir/lib/Conversion/CMakeLists.txt | 1 + .../ComplexToROCDLLibraryCalls/CMakeLists.txt | 18 ++++ .../ComplexToROCDLLibraryCalls.cpp | 92 +++++++++++++++++++ .../complex-to-rocdl-library-calls.mlir | 26 ++++++ 9 files changed, 188 insertions(+), 7 deletions(-) create mode 100644 mlir/include/mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h create mode 100644 mlir/lib/Conversion/ComplexToROCDLLibraryCalls/CMakeLists.txt create mode 100644 mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp create mode 100644 
mlir/test/Conversion/ComplexToROCDLLibraryCalls/complex-to-rocdl-library-calls.mlir diff --git a/flang/lib/Optimizer/CodeGen/CMakeLists.txt b/flang/lib/Optimizer/CodeGen/CMakeLists.txt index 980307db315d9..16c7944a885a1 100644 --- a/flang/lib/Optimizer/CodeGen/CMakeLists.txt +++ b/flang/lib/Optimizer/CodeGen/CMakeLists.txt @@ -34,6 +34,7 @@ add_flang_library(FIRCodeGen MLIR_LIBS MLIRComplexToLLVM + MLIRComplexToROCDLLibraryCalls MLIRComplexToStandard MLIRGPUDialect MLIRMathToFuncs diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index ecc04a6c9a2be..5ca53ee48955e 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -33,6 +33,7 @@ #include "mlir/Conversion/ArithCommon/AttrToLLVMConverter.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h" +#include "mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h" #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" @@ -4145,22 +4146,24 @@ class FIRToLLVMLowering // conversions that affect the ModuleOp, e.g. create new // function operations in it. We have to run such conversions // as passes here. - mlir::OpPassManager mathConvertionPM("builtin.module"); + mlir::OpPassManager mathConversionPM("builtin.module"); bool isAMDGCN = fir::getTargetTriple(mod).isAMDGCN(); // If compiling for AMD target some math operations must be lowered to AMD // GPU library calls, the rest can be converted to LLVM intrinsics, which // is handled in the mathToLLVM conversion. The lowering to libm calls is // not needed since all math operations are handled this way. 
- if (isAMDGCN) - mathConvertionPM.addPass(mlir::createConvertMathToROCDL()); + if (isAMDGCN) { + mathConversionPM.addPass(mlir::createConvertMathToROCDL()); + mathConversionPM.addPass(mlir::createConvertComplexToROCDLLibraryCalls()); + } // Convert math::FPowI operations to inline implementation // only if the exponent's width is greater than 32, otherwise, // it will be lowered to LLVM intrinsic operation by a later conversion. mlir::ConvertMathToFuncsOptions mathToFuncsOptions{}; mathToFuncsOptions.minWidthOfFPowIExponent = 33; - mathConvertionPM.addPass( + mathConversionPM.addPass( mlir::createConvertMathToFuncs(mathToFuncsOptions)); mlir::ConvertComplexToStandardPassOptions complexToStandardOptions{}; @@ -4173,15 +4176,15 @@ class FIRToLLVMLowering complexToStandardOptions.complexRange = mlir::complex::ComplexRangeFlags::improved; } - mathConvertionPM.addPass( + mathConversionPM.addPass( mlir::createConvertComplexToStandardPass(complexToStandardOptions)); // Convert Math dialect operations into LLVM dialect operations. // There is no way to prefer MathToLLVM patterns over MathToLibm // patterns (applied below), so we have to run MathToLLVM conversion here. - mathConvertionPM.addNestedPass( + mathConversionPM.addNestedPass( mlir::createConvertMathToLLVMPass()); - if (mlir::failed(runPipeline(mathConvertionPM, mod))) + if (mlir::failed(runPipeline(mathConversionPM, mod))) return signalPassFailure(); std::optional dl = diff --git a/mlir/include/mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h b/mlir/include/mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h new file mode 100644 index 0000000000000..daac2a99ed80f --- /dev/null +++ b/mlir/include/mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h @@ -0,0 +1,27 @@ +//===- ComplexToROCDLLibraryCalls.h - convert from Complex to ROCDL calls -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_COMPLEXTOROCDLLIBRARYCALLS_COMPLEXTOROCDLLIBRARYCALLS_H_ +#define MLIR_CONVERSION_COMPLEXTOROCDLLIBRARYCALLS_COMPLEXTOROCDLLIBRARYCALLS_H_ + +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +class RewritePatternSet; + +#define GEN_PASS_DECL_CONVERTCOMPLEXTOROCDLLIBRARYCALLS +#include "mlir/Conversion/Passes.h.inc" + +/// Populate the given list with patterns that convert from Complex to ROCDL +/// calls. +void populateComplexToROCDLLibraryCallsConversionPatterns( + RewritePatternSet &patterns); +} // namespace mlir + +#endif // MLIR_CONVERSION_COMPLEXTOROCDLLIBRARYCALLS_COMPLEXTOROCDLLIBRARYCALLS_H_ diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index 8a5976e547169..d93fbefab74aa 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -23,6 +23,7 @@ #include "mlir/Conversion/BufferizationToMemRef/BufferizationToMemRef.h" #include "mlir/Conversion/ComplexToLLVM/ComplexToLLVM.h" #include "mlir/Conversion/ComplexToLibm/ComplexToLibm.h" +#include "mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h" #include "mlir/Conversion/ComplexToSPIRV/ComplexToSPIRVPass.h" #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 50c67da91a4af..76e751243a12c 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -312,6 +312,18 @@ def ConvertComplexToLibm : Pass<"convert-complex-to-libm", "ModuleOp"> { let dependentDialects = ["func::FuncDialect"]; } 
+//===----------------------------------------------------------------------===// +// ComplexToROCDLLibraryCalls +//===----------------------------------------------------------------------===// + +def ConvertComplexToROCDLLibraryCalls : Pass<"convert-complex-to-rocdl-library-calls", "ModuleOp"> { + let summary = "Convert Complex dialect to ROCDL library calls"; + let description = [{ + This pass converts supported Complex ops to calls to the AMD device library. + }]; + let dependentDialects = ["func::FuncDialect"]; +} + //===----------------------------------------------------------------------===// // ComplexToSPIRV //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index 24a48993ad80c..f84375b6b8d6a 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -13,6 +13,7 @@ add_subdirectory(AsyncToLLVM) add_subdirectory(BufferizationToMemRef) add_subdirectory(ComplexCommon) add_subdirectory(ComplexToLibm) +add_subdirectory(ComplexToROCDLLibraryCalls) add_subdirectory(ComplexToLLVM) add_subdirectory(ComplexToSPIRV) add_subdirectory(ComplexToStandard) diff --git a/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/CMakeLists.txt b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/CMakeLists.txt new file mode 100644 index 0000000000000..695bb2dd0a82c --- /dev/null +++ b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/CMakeLists.txt @@ -0,0 +1,18 @@ +add_mlir_conversion_library(MLIRComplexToROCDLLibraryCalls + ComplexToROCDLLibraryCalls.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ComplexToROCDLLibraryCalls + + DEPENDS + MLIRConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRComplexDialect + MLIRFuncDialect + MLIRPass + MLIRTransformUtils + ) diff --git a/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp 
b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp new file mode 100644 index 0000000000000..99d5424aef79a --- /dev/null +++ b/mlir/lib/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.cpp @@ -0,0 +1,92 @@ +//=== ComplexToROCDLLibraryCalls.cpp - convert from Complex to ROCDL calls ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/ComplexToROCDLLibraryCalls/ComplexToROCDLLibraryCalls.h" +#include "mlir/Dialect/Complex/IR/Complex.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/DialectConversion.h" + +namespace mlir { +#define GEN_PASS_DEF_CONVERTCOMPLEXTOROCDLLIBRARYCALLS +#include "mlir/Conversion/Passes.h.inc" +} // namespace mlir + +using namespace mlir; + +namespace { + +template +// Pattern to convert Complex ops to ROCDL function calls. 
+struct ComplexOpToROCDLLibraryCalls : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + ComplexOpToROCDLLibraryCalls(MLIRContext *context, StringRef funcName, + PatternBenefit benefit = 1) + : OpRewritePattern(context, benefit), funcName(funcName) {} + + LogicalResult matchAndRewrite(Op op, PatternRewriter &rewriter) const final { + Operation *symTable = SymbolTable::getNearestSymbolTable(op); + Type resType = op.getType(); + if (auto complexType = dyn_cast(resType)) + resType = complexType.getElementType(); + if (!isa(resType)) + return failure(); + + auto opFunc = dyn_cast_or_null( + SymbolTable::lookupSymbolIn(symTable, funcName)); + if (!opFunc) { + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(&symTable->getRegion(0).front()); + auto funcTy = FunctionType::get( + rewriter.getContext(), op->getOperandTypes(), op->getResultTypes()); + opFunc = rewriter.create(rewriter.getUnknownLoc(), funcName, + funcTy); + opFunc.setPrivate(); + } + rewriter.replaceOpWithNewOp(op, funcName, op.getType(), + op->getOperands()); + return success(); + } + +private: + std::string funcName; +}; +} // namespace + +void mlir::populateComplexToROCDLLibraryCallsConversionPatterns( + RewritePatternSet &patterns) { + patterns.add>( + patterns.getContext(), "__ocml_cabs_f32"); + patterns.add>( + patterns.getContext(), "__ocml_cabs_f64"); + patterns.add>( + patterns.getContext(), "__ocml_cexp_f32"); + patterns.add>( + patterns.getContext(), "__ocml_cexp_f64"); +} + +namespace { +struct ConvertComplexToROCDLLibraryCallsPass + : public impl::ConvertComplexToROCDLLibraryCallsBase< + ConvertComplexToROCDLLibraryCallsPass> { + void runOnOperation() override; +}; +} // namespace + +void ConvertComplexToROCDLLibraryCallsPass::runOnOperation() { + Operation *op = getOperation(); + + RewritePatternSet patterns(&getContext()); + populateComplexToROCDLLibraryCallsConversionPatterns(patterns); + + ConversionTarget target(getContext()); + 
target.addLegalDialect(); + target.addIllegalOp(); + if (failed(applyPartialConversion(op, target, std::move(patterns)))) + signalPassFailure(); +} diff --git a/mlir/test/Conversion/ComplexToROCDLLibraryCalls/complex-to-rocdl-library-calls.mlir b/mlir/test/Conversion/ComplexToROCDLLibraryCalls/complex-to-rocdl-library-calls.mlir new file mode 100644 index 0000000000000..bae7c5986ef9e --- /dev/null +++ b/mlir/test/Conversion/ComplexToROCDLLibraryCalls/complex-to-rocdl-library-calls.mlir @@ -0,0 +1,26 @@ +// RUN: mlir-opt %s -convert-complex-to-rocdl-library-calls | FileCheck %s + +// CHECK-DAG: @__ocml_cabs_f32(complex) -> f32 +// CHECK-DAG: @__ocml_cabs_f64(complex) -> f64 +// CHECK-DAG: @__ocml_cexp_f32(complex) -> complex +// CHECK-DAG: @__ocml_cexp_f64(complex) -> complex + +//CHECK-LABEL: @abs_caller +func.func @abs_caller(%f: complex, %d: complex) -> (f32, f64) { + // CHECK: %[[RF:.*]] = call @__ocml_cabs_f32(%{{.*}}) + %rf = complex.abs %f : complex + // CHECK: %[[RD:.*]] = call @__ocml_cabs_f64(%{{.*}}) + %rd = complex.abs %d : complex + // CHECK: return %[[RF]], %[[RD]] + return %rf, %rd : f32, f64 +} + +//CHECK-LABEL: @exp_caller +func.func @exp_caller(%f: complex, %d: complex) -> (complex, complex) { + // CHECK: %[[EF:.*]] = call @__ocml_cexp_f32(%{{.*}}) + %ef = complex.exp %f : complex + // CHECK: %[[ED:.*]] = call @__ocml_cexp_f64(%{{.*}}) + %ed = complex.exp %d : complex + // CHECK: return %[[EF]], %[[ED]] + return %ef, %ed : complex, complex +} From dbb12109b947995d5882ca521f7b7716d6ac9ae4 Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Wed, 16 Jul 2025 14:00:06 +0100 Subject: [PATCH 032/813] [OpenMP] Add TargetAMDGPU support for Complex argument and return types (#144924) --- flang/lib/Optimizer/CodeGen/Target.cpp | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/Target.cpp b/flang/lib/Optimizer/CodeGen/Target.cpp index 7dbf21ce0c125..b60a72e4340b9 100644 --- 
a/flang/lib/Optimizer/CodeGen/Target.cpp +++ b/flang/lib/Optimizer/CodeGen/Target.cpp @@ -1443,14 +1443,35 @@ struct TargetAMDGPU : public GenericTarget { CodeGenSpecifics::Marshalling complexArgumentType(mlir::Location loc, mlir::Type eleTy) const override { CodeGenSpecifics::Marshalling marshal; - TODO(loc, "handle complex argument types"); + const auto *sem = &floatToSemantics(kindMap, eleTy); + if (sem == &llvm::APFloat::IEEEsingle()) { + // Lower COMPLEX(KIND=4) as an array of two element values. + marshal.emplace_back(fir::SequenceType::get({2}, eleTy), AT{}); + } else if (sem == &llvm::APFloat::IEEEdouble()) { + // Pass COMPLEX(KIND=8) as two separate arguments. + marshal.emplace_back(eleTy, AT{}); + marshal.emplace_back(eleTy, AT{}); + } else { + typeTodo(sem, loc, "argument"); + } return marshal; } CodeGenSpecifics::Marshalling complexReturnType(mlir::Location loc, mlir::Type eleTy) const override { CodeGenSpecifics::Marshalling marshal; - TODO(loc, "handle complex return types"); + const auto *sem = &floatToSemantics(kindMap, eleTy); + if (sem == &llvm::APFloat::IEEEsingle()) { + // Return COMPLEX(KIND=4) as an array of two elements. + marshal.emplace_back(fir::SequenceType::get({2}, eleTy), AT{}); + } else if (sem == &llvm::APFloat::IEEEdouble()) { + // Return COMPLEX(KIND=8) via an aggregate with two fields. + marshal.emplace_back(mlir::TupleType::get(eleTy.getContext(), + mlir::TypeRange{eleTy, eleTy}), + AT{}); + } else { + typeTodo(sem, loc, "return"); + } return marshal; } }; From 88a498c3b110b73c10362df8c18ca13fe1873744 Mon Sep 17 00:00:00 2001 From: nerix Date: Wed, 16 Jul 2025 15:00:18 +0200 Subject: [PATCH 033/813] [LLDB] Add formatters for MSVC STL std::(forward_)list (#148285) Adds synthetic providers for MSVC's `std::forward_list` and `std::list`. It refactors `LibCxxList` to be generic over the STL type (currently libc++ or MSVC STL). 
The libstdc++ synthetic providers use something similar in Python [here](https://github.com/llvm/llvm-project/blob/3092b765ba0b2d20bd716944dda86ea8e4ad12e3/lldb/examples/synthetic/gnu_libstdcpp.py#L134). Eventually, this could be ported to C++ as well. Towards #24834. --- .../Plugins/Language/CPlusPlus/CMakeLists.txt | 2 +- .../Language/CPlusPlus/CPlusPlusLanguage.cpp | 57 +++- .../{LibCxxList.cpp => GenericList.cpp} | 294 +++++++++++++++--- .../Plugins/Language/CPlusPlus/MsvcStl.h | 9 + .../TestDataFormatterGenericForwardList.py | 33 +- .../list/TestDataFormatterGenericList.py | 35 ++- .../loop/TestDataFormatterGenericListLoop.py | 18 +- .../generic/list/loop/main.cpp | 5 - 8 files changed, 355 insertions(+), 98 deletions(-) rename lldb/source/Plugins/Language/CPlusPlus/{LibCxxList.cpp => GenericList.cpp} (58%) diff --git a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt index 8ee6e2a246c55..5905d9b9a6d03 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/Language/CPlusPlus/CMakeLists.txt @@ -14,11 +14,11 @@ add_lldb_library(lldbPluginCPlusPlusLanguage PLUGIN CxxStringTypes.cpp Generic.cpp GenericBitset.cpp + GenericList.cpp GenericOptional.cpp LibCxx.cpp LibCxxAtomic.cpp LibCxxInitializerList.cpp - LibCxxList.cpp LibCxxMap.cpp LibCxxQueue.cpp LibCxxRangesRefView.cpp diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 4a3fdede84d32..a8ebde0b55815 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -1440,14 +1440,12 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { stl_deref_flags, "lldb.formatters.cpp.gnu_libstdcpp.StdUnorderedMapSynthProvider"))); cpp_category_sp->AddTypeSynthetic( - "^std::((__debug::)?|(__cxx11::)?)list<.+>(( )?&)?$", - 
eFormatterMatchRegex, + "^std::__(debug|cxx11)::list<.+>(( )?&)?$", eFormatterMatchRegex, SyntheticChildrenSP(new ScriptedSyntheticChildren( stl_deref_flags, "lldb.formatters.cpp.gnu_libstdcpp.StdListSynthProvider"))); cpp_category_sp->AddTypeSynthetic( - "^std::((__debug::)?|(__cxx11::)?)forward_list<.+>(( )?&)?$", - eFormatterMatchRegex, + "^std::__(debug|cxx11)::forward_list<.+>(( )?&)?$", eFormatterMatchRegex, SyntheticChildrenSP(new ScriptedSyntheticChildren( stl_synth_flags, "lldb.formatters.cpp.gnu_libstdcpp.StdForwardListSynthProvider"))); @@ -1501,15 +1499,13 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { "^std::(__debug::)?unordered_(multi)?(map|set)<.+> >$", stl_summary_flags, true); - AddCXXSummary(cpp_category_sp, - lldb_private::formatters::ContainerSizeSummaryProvider, - "libstdc++ std::list summary provider", - "^std::((__debug::)?|(__cxx11::)?)list<.+>(( )?&)?$", - stl_summary_flags, true); + AddCXXSummary( + cpp_category_sp, lldb_private::formatters::ContainerSizeSummaryProvider, + "libstdc++ debug std::list summary provider", + "^std::__(debug|cxx11)::list<.+>(( )?&)?$", stl_summary_flags, true); cpp_category_sp->AddTypeSummary( - "^std::((__debug::)?|(__cxx11::)?)forward_list<.+>(( )?&)?$", - eFormatterMatchRegex, + "^std::__(debug|cxx11)::forward_list<.+>(( )?&)?$", eFormatterMatchRegex, TypeSummaryImplSP(new ScriptSummaryFormat( stl_summary_flags, "lldb.formatters.cpp.gnu_libstdcpp.ForwardListSummaryProvider"))); @@ -1627,6 +1623,31 @@ GenericVectorSyntheticFrontEndCreator(CXXSyntheticChildren *children, "lldb.formatters.cpp.gnu_libstdcpp.StdVectorSynthProvider", *valobj_sp); } +static SyntheticChildrenFrontEnd * +GenericListSyntheticFrontEndCreator(CXXSyntheticChildren *children, + lldb::ValueObjectSP valobj_sp) { + if (!valobj_sp) + return nullptr; + + if (IsMsvcStlList(*valobj_sp)) + return MsvcStlListSyntheticFrontEndCreator(children, valobj_sp); + return new ScriptedSyntheticChildren::FrontEnd( + 
"lldb.formatters.cpp.gnu_libstdcpp.StdListSynthProvider", *valobj_sp); +} + +static SyntheticChildrenFrontEnd * +GenericForwardListSyntheticFrontEndCreator(CXXSyntheticChildren *children, + lldb::ValueObjectSP valobj_sp) { + if (!valobj_sp) + return nullptr; + + if (IsMsvcStlList(*valobj_sp)) + return MsvcStlForwardListSyntheticFrontEndCreator(children, valobj_sp); + return new ScriptedSyntheticChildren::FrontEnd( + "lldb.formatters.cpp.gnu_libstdcpp.StdForwardListSynthProvider", + *valobj_sp); +} + /// Load formatters that are formatting types from more than one STL static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { if (!cpp_category_sp) @@ -1685,6 +1706,12 @@ static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { AddCXXSynthetic(cpp_category_sp, GenericTupleSyntheticFrontEndCreator, "std::tuple synthetic children", "^std::tuple<.*>(( )?&)?$", stl_synth_flags, true); + AddCXXSynthetic(cpp_category_sp, GenericListSyntheticFrontEndCreator, + "std::list synthetic children", "^std::list<.+>(( )?&)?$", + stl_synth_flags, true); + AddCXXSynthetic(cpp_category_sp, GenericForwardListSyntheticFrontEndCreator, + "std::forward_list synthetic children", + "^std::forward_list<.+>(( )?&)?$", stl_synth_flags, true); AddCXXSummary(cpp_category_sp, GenericSmartPointerSummaryProvider, "MSVC STL/libstdc++ std::shared_ptr summary provider", @@ -1704,6 +1731,14 @@ static void LoadCommonStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { AddCXXSynthetic(cpp_category_sp, GenericVectorSyntheticFrontEndCreator, "MSVC/libstdc++ std::vector synthetic provider", "^std::vector<.+>(( )?&)?$", stl_synth_flags, true); + AddCXXSummary(cpp_category_sp, ContainerSizeSummaryProvider, + "MSVC STL/libstdc++ std::list summary provider", + "^std::list<.+>(( )?&)?$", stl_summary_flags, true); + cpp_category_sp->AddTypeSummary( + "^std::forward_list<.+>(( )?&)?$", eFormatterMatchRegex, + TypeSummaryImplSP(new ScriptSummaryFormat( + 
stl_summary_flags, + "lldb.formatters.cpp.gnu_libstdcpp.ForwardListSummaryProvider"))); } static void LoadMsvcStlFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp similarity index 58% rename from lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp rename to lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp index 826e6ab090e10..ea1edbfd3ac9b 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp @@ -1,4 +1,4 @@ -//===-- LibCxxList.cpp ----------------------------------------------------===// +//===-- GenericList.cpp ---------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,14 +7,11 @@ //===----------------------------------------------------------------------===// #include "LibCxx.h" +#include "MsvcStl.h" -#include "Plugins/TypeSystem/Clang/TypeSystemClang.h" #include "lldb/DataFormatters/FormattersHelpers.h" #include "lldb/Target/Target.h" -#include "lldb/Utility/DataBufferHeap.h" -#include "lldb/Utility/Endian.h" #include "lldb/Utility/Status.h" -#include "lldb/Utility/Stream.h" #include "lldb/ValueObject/ValueObject.h" #include "lldb/ValueObject/ValueObjectConstResult.h" #include "lldb/lldb-enumerations.h" @@ -25,31 +22,27 @@ using namespace lldb_private::formatters; namespace { -class ListEntry { +enum class StlType { + LibCxx, + MsvcStl, +}; + +template class ListEntry { public: ListEntry() = default; ListEntry(ValueObjectSP entry_sp) : m_entry_sp(std::move(entry_sp)) {} ListEntry(ValueObject *entry) : m_entry_sp(entry ? 
entry->GetSP() : ValueObjectSP()) {} - ListEntry next() { - if (!m_entry_sp) - return ListEntry(); - return ListEntry(m_entry_sp->GetChildMemberWithName("__next_")); - } - - ListEntry prev() { - if (!m_entry_sp) - return ListEntry(); - return ListEntry(m_entry_sp->GetChildMemberWithName("__prev_")); - } - uint64_t value() const { if (!m_entry_sp) return 0; return m_entry_sp->GetValueAsUnsigned(0); } + ListEntry next(); + ListEntry prev(); + bool null() { return (value() == 0); } explicit operator bool() { return GetEntry() && !null(); } @@ -66,10 +59,34 @@ class ListEntry { ValueObjectSP m_entry_sp; }; -class ListIterator { +template <> ListEntry ListEntry::next() { + if (!m_entry_sp) + return ListEntry(); + return ListEntry(m_entry_sp->GetChildMemberWithName("__next_")); +} + +template <> ListEntry ListEntry::prev() { + if (!m_entry_sp) + return ListEntry(); + return ListEntry(m_entry_sp->GetChildMemberWithName("__prev_")); +} + +template <> ListEntry ListEntry::next() { + if (!m_entry_sp) + return ListEntry(); + return ListEntry(m_entry_sp->GetChildMemberWithName("_Next")); +} + +template <> ListEntry ListEntry::prev() { + if (!m_entry_sp) + return ListEntry(); + return ListEntry(m_entry_sp->GetChildMemberWithName("_Prev")); +} + +template class ListIterator { public: ListIterator() = default; - ListIterator(ListEntry entry) : m_entry(std::move(entry)) {} + ListIterator(ListEntry entry) : m_entry(std::move(entry)) {} ListIterator(ValueObjectSP entry) : m_entry(std::move(entry)) {} ListIterator(ValueObject *entry) : m_entry(entry) {} @@ -101,9 +118,10 @@ class ListIterator { void prev() { m_entry = m_entry.prev(); } private: - ListEntry m_entry; + ListEntry m_entry; }; +template class AbstractListFrontEnd : public SyntheticChildrenFrontEnd { public: llvm::Expected GetIndexOfChildWithName(ConstString name) override { @@ -124,33 +142,31 @@ class AbstractListFrontEnd : public SyntheticChildrenFrontEnd { ValueObject *m_head = nullptr; static constexpr bool 
g_use_loop_detect = true; - size_t m_loop_detected = 0; // The number of elements that have had loop - // detection run over them. - ListEntry m_slow_runner; // Used for loop detection - ListEntry m_fast_runner; // Used for loop detection + size_t m_loop_detected = 0; // The number of elements that have had loop + // detection run over them. + ListEntry m_slow_runner; // Used for loop detection + ListEntry m_fast_runner; // Used for loop detection size_t m_list_capping_size = 0; CompilerType m_element_type; - std::map m_iterators; + std::map> m_iterators; bool HasLoop(size_t count); ValueObjectSP GetItem(size_t idx); }; -class ForwardListFrontEnd : public AbstractListFrontEnd { +class LibCxxForwardListFrontEnd : public AbstractListFrontEnd { public: - ForwardListFrontEnd(ValueObject &valobj); + LibCxxForwardListFrontEnd(ValueObject &valobj); llvm::Expected CalculateNumChildren() override; ValueObjectSP GetChildAtIndex(uint32_t idx) override; lldb::ChildCacheState Update() override; }; -class ListFrontEnd : public AbstractListFrontEnd { +class LibCxxListFrontEnd : public AbstractListFrontEnd { public: - ListFrontEnd(lldb::ValueObjectSP valobj_sp); - - ~ListFrontEnd() override = default; + LibCxxListFrontEnd(lldb::ValueObjectSP valobj_sp); llvm::Expected CalculateNumChildren() override; @@ -163,9 +179,34 @@ class ListFrontEnd : public AbstractListFrontEnd { ValueObject *m_tail = nullptr; }; +class MsvcStlForwardListFrontEnd + : public AbstractListFrontEnd { +public: + MsvcStlForwardListFrontEnd(ValueObject &valobj); + + llvm::Expected CalculateNumChildren() override; + ValueObjectSP GetChildAtIndex(uint32_t idx) override; + lldb::ChildCacheState Update() override; +}; + +class MsvcStlListFrontEnd : public AbstractListFrontEnd { +public: + MsvcStlListFrontEnd(lldb::ValueObjectSP valobj_sp); + + llvm::Expected CalculateNumChildren() override; + + lldb::ValueObjectSP GetChildAtIndex(uint32_t idx) override; + + lldb::ChildCacheState Update() override; + +private: + 
ValueObject *m_tail = nullptr; +}; + } // end anonymous namespace -lldb::ChildCacheState AbstractListFrontEnd::Update() { +template +lldb::ChildCacheState AbstractListFrontEnd::Update() { m_loop_detected = 0; m_count = UINT32_MAX; m_head = nullptr; @@ -191,7 +232,7 @@ lldb::ChildCacheState AbstractListFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } -bool AbstractListFrontEnd::HasLoop(size_t count) { +template bool AbstractListFrontEnd::HasLoop(size_t count) { if (!g_use_loop_detect) return false; // don't bother checking for a loop if we won't actually need to jump nodes @@ -201,7 +242,7 @@ bool AbstractListFrontEnd::HasLoop(size_t count) { if (m_loop_detected == 0) { // This is the first time we are being run (after the last update). Set up // the loop invariant for the first element. - m_slow_runner = ListEntry(m_head).next(); + m_slow_runner = ListEntry(m_head).next(); m_fast_runner = m_slow_runner.next(); m_loop_detected = 1; } @@ -225,9 +266,10 @@ bool AbstractListFrontEnd::HasLoop(size_t count) { return m_slow_runner == m_fast_runner; } -ValueObjectSP AbstractListFrontEnd::GetItem(size_t idx) { +template +ValueObjectSP AbstractListFrontEnd::GetItem(size_t idx) { size_t advance = idx; - ListIterator current(m_head); + ListIterator current(m_head); if (idx > 0) { auto cached_iterator = m_iterators.find(idx - 1); if (cached_iterator != m_iterators.end()) { @@ -240,16 +282,16 @@ ValueObjectSP AbstractListFrontEnd::GetItem(size_t idx) { return value_sp; } -ForwardListFrontEnd::ForwardListFrontEnd(ValueObject &valobj) +LibCxxForwardListFrontEnd::LibCxxForwardListFrontEnd(ValueObject &valobj) : AbstractListFrontEnd(valobj) { Update(); } -llvm::Expected ForwardListFrontEnd::CalculateNumChildren() { +llvm::Expected LibCxxForwardListFrontEnd::CalculateNumChildren() { if (m_count != UINT32_MAX) return m_count; - ListEntry current(m_head); + ListEntry current(m_head); m_count = 0; while (current && m_count < m_list_capping_size) { ++m_count; @@ -258,7 
+300,7 @@ llvm::Expected ForwardListFrontEnd::CalculateNumChildren() { return m_count; } -ValueObjectSP ForwardListFrontEnd::GetChildAtIndex(uint32_t idx) { +ValueObjectSP LibCxxForwardListFrontEnd::GetChildAtIndex(uint32_t idx) { if (idx >= CalculateNumChildrenIgnoringErrors()) return nullptr; @@ -289,7 +331,7 @@ ValueObjectSP ForwardListFrontEnd::GetChildAtIndex(uint32_t idx) { m_element_type); } -lldb::ChildCacheState ForwardListFrontEnd::Update() { +lldb::ChildCacheState LibCxxForwardListFrontEnd::Update() { AbstractListFrontEnd::Update(); Status err; @@ -312,13 +354,13 @@ lldb::ChildCacheState ForwardListFrontEnd::Update() { return ChildCacheState::eRefetch; } -ListFrontEnd::ListFrontEnd(lldb::ValueObjectSP valobj_sp) +LibCxxListFrontEnd::LibCxxListFrontEnd(lldb::ValueObjectSP valobj_sp) : AbstractListFrontEnd(*valobj_sp) { if (valobj_sp) Update(); } -llvm::Expected ListFrontEnd::CalculateNumChildren() { +llvm::Expected LibCxxListFrontEnd::CalculateNumChildren() { if (m_count != UINT32_MAX) return m_count; if (!m_head || !m_tail || m_node_address == 0) @@ -351,7 +393,7 @@ llvm::Expected ListFrontEnd::CalculateNumChildren() { if (next_val == prev_val) return 1; uint64_t size = 2; - ListEntry current(m_head); + ListEntry current(m_head); while (current.next() && current.next().value() != m_node_address) { size++; current = current.next(); @@ -361,7 +403,7 @@ llvm::Expected ListFrontEnd::CalculateNumChildren() { return m_count = (size - 1); } -lldb::ValueObjectSP ListFrontEnd::GetChildAtIndex(uint32_t idx) { +lldb::ValueObjectSP LibCxxListFrontEnd::GetChildAtIndex(uint32_t idx) { static ConstString g_value("__value_"); static ConstString g_next("__next_"); @@ -412,7 +454,7 @@ lldb::ValueObjectSP ListFrontEnd::GetChildAtIndex(uint32_t idx) { m_element_type); } -lldb::ChildCacheState ListFrontEnd::Update() { +lldb::ChildCacheState LibCxxListFrontEnd::Update() { AbstractListFrontEnd::Update(); m_tail = nullptr; m_node_address = 0; @@ -432,13 +474,167 @@ 
lldb::ChildCacheState ListFrontEnd::Update() { return lldb::ChildCacheState::eRefetch; } +MsvcStlForwardListFrontEnd::MsvcStlForwardListFrontEnd(ValueObject &valobj) + : AbstractListFrontEnd(valobj) { + Update(); +} + +llvm::Expected MsvcStlForwardListFrontEnd::CalculateNumChildren() { + if (m_count != UINT32_MAX) + return m_count; + + ListEntry current(m_head); + m_count = 0; + while (current && m_count < m_list_capping_size) { + ++m_count; + current = current.next(); + } + return m_count; +} + +ValueObjectSP MsvcStlForwardListFrontEnd::GetChildAtIndex(uint32_t idx) { + if (idx >= CalculateNumChildrenIgnoringErrors()) + return nullptr; + + if (!m_head) + return nullptr; + + if (HasLoop(idx + 1)) + return nullptr; + + ValueObjectSP current_sp = GetItem(idx); + if (!current_sp) + return nullptr; + + current_sp = current_sp->GetChildAtIndex(1); // get the _Myval child + if (!current_sp) + return nullptr; + + // we need to copy current_sp into a new object otherwise we will end up with + // all items named _Myval + DataExtractor data; + Status error; + current_sp->GetData(data, error); + if (error.Fail()) + return nullptr; + + return CreateValueObjectFromData(llvm::formatv("[{0}]", idx).str(), data, + m_backend.GetExecutionContextRef(), + m_element_type); +} + +lldb::ChildCacheState MsvcStlForwardListFrontEnd::Update() { + AbstractListFrontEnd::Update(); + + if (auto head_sp = + m_backend.GetChildAtNamePath({"_Mypair", "_Myval2", "_Myhead"})) + m_head = head_sp.get(); + + return ChildCacheState::eRefetch; +} + +MsvcStlListFrontEnd::MsvcStlListFrontEnd(lldb::ValueObjectSP valobj_sp) + : AbstractListFrontEnd(*valobj_sp) { + if (valobj_sp) + Update(); +} + +llvm::Expected MsvcStlListFrontEnd::CalculateNumChildren() { + if (m_count != UINT32_MAX) + return m_count; + if (!m_head || !m_tail) + return 0; + + auto size_sp = + m_backend.GetChildAtNamePath({"_Mypair", "_Myval2", "_Mysize"}); + if (!size_sp) + return llvm::createStringError("Failed to resolve size."); + + 
m_count = size_sp->GetValueAsUnsigned(UINT32_MAX); + if (m_count == UINT32_MAX) + return llvm::createStringError("Failed to read size value."); + + return m_count; +} + +lldb::ValueObjectSP MsvcStlListFrontEnd::GetChildAtIndex(uint32_t idx) { + if (idx >= CalculateNumChildrenIgnoringErrors()) + return lldb::ValueObjectSP(); + + if (!m_head || !m_tail) + return lldb::ValueObjectSP(); + + if (HasLoop(idx + 1)) + return lldb::ValueObjectSP(); + + ValueObjectSP current_sp = GetItem(idx); + if (!current_sp) + return lldb::ValueObjectSP(); + + current_sp = current_sp->GetChildAtIndex(2); // get the _Myval child + if (!current_sp) + return lldb::ValueObjectSP(); + + // we need to copy current_sp into a new object otherwise we will end up with + // all items named _Myval + DataExtractor data; + Status error; + current_sp->GetData(data, error); + if (error.Fail()) + return lldb::ValueObjectSP(); + + StreamString name; + name.Printf("[%" PRIu64 "]", (uint64_t)idx); + return CreateValueObjectFromData(name.GetString(), data, + m_backend.GetExecutionContextRef(), + m_element_type); +} + +lldb::ChildCacheState MsvcStlListFrontEnd::Update() { + AbstractListFrontEnd::Update(); + m_tail = nullptr; + m_head = nullptr; + + ValueObjectSP last = + m_backend.GetChildAtNamePath({"_Mypair", "_Myval2", "_Myhead"}); + if (!last) + return lldb::ChildCacheState::eRefetch; + ValueObjectSP first = last->GetChildMemberWithName("_Next"); + if (!first) + return lldb::ChildCacheState::eRefetch; + + m_head = first.get(); + m_tail = last.get(); + + return lldb::ChildCacheState::eRefetch; +} + SyntheticChildrenFrontEnd *formatters::LibcxxStdListSyntheticFrontEndCreator( CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) { - return (valobj_sp ? new ListFrontEnd(valobj_sp) : nullptr); + return (valobj_sp ? 
new LibCxxListFrontEnd(valobj_sp) : nullptr); } SyntheticChildrenFrontEnd * formatters::LibcxxStdForwardListSyntheticFrontEndCreator( CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) { - return valobj_sp ? new ForwardListFrontEnd(*valobj_sp) : nullptr; + return valobj_sp ? new LibCxxForwardListFrontEnd(*valobj_sp) : nullptr; +} + +bool formatters::IsMsvcStlList(ValueObject &valobj) { + if (auto valobj_sp = valobj.GetNonSyntheticValue()) + return valobj_sp->GetChildMemberWithName("_Mypair") != nullptr; + + return false; +} + +SyntheticChildrenFrontEnd * +formatters::MsvcStlListSyntheticFrontEndCreator(CXXSyntheticChildren *, + lldb::ValueObjectSP valobj_sp) { + return (valobj_sp ? new MsvcStlListFrontEnd(valobj_sp) : nullptr); +} + +SyntheticChildrenFrontEnd * +formatters::MsvcStlForwardListSyntheticFrontEndCreator( + CXXSyntheticChildren *, lldb::ValueObjectSP valobj_sp) { + return valobj_sp ? new MsvcStlForwardListFrontEnd(*valobj_sp) : nullptr; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h b/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h index 81397851b6010..0f3db4b50eeaf 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h +++ b/lldb/source/Plugins/Language/CPlusPlus/MsvcStl.h @@ -56,6 +56,15 @@ bool IsMsvcStlVector(ValueObject &valobj); lldb_private::SyntheticChildrenFrontEnd * MsvcStlVectorSyntheticFrontEndCreator(lldb::ValueObjectSP valobj_sp); +// MSVC STL std::list and std::forward_list +bool IsMsvcStlList(ValueObject &valobj); +SyntheticChildrenFrontEnd * +MsvcStlForwardListSyntheticFrontEndCreator(CXXSyntheticChildren *, + lldb::ValueObjectSP valobj_sp); +SyntheticChildrenFrontEnd * +MsvcStlListSyntheticFrontEndCreator(CXXSyntheticChildren *, + lldb::ValueObjectSP valobj_sp); + } // namespace formatters } // namespace lldb_private diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py 
b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py index f63f8fe1d6a62..45695c43b42a9 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py @@ -7,9 +7,6 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil -USE_LIBSTDCPP = "USE_LIBSTDCPP" -USE_LIBCPP = "USE_LIBCPP" - class TestDataFormatterGenericForwardList(TestBase): def setUp(self): @@ -17,9 +14,8 @@ def setUp(self): self.line = line_number("main.cpp", "// break here") self.namespace = "std" - def do_test(self, stdlib_type): + def do_test(self): """Test that std::forward_list is displayed correctly""" - self.build(dictionary={stdlib_type: "1"}) lldbutil.run_to_source_breakpoint( self, "// break here", lldb.SBFileSpec("main.cpp", False) ) @@ -76,10 +72,8 @@ def do_test(self, stdlib_type): substrs=["size=24", "[0]", "[1]", "[2]", "..."], ) - def do_test_ptr_and_ref(self, stdlib_type): + def do_test_ptr_and_ref(self): """Test that ref and ptr to std::forward_list is displayed correctly""" - self.build(dictionary={stdlib_type: "1"}) - (_, process, _, bkpt) = lldbutil.run_to_source_breakpoint( self, "Check ref and ptr", lldb.SBFileSpec("main.cpp", False) ) @@ -158,16 +152,31 @@ def do_test_ptr_and_ref(self, stdlib_type): @add_test_categories(["libstdcxx"]) def test_libstdcpp(self): - self.do_test(USE_LIBSTDCPP) + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test() @add_test_categories(["libstdcxx"]) def test_ptr_and_ref_libstdcpp(self): - self.do_test_ptr_and_ref(USE_LIBSTDCPP) + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test_ptr_and_ref() @add_test_categories(["libc++"]) def test_libcpp(self): - self.do_test(USE_LIBCPP) + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test() 
@add_test_categories(["libc++"]) def test_ptr_and_ref_libcpp(self): - self.do_test_ptr_and_ref(USE_LIBCPP) + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test_ptr_and_ref() + + @add_test_categories(["msvcstl"]) + def test_msvcstl(self): + # No flags, because the "msvcstl" category checks that the MSVC STL is used by default. + self.build() + self.do_test() + + @add_test_categories(["msvcstl"]) + def test_ptr_and_ref_msvcstl(self): + self.build() + self.do_test_ptr_and_ref() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py index 78c93b1e3caea..c0207e6ab5911 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py @@ -8,9 +8,6 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil -USE_LIBSTDCPP = "USE_LIBSTDCPP" -USE_LIBCPP = "USE_LIBCPP" - class GenericListDataFormatterTestCase(TestBase): def setUp(self): @@ -25,9 +22,8 @@ def setUp(self): "main.cpp", "// Set final break point at this line." 
) - def do_test_with_run_command(self, stdlib_type): + def do_test_with_run_command(self, *, is_libstdcpp=False): """Test that that file and class static variables display correctly.""" - self.build(dictionary={stdlib_type: "1"}) self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) lldbutil.run_break_set_by_file_and_line( @@ -62,7 +58,7 @@ def cleanup(): "frame variable numbers_list --raw", matching=False, substrs=["size=0"] ) - if stdlib_type == USE_LIBSTDCPP: + if is_libstdcpp: self.expect( "frame variable &numbers_list._M_impl._M_node --raw", matching=False, @@ -230,10 +226,8 @@ def cleanup(): "text_list.MightHaveChildren() says False for non empty!", ) - def do_test_ptr_and_ref(self, stdlib_type): + def do_test_ptr_and_ref(self): """Test that ref and ptr to std::list is displayed correctly""" - self.build(dictionary={stdlib_type: "1"}) - (_, process, _, bkpt) = lldbutil.run_to_source_breakpoint( self, "Check ref and ptr", lldb.SBFileSpec("main.cpp", False) ) @@ -302,16 +296,31 @@ def do_test_ptr_and_ref(self, stdlib_type): @add_test_categories(["libstdcxx"]) def test_with_run_command_libstdcpp(self): - self.do_test_with_run_command(USE_LIBSTDCPP) + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test_with_run_command(is_libstdcpp=True) @add_test_categories(["libstdcxx"]) def test_ptr_and_ref_libstdcpp(self): - self.do_test_ptr_and_ref(USE_LIBSTDCPP) + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test_ptr_and_ref() @add_test_categories(["libc++"]) def test_with_run_command_libcpp(self): - self.do_test_with_run_command(USE_LIBCPP) + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test_with_run_command() @add_test_categories(["libc++"]) def test_ptr_and_ref_libcpp(self): - self.do_test_ptr_and_ref(USE_LIBCPP) + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test_ptr_and_ref() + + @add_test_categories(["msvcstl"]) + def test_with_run_command_msvcstl(self): + # No flags, because the "msvcstl" category checks that the 
MSVC STL is used by default. + self.build() + self.do_test_with_run_command() + + @add_test_categories(["msvcstl"]) + def test_ptr_and_ref_msvcstl(self): + self.build() + self.do_test_ptr_and_ref() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py index 039c703491759..f6174dd786380 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py @@ -9,15 +9,11 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil -USE_LIBSTDCPP = "USE_LIBSTDCPP" -USE_LIBCPP = "USE_LIBCPP" - class GenericListDataFormatterTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True - def do_test_with_run_command(self, stdlib_type): - self.build(dictionary={stdlib_type: "1"}) + def do_test_with_run_command(self): exe = self.getBuildArtifact("a.out") target = self.dbg.CreateTarget(exe) self.assertTrue(target and target.IsValid(), "Target is valid") @@ -64,8 +60,16 @@ def do_test_with_run_command(self, stdlib_type): @add_test_categories(["libstdcxx"]) def test_with_run_command_libstdcpp(self): - self.do_test_with_run_command(USE_LIBSTDCPP) + self.build(dictionary={"USE_LIBSTDCPP": 1}) + self.do_test_with_run_command() @add_test_categories(["libc++"]) def test_with_run_command_libcpp(self): - self.do_test_with_run_command(USE_LIBCPP) + self.build(dictionary={"USE_LIBCPP": 1}) + self.do_test_with_run_command() + + @add_test_categories(["msvcstl"]) + def test_with_run_command_msvcstl(self): + # No flags, because the "msvcstl" category checks that the MSVC STL is used by default. 
+ self.build() + self.do_test_with_run_command() diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/main.cpp index e797b3d04dd6b..b31d4ca909ecb 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/main.cpp @@ -1,8 +1,3 @@ -// Evil hack: To simulate memory corruption, we want to fiddle with some internals of std::list. -// Make those accessible to us. -#define private public -#define protected public - #include #include #include From 9ea27b841cdf3c702b5e0bc696eb404492dbc79f Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 16 Jul 2025 13:03:44 +0000 Subject: [PATCH 034/813] [gn build] Port 88a498c3b110 --- .../secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn index 0c8e3aa664e38..b6b8f2f64caf8 100644 --- a/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Plugins/Language/CPlusPlus/BUILD.gn @@ -41,11 +41,11 @@ static_library("CPlusPlus") { "CxxStringTypes.cpp", "Generic.cpp", "GenericBitset.cpp", + "GenericList.cpp", "GenericOptional.cpp", "LibCxx.cpp", "LibCxxAtomic.cpp", "LibCxxInitializerList.cpp", - "LibCxxList.cpp", "LibCxxMap.cpp", "LibCxxProxyArray.cpp", "LibCxxQueue.cpp", From 5328c732a47705363cd289cb281cbd0d3ccbb8fc Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 16 Jul 2025 14:13:53 +0100 Subject: [PATCH 035/813] [DebugInfo] Strip more debug-intrinsic code from local utils (#149037) SROA and a few other facilities use generic-lambdas and some overloaded functions to deal with both intrinsics 
and debug-records at the same time. As part of stripping out intrinsic support, delete a swathe of this code from things in the Utils directory. This is a large diff, but is mostly about removing functions that were duplicated during the migration to debug records. I've taken a few opportunities to replace comments about "intrinsics" with "records", and replace generic lambdas with plain lambdas (I believe this makes it more readable). All of this is chipping away at intrinsic-specific code until we get to removing parts of findDbgUsers, which is the final boss -- we can't remove that until almost everything else is gone. --- llvm/include/llvm/Transforms/Utils/Local.h | 34 +- .../Transforms/Utils/MemoryTaggingSupport.h | 3 - .../InstCombine/InstructionCombining.cpp | 3 - llvm/lib/Transforms/Scalar/SROA.cpp | 107 +----- llvm/lib/Transforms/Utils/CloneFunction.cpp | 30 +- llvm/lib/Transforms/Utils/Local.cpp | 305 ++---------------- .../Transforms/Utils/MemoryTaggingSupport.cpp | 26 +- .../Utils/PromoteMemoryToRegister.cpp | 108 ++----- 8 files changed, 71 insertions(+), 545 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h index df146458b4e6f..bb79d2568fca0 100644 --- a/llvm/include/llvm/Transforms/Utils/Local.h +++ b/llvm/include/llvm/Transforms/Utils/Local.h @@ -36,7 +36,6 @@ class BasicBlock; class BranchInst; class CallBase; class CallInst; -class DbgVariableIntrinsic; class DIBuilder; class DomTreeUpdater; class Function; @@ -275,36 +274,23 @@ LLVM_ABI CallInst *changeToCall(InvokeInst *II, DomTreeUpdater *DTU = nullptr); LLVM_ABI void InsertDebugValueAtStoreLoc(DbgVariableRecord *DVR, StoreInst *SI, DIBuilder &Builder); -/// Creates and inserts an llvm.dbg.value intrinsic before a store -/// that has an associated llvm.dbg.value intrinsic. 
-LLVM_ABI void InsertDebugValueAtStoreLoc(DbgVariableIntrinsic *DII, - StoreInst *SI, DIBuilder &Builder); - -/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value -/// that has an associated llvm.dbg.declare intrinsic. -LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - StoreInst *SI, - DIBuilder &Builder); +/// Inserts a dbg.value record before a store to an alloca'd value +/// that has an associated dbg.declare record. LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, StoreInst *SI, DIBuilder &Builder); -/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value -/// that has an associated llvm.dbg.declare intrinsic. -LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - LoadInst *LI, DIBuilder &Builder); +/// Inserts a dbg.value record before a load of an alloca'd value +/// that has an associated dbg.declare record. LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, LoadInst *LI, DIBuilder &Builder); -/// Inserts a llvm.dbg.value intrinsic after a phi that has an associated -/// llvm.dbg.declare intrinsic. -LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - PHINode *LI, DIBuilder &Builder); +/// Inserts a dbg.value record after a phi that has an associated +/// llvm.dbg.declare record. LLVM_ABI void ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, PHINode *LI, DIBuilder &Builder); -/// Lowers llvm.dbg.declare intrinsics into appropriate set of -/// llvm.dbg.value intrinsics. +/// Lowers dbg.declare records into appropriate set of dbg.value records. LLVM_ABI bool LowerDbgDeclare(Function &F); /// Propagate dbg.value intrinsics through the newly inserted PHIs. 
@@ -312,7 +298,7 @@ LLVM_ABI void insertDebugValuesForPHIs(BasicBlock *BB, SmallVectorImpl &InsertedPHIs); -/// Replaces llvm.dbg.declare instruction when the address it +/// Replaces dbg.declare record when the address it /// describes is replaced with a new value. If Deref is true, an /// additional DW_OP_deref is prepended to the expression. If Offset /// is non-zero, a constant displacement is added to the expression @@ -321,10 +307,10 @@ LLVM_ABI bool replaceDbgDeclare(Value *Address, Value *NewAddress, DIBuilder &Builder, uint8_t DIExprFlags, int Offset); -/// Replaces multiple llvm.dbg.value instructions when the alloca it describes +/// Replaces multiple dbg.value records when the alloca it describes /// is replaced with a new value. If Offset is non-zero, a constant displacement /// is added to the expression (after the mandatory Deref). Offset can be -/// negative. New llvm.dbg.value instructions are inserted at the locations of +/// negative. New dbg.value records are inserted at the locations of /// the instructions they replace. LLVM_ABI void replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress, DIBuilder &Builder, int Offset = 0); diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h index 8b7daf616b110..f288bdfb84f49 100644 --- a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h +++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h @@ -23,7 +23,6 @@ namespace llvm { class DominatorTree; -class DbgVariableIntrinsic; class IntrinsicInst; class PostDominatorTree; class AllocaInst; @@ -53,8 +52,6 @@ struct AllocaInfo { AllocaInst *AI; SmallVector LifetimeStart; SmallVector LifetimeEnd; - SmallVector DbgVariableIntrinsics; - // Non-intrinsic records of variable locations. 
SmallVector DbgVariableRecords; }; diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 6de5422aeb084..684b9a1f90161 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -3644,9 +3644,6 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { ConstantInt::get(Type::getInt1Ty(C->getContext()), C->isFalseWhenEqual())); } else if (auto *SI = dyn_cast(I)) { - for (auto *DVI : DVIs) - if (DVI->isAddressOfVariable()) - ConvertDebugDeclareToDebugValue(DVI, SI, *DIB); for (auto *DVR : DVRs) if (DVR->isAddressOfVariable()) ConvertDebugDeclareToDebugValue(DVR, SI, *DIB); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 70b4552190a4e..23256cf2acbd2 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -315,18 +315,11 @@ calculateFragment(DILocalVariable *Variable, return UseFrag; } -static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) { - return DebugVariable(DVI->getVariable(), std::nullopt, - DVI->getDebugLoc().getInlinedAt()); -} static DebugVariable getAggregateVariable(DbgVariableRecord *DVR) { return DebugVariable(DVR->getVariable(), std::nullopt, DVR->getDebugLoc().getInlinedAt()); } -/// Helpers for handling new and old debug info modes in migrateDebugInfo. -/// These overloads unwrap a DbgInstPtr {Instruction* | DbgRecord*} union based -/// on the \p Unused parameter type. DbgVariableRecord *UnwrapDbgInstPtr(DbgInstPtr P, DbgVariableRecord *Unused) { (void)Unused; return static_cast(cast(P)); @@ -376,9 +369,6 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, /// Map of aggregate variables to their fragment associated with OldAlloca. 
DenseMap> BaseFragments; - for (auto *DAI : at::getAssignmentMarkers(OldAlloca)) - BaseFragments[getAggregateVariable(DAI)] = - DAI->getExpression()->getFragmentInfo(); for (auto *DVR : at::getDVRAssignmentMarkers(OldAlloca)) BaseFragments[getAggregateVariable(DVR)] = DVR->getExpression()->getFragmentInfo(); @@ -391,7 +381,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false); assert(OldAlloca->isStaticAlloca()); - auto MigrateDbgAssign = [&](auto *DbgAssign) { + auto MigrateDbgAssign = [&](DbgVariableRecord *DbgAssign) { LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign << "\n"); auto *Expr = DbgAssign->getExpression(); @@ -486,7 +476,6 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n"); }; - for_each(MarkerRange, MigrateDbgAssign); for_each(DVRAssignMarkerRange, MigrateDbgAssign); } @@ -5119,36 +5108,13 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, } // There isn't a shared interface to get the "address" parts out of a -// dbg.declare and dbg.assign, so provide some wrappers now for -// both debug intrinsics and records. -const Value *getAddress(const DbgVariableIntrinsic *DVI) { - if (const auto *DAI = dyn_cast(DVI)) - return DAI->getAddress(); - return cast(DVI)->getAddress(); -} - -const Value *getAddress(const DbgVariableRecord *DVR) { - return DVR->getAddress(); -} - -bool isKillAddress(const DbgVariableIntrinsic *DVI) { - if (const auto *DAI = dyn_cast(DVI)) - return DAI->isKillAddress(); - return cast(DVI)->isKillLocation(); -} - +// dbg.declare and dbg.assign, so provide some wrappers. 
bool isKillAddress(const DbgVariableRecord *DVR) { if (DVR->getType() == DbgVariableRecord::LocationType::Assign) return DVR->isKillAddress(); return DVR->isKillLocation(); } -const DIExpression *getAddressExpression(const DbgVariableIntrinsic *DVI) { - if (const auto *DAI = dyn_cast(DVI)) - return DAI->getAddressExpression(); - return cast(DVI)->getExpression(); -} - const DIExpression *getAddressExpression(const DbgVariableRecord *DVR) { if (DVR->getType() == DbgVariableRecord::LocationType::Assign) return DVR->getAddressExpression(); @@ -5236,66 +5202,6 @@ static DIExpression *createOrReplaceFragment(const DIExpression *Expr, return DIExpression::get(Expr->getContext(), Ops); } -/// Insert a new dbg.declare. -/// \p Orig Original to copy debug loc and variable from. -/// \p NewAddr Location's new base address. -/// \p NewAddrExpr New expression to apply to address. -/// \p BeforeInst Insert position. -/// \p NewFragment New fragment (absolute, non-relative). -/// \p BitExtractAdjustment Offset to apply to any extract_bits op. -static void -insertNewDbgInst(DIBuilder &DIB, DbgDeclareInst *Orig, AllocaInst *NewAddr, - DIExpression *NewAddrExpr, Instruction *BeforeInst, - std::optional NewFragment, - int64_t BitExtractAdjustment) { - if (NewFragment) - NewAddrExpr = createOrReplaceFragment(NewAddrExpr, *NewFragment, - BitExtractAdjustment); - if (!NewAddrExpr) - return; - - DIB.insertDeclare(NewAddr, Orig->getVariable(), NewAddrExpr, - Orig->getDebugLoc(), BeforeInst->getIterator()); -} - -/// Insert a new dbg.assign. -/// \p Orig Original to copy debug loc, variable, value and value expression -/// from. -/// \p NewAddr Location's new base address. -/// \p NewAddrExpr New expression to apply to address. -/// \p BeforeInst Insert position. -/// \p NewFragment New fragment (absolute, non-relative). -/// \p BitExtractAdjustment Offset to apply to any extract_bits op. 
-static void -insertNewDbgInst(DIBuilder &DIB, DbgAssignIntrinsic *Orig, AllocaInst *NewAddr, - DIExpression *NewAddrExpr, Instruction *BeforeInst, - std::optional NewFragment, - int64_t BitExtractAdjustment) { - // DIBuilder::insertDbgAssign will insert the #dbg_assign after NewAddr. - (void)BeforeInst; - - // A dbg.assign puts fragment info in the value expression only. The address - // expression has already been built: NewAddrExpr. - DIExpression *NewFragmentExpr = Orig->getExpression(); - if (NewFragment) - NewFragmentExpr = createOrReplaceFragment(NewFragmentExpr, *NewFragment, - BitExtractAdjustment); - if (!NewFragmentExpr) - return; - - // Apply a DIAssignID to the store if it doesn't already have it. - if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) { - NewAddr->setMetadata(LLVMContext::MD_DIAssignID, - DIAssignID::getDistinct(NewAddr->getContext())); - } - - Instruction *NewAssign = cast(DIB.insertDbgAssign( - NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr, - NewAddrExpr, Orig->getDebugLoc())); - LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign << "\n"); - (void)NewAssign; -} - /// Insert a new DbgRecord. /// \p Orig Original to copy record type, debug loc and variable from, and /// additionally value and value expression for dbg_assign records. @@ -5457,12 +5363,12 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { // Migrate debug information from the old alloca to the new alloca(s) // and the individual partitions. - auto MigrateOne = [&](auto *DbgVariable) { + auto MigrateOne = [&](DbgVariableRecord *DbgVariable) { // Can't overlap with undef memory. 
if (isKillAddress(DbgVariable)) return; - const Value *DbgPtr = getAddress(DbgVariable); + const Value *DbgPtr = DbgVariable->getAddress(); DIExpression::FragmentInfo VarFrag = DbgVariable->getFragmentOrEntireVariable(); // Get the address expression constant offset if one exists and the ops @@ -5543,7 +5449,6 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { if (SameVariableFragment(OldDII, DbgVariable)) OldDII->eraseFromParent(); }; - for_each(findDbgDeclares(Fragment.Alloca), RemoveOne); for_each(findDVRDeclares(Fragment.Alloca), RemoveOne); for_each(findDVRValues(Fragment.Alloca), RemoveOne); insertNewDbgInst(DIB, DbgVariable, Fragment.Alloca, NewExpr, &AI, @@ -5553,10 +5458,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { // Migrate debug information from the old alloca to the new alloca(s) // and the individual partitions. - for_each(findDbgDeclares(&AI), MigrateOne); for_each(findDVRDeclares(&AI), MigrateOne); for_each(findDVRValues(&AI), MigrateOne); - for_each(at::getAssignmentMarkers(&AI), MigrateOne); for_each(at::getDVRAssignmentMarkers(&AI), MigrateOne); return Changed; @@ -5777,8 +5680,6 @@ bool SROA::deleteDeadInstructions( // not be able to find it. if (AllocaInst *AI = dyn_cast(I)) { DeletedAllocas.insert(AI); - for (DbgDeclareInst *OldDII : findDbgDeclares(AI)) - OldDII->eraseFromParent(); for (DbgVariableRecord *OldDII : findDVRDeclares(AI)) OldDII->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index fccb73a36b182..b187208bc238c 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -576,9 +576,8 @@ void PruningFunctionCloner::CloneBlock( } // Eagerly remap operands to the newly cloned instruction, except for PHI - // nodes for which we defer processing until we update the CFG. Also defer - // debug intrinsic processing because they may contain use-before-defs. 
- if (!isa(NewInst) && !isa(NewInst)) { + // nodes for which we defer processing until we update the CFG. + if (!isa(NewInst)) { RemapInstruction(NewInst, VMap, ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges); @@ -733,15 +732,6 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, StartingInst = &StartingBB->front(); } - // Collect debug intrinsics for remapping later. - SmallVector DbgIntrinsics; - for (const auto &BB : *OldFunc) { - for (const auto &I : BB) { - if (const auto *DVI = dyn_cast(&I)) - DbgIntrinsics.push_back(DVI); - } - } - // Clone the entry block, and anything recursively reachable from it. std::vector CloneWorklist; PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist); @@ -899,21 +889,11 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Restore attributes. NewFunc->setAttributes(Attrs); - // Remap debug intrinsic operands now that all values have been mapped. - // Doing this now (late) preserves use-before-defs in debug intrinsics. If + // Remap debug records operands now that all values have been mapped. + // Doing this now (late) preserves use-before-defs in debug records. If // we didn't do this, ValueAsMetadata(use-before-def) operands would be // replaced by empty metadata. This would signal later cleanup passes to - // remove the debug intrinsics, potentially causing incorrect locations. - for (const auto *DVI : DbgIntrinsics) { - if (DbgVariableIntrinsic *NewDVI = - cast_or_null(VMap.lookup(DVI))) - RemapInstruction(NewDVI, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer); - } - - // Do the same for DbgVariableRecords, touching all the instructions in the - // cloned range of blocks. + // remove the debug records, potentially causing incorrect locations. 
Function::iterator Begin = cast(VMap[StartingBB])->getIterator(); for (BasicBlock &BB : make_range(Begin, NewFunc->end())) { for (Instruction &I : BB) { diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index ccdaca9b0e91c..72bc09431e9cb 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -428,10 +428,6 @@ bool llvm::wouldInstructionBeTriviallyDead(const Instruction *I, if (I->isEHPad()) return false; - // We don't want debug info removed by anything this general. - if (isa(I)) - return false; - if (const DbgLabelInst *DLI = dyn_cast(I)) { if (DLI->getLabel()) return false; @@ -1632,33 +1628,6 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar, /// describes an alloca'd variable, so we need to use the alloc size of the /// value when doing the comparison. E.g. an i1 value will be identified as /// covering an n-bit fragment, if the store size of i1 is at least n bits. -static bool valueCoversEntireFragment(Type *ValTy, DbgVariableIntrinsic *DII) { - const DataLayout &DL = DII->getDataLayout(); - TypeSize ValueSize = DL.getTypeAllocSizeInBits(ValTy); - if (std::optional FragmentSize = - DII->getExpression()->getActiveBits(DII->getVariable())) - return TypeSize::isKnownGE(ValueSize, TypeSize::getFixed(*FragmentSize)); - - // We can't always calculate the size of the DI variable (e.g. if it is a - // VLA). Try to use the size of the alloca that the dbg intrinsic describes - // instead. - if (DII->isAddressOfVariable()) { - // DII should have exactly 1 location when it is an address. - assert(DII->getNumVariableLocationOps() == 1 && - "address of variable must have exactly 1 location operand."); - if (auto *AI = - dyn_cast_or_null(DII->getVariableLocationOp(0))) { - if (std::optional FragmentSize = - AI->getAllocationSizeInBits(DL)) { - return TypeSize::isKnownGE(ValueSize, *FragmentSize); - } - } - } - // Could not determine size of variable. Conservatively return false. 
- return false; -} -// RemoveDIs: duplicate implementation of the above, using DbgVariableRecords, -// the replacement for dbg.values. static bool valueCoversEntireFragment(Type *ValTy, DbgVariableRecord *DVR) { const DataLayout &DL = DVR->getModule()->getDataLayout(); TypeSize ValueSize = DL.getTypeAllocSizeInBits(ValTy); @@ -1703,98 +1672,12 @@ static void insertDbgValueOrDbgVariableRecordAfter( insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, NextIt); } -/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value -/// that has an associated llvm.dbg.declare intrinsic. -void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - StoreInst *SI, DIBuilder &Builder) { - assert(DII->isAddressOfVariable() || isa(DII)); - auto *DIVar = DII->getVariable(); - assert(DIVar && "Missing variable"); - auto *DIExpr = DII->getExpression(); - Value *DV = SI->getValueOperand(); - - DebugLoc NewLoc = getDebugValueLoc(DII); - - // If the alloca describes the variable itself, i.e. the expression in the - // dbg.declare doesn't start with a dereference, we can perform the - // conversion if the value covers the entire fragment of DII. - // If the alloca describes the *address* of DIVar, i.e. DIExpr is - // *just* a DW_OP_deref, we use DV as is for the dbg.value. - // We conservatively ignore other dereferences, because the following two are - // not equivalent: - // dbg.declare(alloca, ..., !Expr(deref, plus_uconstant, 2)) - // dbg.value(DV, ..., !Expr(deref, plus_uconstant, 2)) - // The former is adding 2 to the address of the variable, whereas the latter - // is adding 2 to the value of the variable. As such, we insist on just a - // deref expression. 
- bool CanConvert = - DIExpr->isDeref() || (!DIExpr->startsWithDeref() && - valueCoversEntireFragment(DV->getType(), DII)); - if (CanConvert) { - insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, - SI->getIterator()); - return; - } - - // FIXME: If storing to a part of the variable described by the dbg.declare, - // then we want to insert a dbg.value for the corresponding fragment. - LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " << *DII - << '\n'); - // For now, when there is a store to parts of the variable (but we do not - // know which part) we insert an dbg.value intrinsic to indicate that we - // know nothing about the variable's content. - DV = PoisonValue::get(DV->getType()); - insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, - SI->getIterator()); -} - static DIExpression *dropInitialDeref(const DIExpression *DIExpr) { int NumEltDropped = DIExpr->getElements()[0] == dwarf::DW_OP_LLVM_arg ? 3 : 1; return DIExpression::get(DIExpr->getContext(), DIExpr->getElements().drop_front(NumEltDropped)); } -void llvm::InsertDebugValueAtStoreLoc(DbgVariableIntrinsic *DII, StoreInst *SI, - DIBuilder &Builder) { - auto *DIVar = DII->getVariable(); - assert(DIVar && "Missing variable"); - auto *DIExpr = DII->getExpression(); - DIExpr = dropInitialDeref(DIExpr); - Value *DV = SI->getValueOperand(); - - DebugLoc NewLoc = getDebugValueLoc(DII); - - insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, - SI->getIterator()); -} - -/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value -/// that has an associated llvm.dbg.declare intrinsic. 
-void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - LoadInst *LI, DIBuilder &Builder) { - auto *DIVar = DII->getVariable(); - auto *DIExpr = DII->getExpression(); - assert(DIVar && "Missing variable"); - - if (!valueCoversEntireFragment(LI->getType(), DII)) { - // FIXME: If only referring to a part of the variable described by the - // dbg.declare, then we want to insert a dbg.value for the corresponding - // fragment. - LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " - << *DII << '\n'); - return; - } - - DebugLoc NewLoc = getDebugValueLoc(DII); - - // We are now tracking the loaded value instead of the address. In the - // future if multi-location support is added to the IR, it might be - // preferable to keep tracking both the loaded value and the original - // address in case the alloca can not be elided. - insertDbgValueOrDbgVariableRecordAfter(Builder, LI, DIVar, DIExpr, NewLoc, - LI); -} - void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, StoreInst *SI, DIBuilder &Builder) { assert(DVR->isAddressOfVariable() || DVR->isDbgAssign()); @@ -1855,40 +1738,6 @@ void llvm::InsertDebugValueAtStoreLoc(DbgVariableRecord *DVR, StoreInst *SI, SI->getIterator()); } -/// Inserts a llvm.dbg.value intrinsic after a phi that has an associated -/// llvm.dbg.declare intrinsic. -void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, - PHINode *APN, DIBuilder &Builder) { - auto *DIVar = DII->getVariable(); - auto *DIExpr = DII->getExpression(); - assert(DIVar && "Missing variable"); - - if (PhiHasDebugValue(DIVar, DIExpr, APN)) - return; - - if (!valueCoversEntireFragment(APN->getType(), DII)) { - // FIXME: If only referring to a part of the variable described by the - // dbg.declare, then we want to insert a dbg.value for the corresponding - // fragment. 
- LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: " - << *DII << '\n'); - return; - } - - BasicBlock *BB = APN->getParent(); - auto InsertionPt = BB->getFirstInsertionPt(); - - DebugLoc NewLoc = getDebugValueLoc(DII); - - // The block may be a catchswitch block, which does not have a valid - // insertion point. - // FIXME: Insert dbg.value markers in the successors when appropriate. - if (InsertionPt != BB->end()) { - insertDbgValueOrDbgVariableRecord(Builder, APN, DIVar, DIExpr, NewLoc, - InsertionPt); - } -} - void llvm::ConvertDebugDeclareToDebugValue(DbgVariableRecord *DVR, LoadInst *LI, DIBuilder &Builder) { auto *DIVar = DVR->getVariable(); @@ -1981,7 +1830,7 @@ bool llvm::LowerDbgDeclare(Function &F) { if (Dbgs.empty() && DVRs.empty()) return Changed; - auto LowerOne = [&](auto *DDI) { + auto LowerOne = [&](DbgVariableRecord *DDI) { AllocaInst *AI = dyn_cast_or_null(DDI->getVariableLocationOp(0)); // If this is an alloca for a scalar variable, insert a dbg.value @@ -2036,7 +1885,6 @@ bool llvm::LowerDbgDeclare(Function &F) { Changed = true; }; - for_each(Dbgs, LowerOne); for_each(DVRs, LowerOne); if (Changed) @@ -2046,12 +1894,9 @@ bool llvm::LowerDbgDeclare(Function &F) { return Changed; } -// RemoveDIs: re-implementation of insertDebugValuesForPHIs, but which pulls the -// debug-info out of the block's DbgVariableRecords rather than dbg.value -// intrinsics. -static void -insertDbgVariableRecordsForPHIs(BasicBlock *BB, - SmallVectorImpl &InsertedPHIs) { +/// Propagate dbg.value records through the newly inserted PHIs. +void llvm::insertDebugValuesForPHIs(BasicBlock *BB, + SmallVectorImpl &InsertedPHIs) { assert(BB && "No BasicBlock to clone DbgVariableRecord(s) from."); if (InsertedPHIs.size() == 0) return; @@ -2113,76 +1958,12 @@ insertDbgVariableRecordsForPHIs(BasicBlock *BB, } } -/// Propagate dbg.value intrinsics through the newly inserted PHIs. 
-void llvm::insertDebugValuesForPHIs(BasicBlock *BB, - SmallVectorImpl &InsertedPHIs) { - assert(BB && "No BasicBlock to clone dbg.value(s) from."); - if (InsertedPHIs.size() == 0) - return; - - insertDbgVariableRecordsForPHIs(BB, InsertedPHIs); - - // Map existing PHI nodes to their dbg.values. - ValueToValueMapTy DbgValueMap; - for (auto &I : *BB) { - if (auto DbgII = dyn_cast(&I)) { - for (Value *V : DbgII->location_ops()) - if (auto *Loc = dyn_cast_or_null(V)) - DbgValueMap.insert({Loc, DbgII}); - } - } - if (DbgValueMap.size() == 0) - return; - - // Map a pair of the destination BB and old dbg.value to the new dbg.value, - // so that if a dbg.value is being rewritten to use more than one of the - // inserted PHIs in the same destination BB, we can update the same dbg.value - // with all the new PHIs instead of creating one copy for each. - MapVector, - DbgVariableIntrinsic *> - NewDbgValueMap; - // Then iterate through the new PHIs and look to see if they use one of the - // previously mapped PHIs. If so, create a new dbg.value intrinsic that will - // propagate the info through the new PHI. If we use more than one new PHI in - // a single destination BB with the same old dbg.value, merge the updates so - // that we get a single new dbg.value with all the new PHIs. - for (auto *PHI : InsertedPHIs) { - BasicBlock *Parent = PHI->getParent(); - // Avoid inserting an intrinsic into an EH block. - if (Parent->getFirstNonPHIIt()->isEHPad()) - continue; - for (auto *VI : PHI->operand_values()) { - auto V = DbgValueMap.find(VI); - if (V != DbgValueMap.end()) { - auto *DbgII = cast(V->second); - auto [NewDI, Inserted] = NewDbgValueMap.try_emplace({Parent, DbgII}); - if (Inserted) - NewDI->second = cast(DbgII->clone()); - DbgVariableIntrinsic *NewDbgII = NewDI->second; - // If PHI contains VI as an operand more than once, we may - // replaced it in NewDbgII; confirm that it is present. 
- if (is_contained(NewDbgII->location_ops(), VI)) - NewDbgII->replaceVariableLocationOp(VI, PHI); - } - } - } - // Insert thew new dbg.values into their destination blocks. - for (auto DI : NewDbgValueMap) { - BasicBlock *Parent = DI.first.first; - auto *NewDbgII = DI.second; - auto InsertionPt = Parent->getFirstInsertionPt(); - assert(InsertionPt != Parent->end() && "Ill-formed basic block"); - NewDbgII->insertBefore(InsertionPt); - } -} - bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, DIBuilder &Builder, uint8_t DIExprFlags, int Offset) { - TinyPtrVector DbgDeclares = findDbgDeclares(Address); TinyPtrVector DVRDeclares = findDVRDeclares(Address); - auto ReplaceOne = [&](auto *DII) { + auto ReplaceOne = [&](DbgVariableRecord *DII) { assert(DII->getVariable() && "Missing variable"); auto *DIExpr = DII->getExpression(); DIExpr = DIExpression::prepend(DIExpr, DIExprFlags, Offset); @@ -2190,10 +1971,9 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, DII->replaceVariableLocationOp(Address, NewAddress); }; - for_each(DbgDeclares, ReplaceOne); for_each(DVRDeclares, ReplaceOne); - return !DbgDeclares.empty() || !DVRDeclares.empty(); + return !DVRDeclares.empty(); } static void updateOneDbgValueForAlloca(const DebugLoc &Loc, @@ -2645,7 +2425,6 @@ using DbgValReplacement = std::optional; /// changes are made. static bool rewriteDebugUsers( Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT, - function_ref RewriteExpr, function_ref RewriteDVRExpr) { // Find debug users of From. SmallVector Users; @@ -2654,43 +2433,32 @@ static bool rewriteDebugUsers( if (Users.empty() && DPUsers.empty()) return false; + // Ignore intrinsic-users: they are no longer supported and should never + // appear. + assert(Users.empty()); + // Prevent use-before-def of To. 
bool Changed = false; - SmallPtrSet UndefOrSalvage; SmallPtrSet UndefOrSalvageDVR; if (isa(&To)) { bool DomPointAfterFrom = From.getNextNode() == &DomPoint; - for (auto *DII : Users) { - // It's common to see a debug user between From and DomPoint. Move it - // after DomPoint to preserve the variable update without any reordering. - if (DomPointAfterFrom && DII->getNextNode() == &DomPoint) { - LLVM_DEBUG(dbgs() << "MOVE: " << *DII << '\n'); - DII->moveAfter(&DomPoint); - Changed = true; - - // Users which otherwise aren't dominated by the replacement value must - // be salvaged or deleted. - } else if (!DT.dominates(&DomPoint, DII)) { - UndefOrSalvage.insert(DII); - } - } - // DbgVariableRecord implementation of the above. for (auto *DVR : DPUsers) { Instruction *MarkedInstr = DVR->getMarker()->MarkedInstr; Instruction *NextNonDebug = MarkedInstr; - // The next instruction might still be a dbg.declare, skip over it. - if (isa(NextNonDebug)) - NextNonDebug = NextNonDebug->getNextNode(); + // It's common to see a debug user between From and DomPoint. Move it + // after DomPoint to preserve the variable update without any reordering. if (DomPointAfterFrom && NextNonDebug == &DomPoint) { LLVM_DEBUG(dbgs() << "MOVE: " << *DVR << '\n'); DVR->removeFromParent(); - // Ensure there's a marker. DomPoint.getParent()->insertDbgRecordAfter(DVR, &DomPoint); Changed = true; + + // Users which otherwise aren't dominated by the replacement value must + // be salvaged or deleted. } else if (!DT.dominates(&DomPoint, MarkedInstr)) { UndefOrSalvageDVR.insert(DVR); } @@ -2698,19 +2466,6 @@ static bool rewriteDebugUsers( } // Update debug users without use-before-def risk. 
- for (auto *DII : Users) { - if (UndefOrSalvage.count(DII)) - continue; - - DbgValReplacement DVRepl = RewriteExpr(*DII); - if (!DVRepl) - continue; - - DII->replaceVariableLocationOp(&From, &To); - DII->setExpression(*DVRepl); - LLVM_DEBUG(dbgs() << "REWRITE: " << *DII << '\n'); - Changed = true; - } for (auto *DVR : DPUsers) { if (UndefOrSalvageDVR.count(DVR)) continue; @@ -2725,7 +2480,7 @@ static bool rewriteDebugUsers( Changed = true; } - if (!UndefOrSalvage.empty() || !UndefOrSalvageDVR.empty()) { + if (!UndefOrSalvageDVR.empty()) { // Try to salvage the remaining debug users. salvageDebugInfo(From); Changed = true; @@ -2770,9 +2525,6 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, Type *FromTy = From.getType(); Type *ToTy = To.getType(); - auto Identity = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement { - return DII.getExpression(); - }; auto IdentityDVR = [&](DbgVariableRecord &DVR) -> DbgValReplacement { return DVR.getExpression(); }; @@ -2781,7 +2533,7 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, Module &M = *From.getModule(); const DataLayout &DL = M.getDataLayout(); if (isBitCastSemanticsPreserving(DL, FromTy, ToTy)) - return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDVR); + return rewriteDebugUsers(From, To, DomPoint, DT, IdentityDVR); // Handle integer-to-integer widening and narrowing. // FIXME: Use DW_OP_convert when it's available everywhere. @@ -2793,24 +2545,10 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, // When the width of the result grows, assume that a debugger will only // access the low `FromBits` bits when inspecting the source variable. if (FromBits < ToBits) - return rewriteDebugUsers(From, To, DomPoint, DT, Identity, IdentityDVR); + return rewriteDebugUsers(From, To, DomPoint, DT, IdentityDVR); // The width of the result has shrunk. Use sign/zero extension to describe // the source variable's high bits. 
- auto SignOrZeroExt = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement { - DILocalVariable *Var = DII.getVariable(); - - // Without knowing signedness, sign/zero extension isn't possible. - auto Signedness = Var->getSignedness(); - if (!Signedness) - return std::nullopt; - - bool Signed = *Signedness == DIBasicType::Signedness::Signed; - return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits, - Signed); - }; - // RemoveDIs: duplicate implementation working on DbgVariableRecords rather - // than on dbg.value intrinsics. auto SignOrZeroExtDVR = [&](DbgVariableRecord &DVR) -> DbgValReplacement { DILocalVariable *Var = DVR.getVariable(); @@ -2823,8 +2561,7 @@ bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To, return DIExpression::appendExt(DVR.getExpression(), ToBits, FromBits, Signed); }; - return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt, - SignOrZeroExtDVR); + return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExtDVR); } // TODO: Floating-point conversions, vectors. 
@@ -3800,10 +3537,6 @@ void llvm::remapDebugVariable(ValueToValueMapTy &Mapping, Instruction *Inst) { if (I != Mapping.end()) DA->setAddress(I->second); }; - if (auto DVI = dyn_cast(Inst)) - RemapDebugOperands(DVI, DVI->location_ops()); - if (auto DAI = dyn_cast(Inst)) - RemapAssignAddress(DAI); for (DbgVariableRecord &DVR : filterDbgVars(Inst->getDbgRecordRange())) { RemapDebugOperands(&DVR, DVR.location_ops()); if (DVR.isDbgAssign()) diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 511c15555fa83..6226596017980 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -168,22 +168,6 @@ void StackInfoBuilder::visit(OptimizationRemarkEmitter &ORE, Info.AllocasToInstrument[AI].LifetimeEnd.push_back(II); return; } - if (auto *DVI = dyn_cast(&Inst)) { - auto AddIfInteresting = [&](Value *V) { - if (auto *AI = dyn_cast_or_null(V)) { - if (getAllocaInterestingness(*AI) != - AllocaInterestingness::kInteresting) - return; - AllocaInfo &AInfo = Info.AllocasToInstrument[AI]; - auto &DVIVec = AInfo.DbgVariableIntrinsics; - if (DVIVec.empty() || DVIVec.back() != DVI) - DVIVec.push_back(DVI); - } - }; - for_each(DVI->location_ops(), AddIfInteresting); - if (auto *DAI = dyn_cast(DVI)) - AddIfInteresting(DAI->getAddress()); - } Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst); if (ExitUntag) @@ -297,19 +281,12 @@ Value *getAndroidSlotPtr(IRBuilder<> &IRB, int Slot) { IRB.CreateCall(ThreadPointerFunc), 8 * Slot); } -static DbgAssignIntrinsic *DynCastToDbgAssign(DbgVariableIntrinsic *DVI) { - return dyn_cast(DVI); -} - static DbgVariableRecord *DynCastToDbgAssign(DbgVariableRecord *DVR) { return DVR->isDbgAssign() ? 
DVR : nullptr; } void annotateDebugRecords(AllocaInfo &Info, unsigned int Tag) { - // Helper utility for adding DW_OP_LLVM_tag_offset to debug-info records, - // abstracted over whether they're intrinsic-stored or DbgVariableRecord - // stored. - auto AnnotateDbgRecord = [&](auto *DPtr) { + auto AnnotateDbgRecord = [&](DbgVariableRecord *DPtr) { // Prepend "tag_offset, N" to the dwarf expression. // Tag offset logically applies to the alloca pointer, and it makes sense // to put it at the beginning of the expression. @@ -325,7 +302,6 @@ void annotateDebugRecords(AllocaInfo &Info, unsigned int Tag) { } }; - llvm::for_each(Info.DbgVariableIntrinsics, AnnotateDbgRecord); llvm::for_each(Info.DbgVariableRecords, AnnotateDbgRecord); } diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 46808a818cb26..ccd7ee360e014 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -115,29 +115,17 @@ static void createDebugValue(DIBuilder &DIB, Value *NewValue, DbgVariableRecord::createDbgVariableRecord(NewValue, Variable, Expression, DI, *InsertBefore); } -static void createDebugValue(DIBuilder &DIB, Value *NewValue, - DILocalVariable *Variable, - DIExpression *Expression, const DILocation *DI, - Instruction *InsertBefore) { - DIB.insertDbgValueIntrinsic(NewValue, Variable, Expression, DI, - InsertBefore->getIterator()); -} /// Helper for updating assignment tracking debug info when promoting allocas. class AssignmentTrackingInfo { /// DbgAssignIntrinsics linked to the alloca with at most one per variable /// fragment. (i.e. not be a comprehensive set if there are multiple /// dbg.assigns for one variable fragment). 
- SmallVector DbgAssigns; SmallVector DVRAssigns; public: void init(AllocaInst *AI) { SmallSet Vars; - for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(AI)) { - if (Vars.insert(DebugVariable(DAI)).second) - DbgAssigns.push_back(DAI); - } for (DbgVariableRecord *DVR : at::getDVRAssignmentMarkers(AI)) { if (Vars.insert(DebugVariable(DVR)).second) DVRAssigns.push_back(DVR); @@ -148,11 +136,10 @@ class AssignmentTrackingInfo { /// \p ToDelete that stores to this alloca. void updateForDeletedStore( StoreInst *ToDelete, DIBuilder &DIB, - SmallSet *DbgAssignsToDelete, SmallSet *DVRAssignsToDelete) const { // There's nothing to do if the alloca doesn't have any variables using // assignment tracking. - if (DbgAssigns.empty() && DVRAssigns.empty()) + if (DVRAssigns.empty()) return; // Insert a dbg.value where the linked dbg.assign is and remember to delete @@ -169,25 +156,22 @@ class AssignmentTrackingInfo { DbgAssign->getExpression(), DbgAssign->getDebugLoc(), DbgAssign); }; - for (auto *Assign : at::getAssignmentMarkers(ToDelete)) - InsertValueForAssign(Assign, DbgAssignsToDelete); for (auto *Assign : at::getDVRAssignmentMarkers(ToDelete)) InsertValueForAssign(Assign, DVRAssignsToDelete); // It's possible for variables using assignment tracking to have no - // dbg.assign linked to this store. These are variables in DbgAssigns that + // dbg.assign linked to this store. These are variables in DVRAssigns that // are missing from VarHasDbgAssignForStore. Since there isn't a dbg.assign // to mark the assignment - and the store is going to be deleted - insert a // dbg.value to do that now. An untracked store may be either one that // cannot be represented using assignment tracking (non-const offset or // size) or one that is trackable but has had its DIAssignID attachment // dropped accidentally. 
- auto ConvertUnlinkedAssignToValue = [&](auto *Assign) { + auto ConvertUnlinkedAssignToValue = [&](DbgVariableRecord *Assign) { if (VarHasDbgAssignForStore.contains(DebugVariableAggregate(Assign))) return; ConvertDebugDeclareToDebugValue(Assign, ToDelete, DIB); }; - for_each(DbgAssigns, ConvertUnlinkedAssignToValue); for_each(DVRAssigns, ConvertUnlinkedAssignToValue); } @@ -197,17 +181,12 @@ class AssignmentTrackingInfo { // Regardless of the position of dbg.assigns relative to stores, the // incoming values into a new PHI should be the same for the (imaginary) // debug-phi. - for (auto *DAI : DbgAssigns) - ConvertDebugDeclareToDebugValue(DAI, NewPhi, DIB); for (auto *DVR : DVRAssigns) ConvertDebugDeclareToDebugValue(DVR, NewPhi, DIB); } - void clear() { - DbgAssigns.clear(); - DVRAssigns.clear(); - } - bool empty() { return DbgAssigns.empty() && DVRAssigns.empty(); } + void clear() { DVRAssigns.clear(); } + bool empty() { return DVRAssigns.empty(); } }; struct AllocaInfo { @@ -412,7 +391,6 @@ struct PromoteMem2Reg { SmallVector AllocaATInfo; /// A set of dbg.assigns to delete because they've been demoted to /// dbg.values. Call cleanUpDbgAssigns to delete them. - SmallSet DbgAssignsToDelete; SmallSet DVRAssignsToDelete; /// The set of basic blocks the renamer has already visited. @@ -467,9 +445,6 @@ struct PromoteMem2Reg { /// Delete dbg.assigns that have been demoted to dbg.values. 
void cleanUpDbgAssigns() { - for (auto *DAI : DbgAssignsToDelete) - DAI->eraseFromParent(); - DbgAssignsToDelete.clear(); for (auto *DVR : DVRAssignsToDelete) DVR->eraseFromParent(); DVRAssignsToDelete.clear(); @@ -571,7 +546,6 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, const DataLayout &DL, DominatorTree &DT, AssumptionCache *AC, - SmallSet *DbgAssignsToDelete, SmallSet *DVRAssignsToDelete) { StoreInst *OnlyStore = Info.OnlyStore; Value *ReplVal = OnlyStore->getOperand(0); @@ -637,27 +611,23 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false); // Update assignment tracking info for the store we're going to delete. - Info.AssignmentTracking.updateForDeletedStore( - Info.OnlyStore, DIB, DbgAssignsToDelete, DVRAssignsToDelete); + Info.AssignmentTracking.updateForDeletedStore(Info.OnlyStore, DIB, + DVRAssignsToDelete); // Record debuginfo for the store and remove the declaration's // debuginfo. 
- auto ConvertDebugInfoForStore = [&](auto &Container) { - for (auto *DbgItem : Container) { - if (DbgItem->isAddressOfVariable()) { - ConvertDebugDeclareToDebugValue(DbgItem, Info.OnlyStore, DIB); - DbgItem->eraseFromParent(); - } else if (DbgItem->isValueOfVariable() && - DbgItem->getExpression()->startsWithDeref()) { - InsertDebugValueAtStoreLoc(DbgItem, Info.OnlyStore, DIB); - DbgItem->eraseFromParent(); - } else if (DbgItem->getExpression()->startsWithDeref()) { - DbgItem->eraseFromParent(); - } + for (DbgVariableRecord *DbgItem : Info.DPUsers) { + if (DbgItem->isAddressOfVariable()) { + ConvertDebugDeclareToDebugValue(DbgItem, Info.OnlyStore, DIB); + DbgItem->eraseFromParent(); + } else if (DbgItem->isValueOfVariable() && + DbgItem->getExpression()->startsWithDeref()) { + InsertDebugValueAtStoreLoc(DbgItem, Info.OnlyStore, DIB); + DbgItem->eraseFromParent(); + } else if (DbgItem->getExpression()->startsWithDeref()) { + DbgItem->eraseFromParent(); } - }; - ConvertDebugInfoForStore(Info.DbgUsers); - ConvertDebugInfoForStore(Info.DPUsers); + } // Remove dbg.assigns linked to the alloca as these are now redundant. at::deleteAssignmentMarkers(AI); @@ -690,7 +660,6 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, LargeBlockInfo &LBI, const DataLayout &DL, DominatorTree &DT, AssumptionCache *AC, - SmallSet *DbgAssignsToDelete, SmallSet *DVRAssignsToDelete) { // The trickiest case to handle is when we have large blocks. Because of this, // this code is optimized assuming that large blocks happen. This does not @@ -755,18 +724,13 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, while (!AI->use_empty()) { StoreInst *SI = cast(AI->user_back()); // Update assignment tracking info for the store we're going to delete. 
- Info.AssignmentTracking.updateForDeletedStore(SI, DIB, DbgAssignsToDelete, - DVRAssignsToDelete); + Info.AssignmentTracking.updateForDeletedStore(SI, DIB, DVRAssignsToDelete); // Record debuginfo for the store before removing it. - auto DbgUpdateForStore = [&](auto &Container) { - for (auto *DbgItem : Container) { - if (DbgItem->isAddressOfVariable()) { - ConvertDebugDeclareToDebugValue(DbgItem, SI, DIB); - } + for (DbgVariableRecord *DbgItem : Info.DPUsers) { + if (DbgItem->isAddressOfVariable()) { + ConvertDebugDeclareToDebugValue(DbgItem, SI, DIB); } - }; - DbgUpdateForStore(Info.DbgUsers); - DbgUpdateForStore(Info.DPUsers); + } SI->eraseFromParent(); LBI.deleteValue(SI); @@ -830,7 +794,7 @@ void PromoteMem2Reg::run() { // it that are directly dominated by the definition with the value stored. if (Info.DefiningBlocks.size() == 1) { if (rewriteSingleStoreAlloca(AI, Info, LBI, SQ.DL, DT, AC, - &DbgAssignsToDelete, &DVRAssignsToDelete)) { + &DVRAssignsToDelete)) { // The alloca has been processed, move on. RemoveFromAllocasList(AllocaNum); ++NumSingleStore; @@ -842,7 +806,7 @@ void PromoteMem2Reg::run() { // linear sweep over the block to eliminate it. if (Info.OnlyUsedInOneBlock && promoteSingleBlockAlloca(AI, Info, LBI, SQ.DL, DT, AC, - &DbgAssignsToDelete, &DVRAssignsToDelete)) { + &DVRAssignsToDelete)) { // The alloca has been processed, move on. RemoveFromAllocasList(AllocaNum); continue; @@ -1182,13 +1146,9 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred) { // The currently active variable for this block is now the PHI. 
IncomingVals.set(AllocaNo, APN); AllocaATInfo[AllocaNo].updateForNewPhi(APN, DIB); - auto ConvertDbgDeclares = [&](auto &Container) { - for (auto *DbgItem : Container) - if (DbgItem->isAddressOfVariable()) - ConvertDebugDeclareToDebugValue(DbgItem, APN, DIB); - }; - ConvertDbgDeclares(AllocaDbgUsers[AllocaNo]); - ConvertDbgDeclares(AllocaDPUsers[AllocaNo]); + for (DbgVariableRecord *DbgItem : AllocaDPUsers[AllocaNo]) + if (DbgItem->isAddressOfVariable()) + ConvertDebugDeclareToDebugValue(DbgItem, APN, DIB); // Get the next phi node. ++PNI; @@ -1242,15 +1202,11 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred) { // Record debuginfo for the store before removing it. IncomingLocs.set(AllocaNo, SI->getDebugLoc()); - AllocaATInfo[AllocaNo].updateForDeletedStore(SI, DIB, &DbgAssignsToDelete, + AllocaATInfo[AllocaNo].updateForDeletedStore(SI, DIB, &DVRAssignsToDelete); - auto ConvertDbgDeclares = [&](auto &Container) { - for (auto *DbgItem : Container) - if (DbgItem->isAddressOfVariable()) - ConvertDebugDeclareToDebugValue(DbgItem, SI, DIB); - }; - ConvertDbgDeclares(AllocaDbgUsers[ai->second]); - ConvertDbgDeclares(AllocaDPUsers[ai->second]); + for (DbgVariableRecord *DbgItem : AllocaDPUsers[ai->second]) + if (DbgItem->isAddressOfVariable()) + ConvertDebugDeclareToDebugValue(DbgItem, SI, DIB); SI->eraseFromParent(); } } From 85349b49364d240a2e82981a7d7e0d01b13b1284 Mon Sep 17 00:00:00 2001 From: zGoldthorpe Date: Wed, 16 Jul 2025 07:23:09 -0600 Subject: [PATCH 036/813] [clang][amdgpu] Add builtin for struct buffer lds load (#148950) This is essentially just a revision of #137678 which only exposes a builtin for the intrinsic `llvm.amdgcn.struct.ptr.buffer.load.lds`, which expects an `__amdgpu_buffer_rsrc_t` rather than a `v4i32` as its first argument. The reason for excluding the other intrinsics exposed by the cited PR is because the intrinsics taking a `v4i32` are legacy and should be deprecated. 
--- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/Sema/SemaAMDGPU.cpp | 1 + .../CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl | 9 +++++++++ .../builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl | 7 +++++++ ...iltins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl | 3 ++- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 4 +++- 6 files changed, 23 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 29e1e99bba9ef..313c0e640d240 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -164,6 +164,7 @@ BUILTIN(__builtin_amdgcn_raw_buffer_load_b96, "V3UiQbiiIi", "n") BUILTIN(__builtin_amdgcn_raw_buffer_load_b128, "V4UiQbiiIi", "n") TARGET_BUILTIN(__builtin_amdgcn_raw_ptr_buffer_load_lds, "vQbv*3IUiiiIiIi", "t", "vmem-to-lds-load-insts") +TARGET_BUILTIN(__builtin_amdgcn_struct_ptr_buffer_load_lds, "vQbv*3IUiiiiIiIi", "t", "vmem-to-lds-load-insts") //===----------------------------------------------------------------------===// // Ballot builtins. 
diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index e6414a623b929..c23c98aa3aaeb 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -36,6 +36,7 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_raw_ptr_buffer_load_lds: + case AMDGPU::BI__builtin_amdgcn_struct_ptr_buffer_load_lds: case AMDGPU::BI__builtin_amdgcn_load_to_lds: case AMDGPU::BI__builtin_amdgcn_global_load_lds: { constexpr const int SizeIdx = 2; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl index 8256b61525f9d..177165972b7a9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raw-buffer-load-lds.cl @@ -10,3 +10,12 @@ void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int offset, int soffset) { __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 1, offset, soffset, 2, 3); } + +// CHECK-LABEL: @test_amdgcn_struct_ptr_buffer_load_lds( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.struct.ptr.buffer.load.lds(ptr addrspace(8) [[RSRC:%.*]], ptr addrspace(3) [[LDS:%.*]], i32 4, i32 [[VINDEX:%.*]], i32 [[VOFFSET:%.*]], i32 [[SOFFSET:%.*]], i32 2, i32 3) +// CHECK-NEXT: ret void +// +void test_amdgcn_struct_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int size, int vindex, int voffset, int soffset) { + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, 2, 3); +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl index 5915393ae7f56..8fbffbeea0531 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl +++ 
b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-error.cl @@ -8,3 +8,10 @@ void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local vo __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_raw_ptr_buffer_load_lds' must be a constant integer}} __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 3, offset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}} } + +void test_amdgcn_struct_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void * lds, int size, int vindex, int voffset, int soffset, int x) { + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, x, vindex, voffset, soffset, 0, 0); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, x, 0); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, voffset, soffset, 0, x); //expected-error{{argument to '__builtin_amdgcn_struct_ptr_buffer_load_lds' must be a constant integer}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 3, vindex, voffset, soffset, 0, 0); //expected-error{{invalid size value}} gfx950-note{{size must be 1, 2, 4, 12 or 16}} gfx90a-note{{size must be 1, 2, or 4}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl index 74944f2d93c72..cb832b9aa4845 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-raw-ptr-buffer-load-lds-target-error.cl @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S 
-verify -o - %s // REQUIRES: amdgpu-registered-target -void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int offset, int soffset, int x) { +void test_amdgcn_raw_ptr_buffer_load_lds(__amdgpu_buffer_rsrc_t rsrc, __local void* lds, int vindex, int offset, int soffset) { __builtin_amdgcn_raw_ptr_buffer_load_lds(rsrc, lds, 4, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}} + __builtin_amdgcn_struct_ptr_buffer_load_lds(rsrc, lds, 4, vindex, offset, soffset, 0, 0); //expected-error{{needs target feature vmem-to-lds-load-insts}} } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index eb2c63c24e4b5..d8fda0e2bcfa3 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1936,7 +1936,9 @@ class AMDGPUStructBufferLoadLDS : Intrinsic < ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_lds : AMDGPUStructBufferLoadLDS; -class AMDGPUStructPtrBufferLoadLDS : Intrinsic < +class AMDGPUStructPtrBufferLoadLDS : + ClangBuiltin<"__builtin_amdgcn_struct_ptr_buffer_load_lds">, + Intrinsic < [], [AMDGPUBufferRsrcTy, // rsrc(SGPR) LLVMQualPointerType<3>, // LDS base offset From 1600450f9098e5c9cb26840bd53f1be8a2559b7d Mon Sep 17 00:00:00 2001 From: Sirraide Date: Wed, 16 Jul 2025 15:48:53 +0200 Subject: [PATCH 037/813] [Clang] Reintroduce obsolete libclang symbols to avoid an ABI break (#149079) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For more context, see https://github.com/llvm/llvm-project/pull/119269#issuecomment-3075444493, but briefly, when removing ARCMigrate, I also removed some symbols in libclang, which constitutes an ABI break that we don’t want, so this pr reintroduces the removed symbols; the declarations are marked as deprecated for future removal, and the implementations print an error 
and do nothing, which is what we used to do when ARCMigrate was disabled. --- clang/include/clang-c/Index.h | 15 ++++++ clang/tools/libclang/CMakeLists.txt | 1 + clang/tools/libclang/Obsolete.cpp | 48 +++++++++++++++++++ .../secondary/clang/tools/libclang/BUILD.gn | 1 + 4 files changed, 65 insertions(+) create mode 100644 clang/tools/libclang/Obsolete.cpp diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index c35311c886413..b929585205aee 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -6953,6 +6953,21 @@ clang_getCursorUnaryOperatorKind(CXCursor cursor); * @} */ +CINDEX_DEPRECATED +typedef void *CXRemapping; + +CINDEX_DEPRECATED CINDEX_LINKAGE CXRemapping clang_getRemappings(const char *); + +CINDEX_DEPRECATED CINDEX_LINKAGE CXRemapping +clang_getRemappingsFromFileList(const char **, unsigned); + +CINDEX_DEPRECATED CINDEX_LINKAGE unsigned clang_remap_getNumFiles(CXRemapping); + +CINDEX_DEPRECATED CINDEX_LINKAGE void +clang_remap_getFilenames(CXRemapping, unsigned, CXString *, CXString *); + +CINDEX_DEPRECATED CINDEX_LINKAGE void clang_remap_dispose(CXRemapping); + LLVM_CLANG_C_EXTERN_C_END #endif diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt index b6662b66206b2..2b1e266f07392 100644 --- a/clang/tools/libclang/CMakeLists.txt +++ b/clang/tools/libclang/CMakeLists.txt @@ -42,6 +42,7 @@ set(SOURCES Indexing.cpp FatalErrorHandler.cpp Rewrite.cpp + Obsolete.cpp ADDITIONAL_HEADERS CIndexDiagnostic.h diff --git a/clang/tools/libclang/Obsolete.cpp b/clang/tools/libclang/Obsolete.cpp new file mode 100644 index 0000000000000..3596f76e1be6f --- /dev/null +++ b/clang/tools/libclang/Obsolete.cpp @@ -0,0 +1,48 @@ +//===- Obsolete.cpp - Obsolete libclang functions and types -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===--------------------------------------------------------------------===// +// +// This file contains libclang symbols whose underlying functionality has been +// removed from Clang, but which need to be kept around so as to retain ABI +// compatibility. +// +//===--------------------------------------------------------------------===// + +#include "clang-c/CXString.h" +#include "clang-c/Index.h" +#include "clang-c/Platform.h" +#include "llvm/Support/raw_ostream.h" + +extern "C" { + +// The functions below used to be part of the C API for ARCMigrate, which has +// since been removed from Clang; they already used to print an error if Clang +// was compiled without arcmt support, so we continue doing so. +CXRemapping clang_getRemappings(const char *) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; + return nullptr; +} + +CXRemapping clang_getRemappingsFromFileList(const char **, unsigned) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; + return nullptr; +} + +unsigned clang_remap_getNumFiles(CXRemapping) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; + return 0; +} + +void clang_remap_getFilenames(CXRemapping, unsigned, CXString *, CXString *) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; +} + +void clang_remap_dispose(CXRemapping) { + llvm::errs() << "error: ARCMigrate has been removed from Clang"; +} + +} // extern "C" diff --git a/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn index 8f7beea152ab7..30b8bb61184bd 100644 --- a/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn @@ -87,6 +87,7 @@ shared_library("libclang") { "Index_Internal.h", "Indexing.cpp", "Rewrite.cpp", + "Obsolete.cpp", ] if (host_os == "mac") { ldflags = [ From 76058c09071491fd097e85a0f5434b564dfad60b Mon Sep 17 
00:00:00 2001 From: Jan Svoboda Date: Wed, 16 Jul 2025 07:11:13 -0700 Subject: [PATCH 038/813] [clang] Move `ExceptionHandling` from `LangOptions` to `CodeGenOptions` (#148982) This PR removes the command line parsing workaround introduced in https://github.com/llvm/llvm-project/pull/146342 by moving `LangOptions::ExceptionHandling` to `CodeGenOptions` that get parsed even for IR input. Additionally, this improves layering, where the codegen library now checks `CodeGenOptions` instead of `LangOptions` for exception handling. (This got enabled by https://github.com/llvm/llvm-project/pull/146422.) --- clang/include/clang/Basic/CodeGenOptions.def | 2 + clang/include/clang/Basic/CodeGenOptions.h | 19 ++++++++ clang/include/clang/Basic/LangOptions.def | 2 - clang/include/clang/Basic/LangOptions.h | 19 -------- clang/include/clang/Driver/Options.td | 8 +-- clang/lib/CodeGen/BackendUtil.cpp | 8 +-- clang/lib/CodeGen/CGException.cpp | 51 +++++++++++--------- clang/lib/Frontend/CompilerInvocation.cpp | 45 ++--------------- clang/lib/Frontend/InitPreprocessor.cpp | 8 +-- 9 files changed, 63 insertions(+), 99 deletions(-) diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index a11e12d495cd2..cfffeb71f09d1 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -56,6 +56,8 @@ CODEGENOPT(XCOFFReadOnlyPointers, 1, 0, Benign) ///< Set for -mxcoff-roptr. CODEGENOPT(AllTocData, 1, 0, Benign) ///< AIX -mtocdata ENUM_CODEGENOPT(FramePointer, FramePointerKind, 2, FramePointerKind::None, Benign) /// frame-pointer: all,non-leaf,reserved,none +ENUM_CODEGENOPT(ExceptionHandling, ExceptionHandlingKind, 3, ExceptionHandlingKind::None, NotCompatible) + CODEGENOPT(ClearASTBeforeBackend , 1, 0, Benign) ///< Free the AST before running backend code generation. Only works with -disable-free. CODEGENOPT(DisableFree , 1, 0, Benign) ///< Don't free memory. 
CODEGENOPT(DiscardValueNames , 1, 0, Benign) ///< Discard Value Names from the IR (LLVMContext flag) diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index df4403ace5fe3..cdeedd5b4eac6 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -176,6 +176,9 @@ class CodeGenOptions : public CodeGenOptionsBase { llvm_unreachable("invalid FramePointerKind"); } + /// Possible exception handling behavior. + enum class ExceptionHandlingKind { None, SjLj, WinEH, DwarfCFI, Wasm }; + enum class SwiftAsyncFramePointerKind { Auto, // Choose Swift async extended frame info based on deployment target. Always, // Unconditionally emit Swift async extended frame info. @@ -552,6 +555,22 @@ class CodeGenOptions : public CodeGenOptionsBase { return NoBuiltinFuncs; } + bool hasSjLjExceptions() const { + return getExceptionHandling() == ExceptionHandlingKind::SjLj; + } + + bool hasSEHExceptions() const { + return getExceptionHandling() == ExceptionHandlingKind::WinEH; + } + + bool hasDWARFExceptions() const { + return getExceptionHandling() == ExceptionHandlingKind::DwarfCFI; + } + + bool hasWasmExceptions() const { + return getExceptionHandling() == ExceptionHandlingKind::Wasm; + } + /// Check if Clang profile instrumenation is on. 
bool hasProfileClangInstr() const { return getProfileInstr() == diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 6c47107796236..6ac8d496f1494 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -98,8 +98,6 @@ LANGOPT(Exceptions , 1, 0, NotCompatible, "exception handling") LANGOPT(ObjCExceptions , 1, 0, NotCompatible, "Objective-C exceptions") LANGOPT(CXXExceptions , 1, 0, NotCompatible, "C++ exceptions") LANGOPT(EHAsynch , 1, 0, NotCompatible, "C/C++ EH Asynch exceptions") -ENUM_LANGOPT(ExceptionHandling, ExceptionHandlingKind, 3, - ExceptionHandlingKind::None, NotCompatible, "exception handling") LANGOPT(IgnoreExceptions , 1, 0, NotCompatible, "ignore exceptions") LANGOPT(ExternCNoUnwind , 1, 0, NotCompatible, "Assume extern C functions don't unwind") LANGOPT(AssumeNothrowExceptionDtor , 1, 0, NotCompatible, "Assume exception object's destructor is nothrow") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 4c642c9e10c91..937cbff4e3ea3 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -337,9 +337,6 @@ class LangOptionsBase { enum ExcessPrecisionKind { FPP_Standard, FPP_Fast, FPP_None }; - /// Possible exception handling behavior. - enum class ExceptionHandlingKind { None, SjLj, WinEH, DwarfCFI, Wasm }; - enum class LaxVectorConversionKind { /// Permit no implicit vector bitcasts. 
None, @@ -788,22 +785,6 @@ class LangOptions : public LangOptionsBase { return getSignReturnAddressScope() == SignReturnAddressScopeKind::All; } - bool hasSjLjExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::SjLj; - } - - bool hasSEHExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::WinEH; - } - - bool hasDWARFExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::DwarfCFI; - } - - bool hasWasmExceptions() const { - return getExceptionHandling() == ExceptionHandlingKind::Wasm; - } - bool isSYCL() const { return SYCLIsDevice || SYCLIsHost; } bool hasDefaultVisibilityExportMapping() const { diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index bce29a76f3ac7..a8c1b5dd8ab3b 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2176,10 +2176,10 @@ def fwasm_exceptions : Flag<["-"], "fwasm-exceptions">, Group, HelpText<"Use WebAssembly style exceptions">; def exception_model : Separate<["-"], "exception-model">, Visibility<[CC1Option]>, HelpText<"The exception model">, - Values<"dwarf,sjlj,seh,wasm">, - NormalizedValuesScope<"LangOptions::ExceptionHandlingKind">, - NormalizedValues<["DwarfCFI", "SjLj", "WinEH", "Wasm"]>, - MarshallingInfoEnum, "None">; + Values<"dwarf,sjlj,seh,wasm,none">, + NormalizedValuesScope<"CodeGenOptions::ExceptionHandlingKind">, + NormalizedValues<["DwarfCFI", "SjLj", "WinEH", "Wasm", "None"]>, + MarshallingInfoEnum, "None">; def exception_model_EQ : Joined<["-"], "exception-model=">, Visibility<[CC1Option]>, Alias; def fignore_exceptions : Flag<["-"], "fignore-exceptions">, Group, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 2f6d4c414e737..1b7257857dd3b 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -407,13 +407,13 @@ static bool initTargetOptions(const CompilerInstance &CI, // Set EABI 
version. Options.EABIVersion = TargetOpts.EABIVersion; - if (LangOpts.hasSjLjExceptions()) + if (CodeGenOpts.hasSjLjExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::SjLj; - if (LangOpts.hasSEHExceptions()) + if (CodeGenOpts.hasSEHExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::WinEH; - if (LangOpts.hasDWARFExceptions()) + if (CodeGenOpts.hasDWARFExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::DwarfCFI; - if (LangOpts.hasWasmExceptions()) + if (CodeGenOpts.hasWasmExceptions()) Options.ExceptionModel = llvm::ExceptionHandling::Wasm; Options.NoInfsFPMath = LangOpts.NoHonorInfs; diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index ad138b9876e8c..f86af4581c345 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -131,20 +131,21 @@ const EHPersonality EHPersonality::ZOS_CPlusPlus = {"__zos_cxx_personality_v2", nullptr}; static const EHPersonality &getCPersonality(const TargetInfo &Target, - const LangOptions &L) { + const CodeGenOptions &CGOpts) { const llvm::Triple &T = Target.getTriple(); if (T.isWindowsMSVCEnvironment()) return EHPersonality::MSVC_CxxFrameHandler3; - if (L.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) return EHPersonality::GNU_C_SJLJ; - if (L.hasDWARFExceptions()) + if (CGOpts.hasDWARFExceptions()) return EHPersonality::GNU_C; - if (L.hasSEHExceptions()) + if (CGOpts.hasSEHExceptions()) return EHPersonality::GNU_C_SEH; return EHPersonality::GNU_C; } static const EHPersonality &getObjCPersonality(const TargetInfo &Target, + const CodeGenOptions &CGOpts, const LangOptions &L) { const llvm::Triple &T = Target.getTriple(); if (T.isWindowsMSVCEnvironment()) @@ -152,7 +153,7 @@ static const EHPersonality &getObjCPersonality(const TargetInfo &Target, switch (L.ObjCRuntime.getKind()) { case ObjCRuntime::FragileMacOSX: - return getCPersonality(Target, L); + return getCPersonality(Target, CGOpts); case ObjCRuntime::MacOSX: case 
ObjCRuntime::iOS: case ObjCRuntime::WatchOS: @@ -165,9 +166,9 @@ static const EHPersonality &getObjCPersonality(const TargetInfo &Target, [[fallthrough]]; case ObjCRuntime::GCC: case ObjCRuntime::ObjFW: - if (L.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) return EHPersonality::GNU_ObjC_SJLJ; - if (L.hasSEHExceptions()) + if (CGOpts.hasSEHExceptions()) return EHPersonality::GNU_ObjC_SEH; return EHPersonality::GNU_ObjC; } @@ -175,19 +176,19 @@ static const EHPersonality &getObjCPersonality(const TargetInfo &Target, } static const EHPersonality &getCXXPersonality(const TargetInfo &Target, - const LangOptions &L) { + const CodeGenOptions &CGOpts) { const llvm::Triple &T = Target.getTriple(); if (T.isWindowsMSVCEnvironment()) return EHPersonality::MSVC_CxxFrameHandler3; if (T.isOSAIX()) return EHPersonality::XL_CPlusPlus; - if (L.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) return EHPersonality::GNU_CPlusPlus_SJLJ; - if (L.hasDWARFExceptions()) + if (CGOpts.hasDWARFExceptions()) return EHPersonality::GNU_CPlusPlus; - if (L.hasSEHExceptions()) + if (CGOpts.hasSEHExceptions()) return EHPersonality::GNU_CPlusPlus_SEH; - if (L.hasWasmExceptions()) + if (CGOpts.hasWasmExceptions()) return EHPersonality::GNU_Wasm_CPlusPlus; if (T.isOSzOS()) return EHPersonality::ZOS_CPlusPlus; @@ -197,6 +198,7 @@ static const EHPersonality &getCXXPersonality(const TargetInfo &Target, /// Determines the personality function to use when both C++ /// and Objective-C exceptions are being caught. static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, + const CodeGenOptions &CGOpts, const LangOptions &L) { if (Target.getTriple().isWindowsMSVCEnvironment()) return EHPersonality::MSVC_CxxFrameHandler3; @@ -205,7 +207,7 @@ static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, // In the fragile ABI, just use C++ exception handling and hope // they're not doing crazy exception mixing. 
case ObjCRuntime::FragileMacOSX: - return getCXXPersonality(Target, L); + return getCXXPersonality(Target, CGOpts); // The ObjC personality defers to the C++ personality for non-ObjC // handlers. Unlike the C++ case, we use the same personality @@ -213,7 +215,7 @@ static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, case ObjCRuntime::MacOSX: case ObjCRuntime::iOS: case ObjCRuntime::WatchOS: - return getObjCPersonality(Target, L); + return getObjCPersonality(Target, CGOpts, L); case ObjCRuntime::GNUstep: return Target.getTriple().isOSCygMing() ? EHPersonality::GNU_CPlusPlus_SEH @@ -223,7 +225,7 @@ static const EHPersonality &getObjCXXPersonality(const TargetInfo &Target, // mixed EH. Use the ObjC personality just to avoid returning null. case ObjCRuntime::GCC: case ObjCRuntime::ObjFW: - return getObjCPersonality(Target, L); + return getObjCPersonality(Target, CGOpts, L); } llvm_unreachable("bad runtime kind"); } @@ -237,6 +239,7 @@ static const EHPersonality &getSEHPersonalityMSVC(const llvm::Triple &T) { const EHPersonality &EHPersonality::get(CodeGenModule &CGM, const FunctionDecl *FD) { const llvm::Triple &T = CGM.getTarget().getTriple(); + const CodeGenOptions &CGOpts = CGM.getCodeGenOpts(); const LangOptions &L = CGM.getLangOpts(); const TargetInfo &Target = CGM.getTarget(); @@ -245,10 +248,10 @@ const EHPersonality &EHPersonality::get(CodeGenModule &CGM, return getSEHPersonalityMSVC(T); if (L.ObjC) - return L.CPlusPlus ? getObjCXXPersonality(Target, L) - : getObjCPersonality(Target, L); - return L.CPlusPlus ? getCXXPersonality(Target, L) - : getCPersonality(Target, L); + return L.CPlusPlus ? getObjCXXPersonality(Target, CGOpts, L) + : getObjCPersonality(Target, CGOpts, L); + return L.CPlusPlus ? 
getCXXPersonality(Target, CGOpts) + : getCPersonality(Target, CGOpts); } const EHPersonality &EHPersonality::get(CodeGenFunction &CGF) { @@ -344,7 +347,7 @@ void CodeGenModule::SimplifyPersonality() { return; const EHPersonality &ObjCXX = EHPersonality::get(*this, /*FD=*/nullptr); - const EHPersonality &CXX = getCXXPersonality(getTarget(), LangOpts); + const EHPersonality &CXX = getCXXPersonality(getTarget(), CodeGenOpts); if (&ObjCXX == &CXX) return; @@ -500,7 +503,7 @@ void CodeGenFunction::EmitStartEHSpec(const Decl *D) { // In Wasm EH we currently treat 'throw()' in the same way as 'noexcept'. In // case of throw with types, we ignore it and print a warning for now. // TODO Correctly handle exception specification in Wasm EH - if (CGM.getLangOpts().hasWasmExceptions()) { + if (CGM.getCodeGenOpts().hasWasmExceptions()) { if (EST == EST_DynamicNone) EHStack.pushTerminate(); else @@ -515,8 +518,8 @@ void CodeGenFunction::EmitStartEHSpec(const Decl *D) { // throw with types. // TODO Correctly handle exception specification in Emscripten EH if (getTarget().getCXXABI() == TargetCXXABI::WebAssembly && - CGM.getLangOpts().getExceptionHandling() == - LangOptions::ExceptionHandlingKind::None && + CGM.getCodeGenOpts().getExceptionHandling() == + CodeGenOptions::ExceptionHandlingKind::None && EST == EST_Dynamic) CGM.getDiags().Report(D->getLocation(), diag::warn_wasm_dynamic_exception_spec_ignored) @@ -604,7 +607,7 @@ void CodeGenFunction::EmitEndEHSpec(const Decl *D) { // In wasm we currently treat 'throw()' in the same way as 'noexcept'. In // case of throw with types, we ignore it and print a warning for now. 
// TODO Correctly handle exception specification in wasm - if (CGM.getLangOpts().hasWasmExceptions()) { + if (CGM.getCodeGenOpts().hasWasmExceptions()) { if (EST == EST_DynamicNone) EHStack.popTerminate(); return; diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 56d10ceb986b3..6ab36d8675966 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -593,11 +593,11 @@ static bool FixupInvocation(CompilerInvocation &Invocation, CodeGenOpts.CodeModel = TargetOpts.CodeModel; CodeGenOpts.LargeDataThreshold = TargetOpts.LargeDataThreshold; - if (LangOpts.getExceptionHandling() != - LangOptions::ExceptionHandlingKind::None && + if (CodeGenOpts.getExceptionHandling() != + CodeGenOptions::ExceptionHandlingKind::None && T.isWindowsMSVCEnvironment()) Diags.Report(diag::err_fe_invalid_exception_model) - << static_cast(LangOpts.getExceptionHandling()) << T.str(); + << static_cast(CodeGenOpts.getExceptionHandling()) << T.str(); if (LangOpts.AppleKext && !LangOpts.CPlusPlus) Diags.Report(diag::warn_c_kext); @@ -3713,23 +3713,6 @@ static StringRef GetInputKindName(InputKind IK) { llvm_unreachable("unknown input language"); } -static StringRef getExceptionHandlingName(unsigned EHK) { - switch (static_cast(EHK)) { - case LangOptions::ExceptionHandlingKind::None: - return "none"; - case LangOptions::ExceptionHandlingKind::DwarfCFI: - return "dwarf"; - case LangOptions::ExceptionHandlingKind::SjLj: - return "sjlj"; - case LangOptions::ExceptionHandlingKind::WinEH: - return "seh"; - case LangOptions::ExceptionHandlingKind::Wasm: - return "wasm"; - } - - llvm_unreachable("covered switch"); -} - void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, ArgumentConsumer Consumer, const llvm::Triple &T, @@ -3745,10 +3728,6 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, GenerateArg(Consumer, OPT_pic_is_pie); for (StringRef Sanitizer : 
serializeSanitizerKinds(Opts.Sanitize)) GenerateArg(Consumer, OPT_fsanitize_EQ, Sanitizer); - if (Opts.ExceptionHandling) { - GenerateArg(Consumer, OPT_exception_model, - getExceptionHandlingName(Opts.ExceptionHandling)); - } return; } @@ -4057,24 +4036,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, parseSanitizerKinds("-fsanitize=", Args.getAllArgValues(OPT_fsanitize_EQ), Diags, Opts.Sanitize); - if (const Arg *A = Args.getLastArg(options::OPT_exception_model)) { - std::optional EMValue = - llvm::StringSwitch>( - A->getValue()) - .Case("dwarf", LangOptions::ExceptionHandlingKind::DwarfCFI) - .Case("sjlj", LangOptions::ExceptionHandlingKind::SjLj) - .Case("seh", LangOptions::ExceptionHandlingKind::WinEH) - .Case("wasm", LangOptions::ExceptionHandlingKind::Wasm) - .Case("none", LangOptions::ExceptionHandlingKind::None) - .Default(std::nullopt); - if (EMValue) { - Opts.ExceptionHandling = static_cast(*EMValue); - } else { - Diags.Report(diag::err_drv_invalid_value) - << A->getAsString(Args) << A->getValue(); - } - } - return Diags.getNumErrors() == NumErrorsBefore; } diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 136bc55847cc1..38b2e0cf1ca59 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1032,14 +1032,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (LangOpts.GNUCVersion && LangOpts.RTTI) Builder.defineMacro("__GXX_RTTI"); - if (LangOpts.hasSjLjExceptions()) + if (CGOpts.hasSjLjExceptions()) Builder.defineMacro("__USING_SJLJ_EXCEPTIONS__"); - else if (LangOpts.hasSEHExceptions()) + else if (CGOpts.hasSEHExceptions()) Builder.defineMacro("__SEH__"); - else if (LangOpts.hasDWARFExceptions() && + else if (CGOpts.hasDWARFExceptions() && (TI.getTriple().isThumb() || TI.getTriple().isARM())) Builder.defineMacro("__ARM_DWARF_EH__"); - else if (LangOpts.hasWasmExceptions() && TI.getTriple().isWasm()) + else if 
(CGOpts.hasWasmExceptions() && TI.getTriple().isWasm()) Builder.defineMacro("__WASM_EXCEPTIONS__"); if (LangOpts.Deprecated) From a944e861f99f2bea9f2f5015deb52c1d82ac4f8a Mon Sep 17 00:00:00 2001 From: Augie Fackler Date: Wed, 16 Jul 2025 10:15:47 -0400 Subject: [PATCH 039/813] [bazel] update for 3e4153c97b54d456cfaf6ae21391122582b0ab28 --- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index ac4e3813fbba8..3598944381900 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -270,6 +270,12 @@ td_library( includes = ["include"], ) +td_library( + name = "BuiltinsRISCVXAndesTdFiles", + srcs = ["include/clang/Basic/BuiltinsRISCVXAndes.td"], + includes = ["include"], +) + td_library( name = "BuiltinsX86BaseTdFiles", srcs = ["include/clang/Basic/BuiltinsX86Base.td"], @@ -348,6 +354,7 @@ gentbl_cc_library( td_file = "include/clang/Basic/BuiltinsRISCV.td", deps = [ ":BuiltinsBaseTdFiles", + ":BuiltinsRISCVXAndesTdFiles", ":BuiltinsRISCVXCVTdFiles", ], ) From 752e31c27c0d2888e23db9db63dedc316dbaa1a4 Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Wed, 16 Jul 2025 16:19:51 +0200 Subject: [PATCH 040/813] [LifetimeSafety] Make the dataflow analysis generic (#148222) Refactored the lifetime safety analysis to use a generic dataflow framework with a policy-based design. 
### Changes - Introduced a generic `DataflowAnalysis` template class that can be specialized for different analyses - Renamed `LifetimeLattice` to `LoanPropagationLattice` to better reflect its purpose - Created a `LoanPropagationAnalysis` class that inherits from the generic framework - Moved transfer functions from the standalone `Transferer` class into the analysis class - Restructured the code to separate the dataflow engine from the specific analysis logic - Updated debug output and test expectations to use the new class names ### Motivation In order to add more analyses, e.g. [loan expiry](https://github.com/llvm/llvm-project/pull/148712) and origin liveness, the previous implementation would have separate, nearly identical dataflow runners for each analysis. This change creates a single, reusable component, which will make it much simpler to add subsequent analyses without repeating boilerplate code. This is quite close to the existing dataflow framework! --- clang/lib/Analysis/LifetimeSafety.cpp | 395 ++++++++++-------- .../Sema/warn-lifetime-safety-dataflow.cpp | 30 +- 2 files changed, 233 insertions(+), 192 deletions(-) diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index bf67bea6c9933..9c623e3a5693b 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/ImmutableMap.h" #include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/PointerUnion.h" +#include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" #include "llvm/Support/TimeProfiler.h" @@ -496,7 +497,168 @@ class FactGenerator : public ConstStmtVisitor { }; // ========================================================================= // -// The Dataflow Lattice +// Generic Dataflow Analysis +// ========================================================================= // +/// A generic, policy-based driver for forward dataflow analyses. 
It combines +/// the dataflow runner and the transferer logic into a single class hierarchy. +/// +/// The derived class is expected to provide: +/// - A `Lattice` type. +/// - `StringRef getAnalysisName() const` +/// - `Lattice getInitialState();` The initial state at the function entry. +/// - `Lattice join(Lattice, Lattice);` Merges states from multiple CFG paths. +/// - `Lattice transfer(Lattice, const FactType&);` Defines how a single +/// lifetime-relevant `Fact` transforms the lattice state. Only overloads +/// for facts relevant to the analysis need to be implemented. +/// +/// \tparam Derived The CRTP derived class that implements the specific +/// analysis. +/// \tparam LatticeType The dataflow lattice used by the analysis. +/// TODO: Maybe use the dataflow framework! The framework might need changes +/// to support the current comparison done at block-entry. +template class DataflowAnalysis { +public: + using Lattice = LatticeType; + +private: + const CFG &Cfg; + AnalysisDeclContext &AC; + + llvm::DenseMap BlockEntryStates; + llvm::DenseMap BlockExitStates; + +protected: + FactManager &AllFacts; + + explicit DataflowAnalysis(const CFG &C, AnalysisDeclContext &AC, + FactManager &F) + : Cfg(C), AC(AC), AllFacts(F) {} + +public: + void run() { + Derived &D = static_cast(*this); + llvm::TimeTraceScope Time(D.getAnalysisName()); + + ForwardDataflowWorklist Worklist(Cfg, AC); + const CFGBlock *Entry = &Cfg.getEntry(); + BlockEntryStates[Entry] = D.getInitialState(); + Worklist.enqueueBlock(Entry); + llvm::SmallBitVector Visited; + Visited.resize(Cfg.getNumBlockIDs() + 1); + + while (const CFGBlock *B = Worklist.dequeue()) { + Lattice EntryState = getEntryState(B); + Lattice ExitState = transferBlock(B, EntryState); + BlockExitStates[B] = ExitState; + Visited.set(B->getBlockID()); + + for (const CFGBlock *Successor : B->succs()) { + Lattice OldSuccEntryState = getEntryState(Successor); + Lattice NewSuccEntryState = D.join(OldSuccEntryState, ExitState); + + // 
Enqueue the successor if its entry state has changed or if we have + // never visited it. + if (!Visited.test(Successor->getBlockID()) || + NewSuccEntryState != OldSuccEntryState) { + BlockEntryStates[Successor] = NewSuccEntryState; + Worklist.enqueueBlock(Successor); + } + } + } + } + + Lattice getEntryState(const CFGBlock *B) const { + return BlockEntryStates.lookup(B); + } + + Lattice getExitState(const CFGBlock *B) const { + return BlockExitStates.lookup(B); + } + + void dump() const { + const Derived *D = static_cast(this); + llvm::dbgs() << "==========================================\n"; + llvm::dbgs() << D->getAnalysisName() << " results:\n"; + llvm::dbgs() << "==========================================\n"; + const CFGBlock &B = Cfg.getExit(); + getExitState(&B).dump(llvm::dbgs()); + } + +private: + /// Computes the exit state of a block by applying all its facts sequentially + /// to a given entry state. + /// TODO: We might need to store intermediate states per-fact in the block for + /// later analysis. 
+ Lattice transferBlock(const CFGBlock *Block, Lattice EntryState) { + Lattice BlockState = EntryState; + for (const Fact *F : AllFacts.getFacts(Block)) { + BlockState = transferFact(BlockState, F); + } + return BlockState; + } + + Lattice transferFact(Lattice In, const Fact *F) { + Derived *d = static_cast(this); + switch (F->getKind()) { + case Fact::Kind::Issue: + return d->transfer(In, *F->getAs()); + case Fact::Kind::Expire: + return d->transfer(In, *F->getAs()); + case Fact::Kind::AssignOrigin: + return d->transfer(In, *F->getAs()); + case Fact::Kind::ReturnOfOrigin: + return d->transfer(In, *F->getAs()); + } + llvm_unreachable("Unknown fact kind"); + } + +public: + Lattice transfer(Lattice In, const IssueFact &) { return In; } + Lattice transfer(Lattice In, const ExpireFact &) { return In; } + Lattice transfer(Lattice In, const AssignOriginFact &) { return In; } + Lattice transfer(Lattice In, const ReturnOfOriginFact &) { return In; } +}; + +namespace utils { + +/// Computes the union of two ImmutableSets. +template +llvm::ImmutableSet join(llvm::ImmutableSet A, llvm::ImmutableSet B, + typename llvm::ImmutableSet::Factory &F) { + if (A.getHeight() < B.getHeight()) + std::swap(A, B); + for (const T &E : B) + A = F.add(A, E); + return A; +} + +/// Computes the key-wise union of two ImmutableMaps. +// TODO(opt): This key-wise join is a performance bottleneck. A more +// efficient merge could be implemented using a Patricia Trie or HAMT +// instead of the current AVL-tree-based ImmutableMap. +template +llvm::ImmutableMap +join(llvm::ImmutableMap A, llvm::ImmutableMap B, + typename llvm::ImmutableMap::Factory &F, Joiner joinValues) { + if (A.getHeight() < B.getHeight()) + std::swap(A, B); + + // For each element in B, join it with the corresponding element in A + // (or with an empty value if it doesn't exist in A). 
+ for (const auto &Entry : B) { + const K &Key = Entry.first; + const V &ValB = Entry.second; + if (const V *ValA = A.lookup(Key)) + A = F.add(A, Key, joinValues(*ValA, ValB)); + else + A = F.add(A, Key, ValB); + } + return A; +} +} // namespace utils + +// ========================================================================= // +// Loan Propagation Analysis // ========================================================================= // // Using LLVM's immutable collections is efficient for dataflow analysis @@ -509,82 +671,37 @@ using OriginLoanMap = llvm::ImmutableMap; /// that all created states share the same underlying memory management. struct LifetimeFactory { OriginLoanMap::Factory OriginMapFactory; - LoanSet::Factory LoanSetFact; + LoanSet::Factory LoanSetFactory; /// Creates a singleton set containing only the given loan ID. LoanSet createLoanSet(LoanID LID) { - return LoanSetFact.add(LoanSetFact.getEmptySet(), LID); + return LoanSetFactory.add(LoanSetFactory.getEmptySet(), LID); } }; -/// LifetimeLattice represents the state of our analysis at a given program -/// point. It is an immutable object, and all operations produce a new -/// instance rather than modifying the existing one. -struct LifetimeLattice { +/// Represents the dataflow lattice for loan propagation. +/// +/// This lattice tracks which loans each origin may hold at a given program +/// point.The lattice has a finite height: An origin's loan set is bounded by +/// the total number of loans in the function. +/// TODO(opt): To reduce the lattice size, propagate origins of declarations, +/// not expressions, because expressions are not visible across blocks. +struct LoanPropagationLattice { /// The map from an origin to the set of loans it contains. - /// The lattice has a finite height: An origin's loan set is bounded by the - /// total number of loans in the function. 
- /// TODO(opt): To reduce the lattice size, propagate origins of declarations, - /// not expressions, because expressions are not visible across blocks. OriginLoanMap Origins = OriginLoanMap(nullptr); - explicit LifetimeLattice(const OriginLoanMap &S) : Origins(S) {} - LifetimeLattice() = default; + explicit LoanPropagationLattice(const OriginLoanMap &S) : Origins(S) {} + LoanPropagationLattice() = default; - bool operator==(const LifetimeLattice &Other) const { + bool operator==(const LoanPropagationLattice &Other) const { return Origins == Other.Origins; } - bool operator!=(const LifetimeLattice &Other) const { + bool operator!=(const LoanPropagationLattice &Other) const { return !(*this == Other); } - LoanSet getLoans(OriginID OID) const { - if (auto *Loans = Origins.lookup(OID)) - return *Loans; - return LoanSet(nullptr); - } - - /// Computes the union of two lattices by performing a key-wise join of - /// their OriginLoanMaps. - // TODO(opt): This key-wise join is a performance bottleneck. A more - // efficient merge could be implemented using a Patricia Trie or HAMT - // instead of the current AVL-tree-based ImmutableMap. - // TODO(opt): Keep the state small by removing origins which become dead. - LifetimeLattice join(const LifetimeLattice &Other, - LifetimeFactory &Factory) const { - /// Merge the smaller map into the larger one ensuring we iterate over the - /// smaller map. - if (Origins.getHeight() < Other.Origins.getHeight()) - return Other.join(*this, Factory); - - OriginLoanMap JoinedState = Origins; - // For each origin in the other map, union its loan set with ours. 
- for (const auto &Entry : Other.Origins) { - OriginID OID = Entry.first; - LoanSet OtherLoanSet = Entry.second; - JoinedState = Factory.OriginMapFactory.add( - JoinedState, OID, join(getLoans(OID), OtherLoanSet, Factory)); - } - return LifetimeLattice(JoinedState); - } - - LoanSet join(LoanSet a, LoanSet b, LifetimeFactory &Factory) const { - /// Merge the smaller set into the larger one ensuring we iterate over the - /// smaller set. - if (a.getHeight() < b.getHeight()) - std::swap(a, b); - LoanSet Result = a; - for (LoanID LID : b) { - /// TODO(opt): Profiling shows that this loop is a major performance - /// bottleneck. Investigate using a BitVector to represent the set of - /// loans for improved join performance. - Result = Factory.LoanSetFact.add(Result, LID); - } - return Result; - } - void dump(llvm::raw_ostream &OS) const { - OS << "LifetimeLattice State:\n"; + OS << "LoanPropagationLattice State:\n"; if (Origins.isEmpty()) OS << " \n"; for (const auto &Entry : Origins) { @@ -596,143 +713,66 @@ struct LifetimeLattice { } }; -// ========================================================================= // -// The Transfer Function -// ========================================================================= // -class Transferer { - FactManager &AllFacts; +/// The analysis that tracks which loans belong to which origins. +class LoanPropagationAnalysis + : public DataflowAnalysis { + LifetimeFactory &Factory; public: - explicit Transferer(FactManager &F, LifetimeFactory &Factory) - : AllFacts(F), Factory(Factory) {} + LoanPropagationAnalysis(const CFG &C, AnalysisDeclContext &AC, FactManager &F, + LifetimeFactory &Factory) + : DataflowAnalysis(C, AC, F), Factory(Factory) {} - /// Computes the exit state of a block by applying all its facts sequentially - /// to a given entry state. - /// TODO: We might need to store intermediate states per-fact in the block for - /// later analysis. 
- LifetimeLattice transferBlock(const CFGBlock *Block, - LifetimeLattice EntryState) { - LifetimeLattice BlockState = EntryState; - llvm::ArrayRef Facts = AllFacts.getFacts(Block); + using DataflowAnalysis::transfer; - for (const Fact *F : Facts) { - BlockState = transferFact(BlockState, F); - } - return BlockState; - } + StringRef getAnalysisName() const { return "LoanPropagation"; } -private: - LifetimeLattice transferFact(LifetimeLattice In, const Fact *F) { - switch (F->getKind()) { - case Fact::Kind::Issue: - return transfer(In, *F->getAs()); - case Fact::Kind::AssignOrigin: - return transfer(In, *F->getAs()); - // Expire and ReturnOfOrigin facts don't modify the Origins and the State. - case Fact::Kind::Expire: - case Fact::Kind::ReturnOfOrigin: - return In; - } - llvm_unreachable("Unknown fact kind"); + Lattice getInitialState() { return Lattice{}; } + + /// Merges two lattices by taking the union of loans for each origin. + // TODO(opt): Keep the state small by removing origins which become dead. + Lattice join(Lattice A, Lattice B) { + OriginLoanMap JoinedOrigins = + utils::join(A.Origins, B.Origins, Factory.OriginMapFactory, + [this](LoanSet S1, LoanSet S2) { + return utils::join(S1, S2, Factory.LoanSetFactory); + }); + return Lattice(JoinedOrigins); } /// A new loan is issued to the origin. Old loans are erased. - LifetimeLattice transfer(LifetimeLattice In, const IssueFact &F) { + Lattice transfer(Lattice In, const IssueFact &F) { OriginID OID = F.getOriginID(); LoanID LID = F.getLoanID(); - return LifetimeLattice(Factory.OriginMapFactory.add( + return LoanPropagationLattice(Factory.OriginMapFactory.add( In.Origins, OID, Factory.createLoanSet(LID))); } /// The destination origin's loan set is replaced by the source's. /// This implicitly "resets" the old loans of the destination. 
- LifetimeLattice transfer(LifetimeLattice InState, const AssignOriginFact &F) { + Lattice transfer(Lattice In, const AssignOriginFact &F) { OriginID DestOID = F.getDestOriginID(); OriginID SrcOID = F.getSrcOriginID(); - LoanSet SrcLoans = InState.getLoans(SrcOID); - return LifetimeLattice( - Factory.OriginMapFactory.add(InState.Origins, DestOID, SrcLoans)); + LoanSet SrcLoans = getLoans(In, SrcOID); + return LoanPropagationLattice( + Factory.OriginMapFactory.add(In.Origins, DestOID, SrcLoans)); } -}; -// ========================================================================= // -// Dataflow analysis -// ========================================================================= // - -/// Drives the intra-procedural dataflow analysis. -/// -/// Orchestrates the analysis by iterating over the CFG using a worklist -/// algorithm. It computes a fixed point by propagating the LifetimeLattice -/// state through each block until the state no longer changes. -/// TODO: Maybe use the dataflow framework! The framework might need changes -/// to support the current comparison done at block-entry. -class LifetimeDataflow { - const CFG &Cfg; - AnalysisDeclContext &AC; - LifetimeFactory LifetimeFact; - - Transferer Xfer; - - /// Stores the merged analysis state at the entry of each CFG block. - llvm::DenseMap BlockEntryStates; - /// Stores the analysis state at the exit of each CFG block, after the - /// transfer function has been applied. 
- llvm::DenseMap BlockExitStates; - -public: - LifetimeDataflow(const CFG &C, FactManager &FS, AnalysisDeclContext &AC) - : Cfg(C), AC(AC), Xfer(FS, LifetimeFact) {} - - void run() { - llvm::TimeTraceScope TimeProfile("Lifetime Dataflow"); - ForwardDataflowWorklist Worklist(Cfg, AC); - const CFGBlock *Entry = &Cfg.getEntry(); - BlockEntryStates[Entry] = LifetimeLattice{}; - Worklist.enqueueBlock(Entry); - while (const CFGBlock *B = Worklist.dequeue()) { - LifetimeLattice EntryState = getEntryState(B); - LifetimeLattice ExitState = Xfer.transferBlock(B, EntryState); - BlockExitStates[B] = ExitState; - - for (const CFGBlock *Successor : B->succs()) { - auto SuccIt = BlockEntryStates.find(Successor); - LifetimeLattice OldSuccEntryState = (SuccIt != BlockEntryStates.end()) - ? SuccIt->second - : LifetimeLattice{}; - LifetimeLattice NewSuccEntryState = - OldSuccEntryState.join(ExitState, LifetimeFact); - // Enqueue the successor if its entry state has changed. - // TODO(opt): Consider changing 'join' to report a change if != - // comparison is found expensive. 
- if (SuccIt == BlockEntryStates.end() || - NewSuccEntryState != OldSuccEntryState) { - BlockEntryStates[Successor] = NewSuccEntryState; - Worklist.enqueueBlock(Successor); - } - } - } - } - - void dump() const { - llvm::dbgs() << "==========================================\n"; - llvm::dbgs() << " Dataflow results:\n"; - llvm::dbgs() << "==========================================\n"; - const CFGBlock &B = Cfg.getExit(); - getExitState(&B).dump(llvm::dbgs()); - } - - LifetimeLattice getEntryState(const CFGBlock *B) const { - return BlockEntryStates.lookup(B); - } - - LifetimeLattice getExitState(const CFGBlock *B) const { - return BlockExitStates.lookup(B); +private: + LoanSet getLoans(Lattice L, OriginID OID) { + if (auto *Loans = L.Origins.lookup(OID)) + return *Loans; + return Factory.LoanSetFactory.getEmptySet(); } }; // ========================================================================= // -// TODO: Analysing dataflow results and error reporting. +// TODO: +// - Modifying loan propagation to answer `LoanSet getLoans(Origin O, Point P)` +// - Modify loan expiry analysis to answer `bool isExpired(Loan L, Point P)` +// - Modify origin liveness analysis to answer `bool isLive(Origin O, Point P)` +// - Using the above three to perform the final error reporting. // ========================================================================= // } // anonymous namespace @@ -755,8 +795,9 @@ void runLifetimeSafetyAnalysis(const DeclContext &DC, const CFG &Cfg, /// blocks; only Decls are visible. Therefore, loans in a block that /// never reach an Origin associated with a Decl can be safely dropped by /// the analysis. 
- LifetimeDataflow Dataflow(Cfg, FactMgr, AC); - Dataflow.run(); - DEBUG_WITH_TYPE("LifetimeDataflow", Dataflow.dump()); + LifetimeFactory Factory; + LoanPropagationAnalysis LoanPropagation(Cfg, AC, FactMgr, Factory); + LoanPropagation.run(); + DEBUG_WITH_TYPE("LifetimeLoanPropagation", LoanPropagation.dump()); } } // namespace clang diff --git a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp index 38dfdb98f08fc..0e98904ade86a 100644 --- a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp +++ b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -mllvm -debug-only=LifetimeFacts,LifetimeDataflow -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -mllvm -debug-only=LifetimeFacts,LifetimeLoanPropagation -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s // REQUIRES: asserts struct MyObj { @@ -19,7 +19,7 @@ MyObj* return_local_addr() { // CHECK: ReturnOfOrigin (OriginID: [[O_RET_VAL]]) // CHECK: Expire (LoanID: [[L_X]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_ADDR_X]] contains Loan [[L_X]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_X]] // CHECK-DAG: Origin [[O_RET_VAL]] contains Loan [[L_X]] @@ -47,7 +47,7 @@ MyObj* assign_and_return_local_addr() { // CHECK: ReturnOfOrigin (OriginID: [[O_PTR2_RVAL_2]]) // CHECK: Expire (LoanID: [[L_Y]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_ADDR_Y]] contains Loan [[L_Y]] // CHECK-DAG: Origin [[O_PTR1]] contains Loan [[L_Y]] // CHECK-DAG: Origin [[O_PTR2]] contains Loan [[L_Y]] @@ -65,7 +65,7 @@ int return_int_val() { return x; } // CHECK-NEXT: End of Block -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK: @@ -79,7 +79,7 @@ void loan_expires_cpp() { // CHECK: AssignOrigin (DestID: [[O_POBJ:[0-9]+]], SrcID: [[O_ADDR_OBJ]]) // CHECK: Expire (LoanID: [[L_OBJ]]) } -// CHECK: Dataflow results: 
+// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_ADDR_OBJ]] contains Loan [[L_OBJ]] // CHECK-DAG: Origin [[O_POBJ]] contains Loan [[L_OBJ]] @@ -96,7 +96,7 @@ void loan_expires_trivial() { // CHECK-NEXT: End of Block // FIXME: Add check for Expire once trivial destructors are handled for expiration. } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_ADDR_TRIVIAL_OBJ]] contains Loan [[L_TRIVIAL_OBJ]] // CHECK-DAG: Origin [[O_PTOBJ]] contains Loan [[L_TRIVIAL_OBJ]] @@ -119,7 +119,7 @@ void conditional(bool condition) { // CHECK: AssignOrigin (DestID: [[O_P_RVAL:[0-9]+]], SrcID: [[O_P]]) // CHECK: AssignOrigin (DestID: [[O_Q:[0-9]+]], SrcID: [[O_P_RVAL]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_ADDR_A]] contains Loan [[L_A]] // CHECK-DAG: Origin [[O_ADDR_B]] contains Loan [[L_B]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_A]] @@ -163,7 +163,7 @@ void pointers_in_a_cycle(bool condition) { } // At the end of the analysis, the origins for the pointers involved in the cycle // (p1, p2, p3, temp) should all contain the loans from v1, v2, and v3 at the fixed point. -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_P1]] contains Loan [[L_V1]] // CHECK-DAG: Origin [[O_P1]] contains Loan [[L_V2]] // CHECK-DAG: Origin [[O_P1]] contains Loan [[L_V3]] @@ -195,7 +195,7 @@ void overwrite_origin() { // CHECK: Expire (LoanID: [[L_S2]]) // CHECK: Expire (LoanID: [[L_S1]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK: Origin [[O_P]] contains Loan [[L_S2]] // CHECK-NOT: Origin [[O_P]] contains Loan [[L_S1]] @@ -213,7 +213,7 @@ void reassign_to_null() { } // FIXME: Have a better representation for nullptr than just an empty origin. // It should be a separate loan and origin kind. 
-// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK: Origin [[O_P]] contains no loans @@ -235,7 +235,7 @@ void reassign_in_if(bool condition) { // CHECK: Expire (LoanID: [[L_S2]]) // CHECK: Expire (LoanID: [[L_S1]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S1]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S2]] // CHECK-DAG: Origin [[O_ADDR_S1]] contains Loan [[L_S1]] @@ -276,7 +276,7 @@ void assign_in_switch(int mode) { // CHECK-DAG: Expire (LoanID: [[L_S2]]) // CHECK-DAG: Expire (LoanID: [[L_S1]]) } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S1]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S2]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S3]] @@ -299,7 +299,7 @@ void loan_in_loop(bool condition) { // CHECK: Expire (LoanID: [[L_INNER]]) } } -// CHECK: Dataflow results: +// CHECK: LoanPropagation results: // CHECK-DAG: Origin [[O_P]] contains Loan [[L_INNER]] // CHECK-DAG: Origin [[O_ADDR_INNER]] contains Loan [[L_INNER]] @@ -326,7 +326,7 @@ void loop_with_break(int count) { // CHECK: Expire (LoanID: [[L_S1]]) } -// CHECK-LABEL: Dataflow results: +// CHECK-LABEL: LoanPropagation results: // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S1]] // CHECK-DAG: Origin [[O_P]] contains Loan [[L_S2]] // CHECK-DAG: Origin [[O_ADDR_S1]] contains Loan [[L_S1]] @@ -355,7 +355,7 @@ void nested_scopes() { // CHECK: Expire (LoanID: [[L_OUTER]]) } -// CHECK-LABEL: Dataflow results: +// CHECK-LABEL: LoanPropagation results: // CHECK-DAG: Origin [[O_P]] contains Loan [[L_INNER]] // CHECK-DAG: Origin [[O_ADDR_INNER]] contains Loan [[L_INNER]] // CHECK-DAG: Origin [[O_ADDR_OUTER]] contains Loan [[L_OUTER]] From 53355ab9328f5dbdaf77c8529b9eb95cce2166d2 Mon Sep 17 00:00:00 2001 From: Stephen Long <63318318+steplong@users.noreply.github.com> Date: Wed, 16 Jul 2025 10:22:17 -0400 Subject: [PATCH 041/813] [llvm-exegesis] fix 
test when building LLVM for WoA. NFC (#148968) If building with LLVM_DEFAULT_TARGET_TRIPLE=arm-none-linux-gnueabi on WoA (Windows on ARM), this test fails, so force it to use aarch64 with -mtriple. --- .../llvm-exegesis/AArch64/setReg_init_check.s | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s b/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s index a4350fc6dc2cb..3ef664f899551 100644 --- a/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s +++ b/llvm/test/tools/llvm-exegesis/AArch64/setReg_init_check.s @@ -3,7 +3,7 @@ REQUIRES: aarch64-registered-target ## PPR Register Class Initialization Testcase ## Ideally, we should use PTRUE_{B/H/S/D} instead of FADDV_VPZ_D for an isolated test case; ## However, exegesis does not yet support PTRUE_{B/H/S/D}. -RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FADDV_VPZ_D --benchmark-phase=assemble-measured-code 2>&1 +RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FADDV_VPZ_D --benchmark-phase=assemble-measured-code 2>&1 RUN: llvm-objdump -d %d > %t.s RUN: FileCheck %s --check-prefix=PPR_ASM < %t.s PPR_ASM: : @@ -14,7 +14,7 @@ PPR_ASM-NEXT: faddv d{{[0-9]+}}, p{{[0-9]+}}, z{{[0-9]+}} ## ZPR Register Class Initialization Testcase ## Ideally, we should use DUP_ZI_{B/H/S/D} instead of FADDV_VPZ_D for an isolated test case; ## However, exegesis does not yet support DUP_ZI_{B/H/S/D}. 
-RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FADDV_VPZ_D --benchmark-phase=assemble-measured-code 2>&1 +RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FADDV_VPZ_D --benchmark-phase=assemble-measured-code 2>&1 RUN: llvm-objdump -d %d > %t.s RUN: FileCheck %s --check-prefix=ZPR_ASM < %t.s ZPR_ASM: : @@ -23,7 +23,7 @@ ZPR_ASM-NEXT: mov z{{[0-9]+}}.d, #0x0 ZPR_ASM-NEXT: faddv d{{[0-9]+}}, p{{[0-9]+}}, z{{[0-9]+}} ## FPR128 Register Class Initialization Testcase -RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=ADDVv16i8v --benchmark-phase=assemble-measured-code 2>&1 +RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=ADDVv16i8v --benchmark-phase=assemble-measured-code 2>&1 RUN: llvm-objdump -d %d > %t.s RUN: FileCheck %s --check-prefix=FPR128-ASM < %t.s FPR128-ASM: : @@ -31,7 +31,7 @@ FPR128-ASM: movi v{{[0-9]+}}.2d, #0000000000000000 FPR128-ASM-NEXT: addv b{{[0-9]+}}, v{{[0-9]+}}.16b ## FPR64 Register Class Initialization Testcase -RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=ADDVv4i16v --benchmark-phase=assemble-measured-code 2>&1 +RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=ADDVv4i16v --benchmark-phase=assemble-measured-code 2>&1 RUN: llvm-objdump -d %d > %t.s RUN: FileCheck %s --check-prefix=FPR64-ASM < %t.s FPR64-ASM: : @@ -39,7 +39,7 @@ FPR64-ASM: movi d{{[0-9]+}}, #0000000000000000 FPR64-ASM-NEXT: addv h{{[0-9]+}}, v{{[0-9]+}}.4h ## FPR32 Register Class Initialization Testcase -RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FABSSr --benchmark-phase=assemble-measured-code 2>&1 +RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FABSSr 
--benchmark-phase=assemble-measured-code 2>&1 RUN: llvm-objdump -d %d > %t.s RUN: FileCheck %s --check-prefix=FPR32-ASM < %t.s FPR32-ASM: : @@ -48,7 +48,7 @@ FPR32-ASM-NEXT: fabs s{{[0-9]+}}, s{{[0-9]+}} ## FPR16 Register Class Initialization Testcase -RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FABSHr --benchmark-phase=assemble-measured-code 2>&1 +RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=FABSHr --benchmark-phase=assemble-measured-code 2>&1 RUN: llvm-objdump -d %d > %t.s RUN: FileCheck %s --check-prefix=FPR16-ASM < %t.s FPR16-ASM: : @@ -56,7 +56,7 @@ FPR16-ASM: movi d{{[0-9]+}}, #0000000000000000 FPR16-ASM-NEXT: fabs h{{[0-9]+}}, h{{[0-9]+}} ## FPR8 Register Class Initialization Testcase -RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=SQABSv1i8 --benchmark-phase=assemble-measured-code 2>&1 +RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=SQABSv1i8 --benchmark-phase=assemble-measured-code 2>&1 RUN: llvm-objdump -d %d > %t.s RUN: FileCheck %s --check-prefix=FPR8-ASM < %t.s FPR8-ASM: : @@ -65,7 +65,7 @@ FPR8-ASM-NEXT: sqabs b{{[0-9]+}}, b{{[0-9]+}} ## FPCR Register Class Initialization Testcase -RUN: llvm-exegesis -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=BFCVT --benchmark-phase=assemble-measured-code 2>&1 +RUN: llvm-exegesis -mtriple=aarch64 -mcpu=neoverse-v2 -mode=latency --dump-object-to-disk=%d --opcode-name=BFCVT --benchmark-phase=assemble-measured-code 2>&1 RUN: llvm-objdump -d %d > %t.s RUN: FileCheck %s --check-prefix=FPCR-ASM < %t.s FPCR-ASM: : From f7fc36dca7e40c1b38c2481b65280b396cd8daf8 Mon Sep 17 00:00:00 2001 From: Utkarsh Saxena Date: Wed, 16 Jul 2025 16:25:48 +0200 Subject: [PATCH 042/813] [LifetimeSafety] Support bidirectional dataflow analysis (#148967) Generalize the dataflow analysis to support both forward 
and backward analyses. Some program analyses would be expressed as backward dataflow problems (like liveness analysis). This change enables the framework to support both forward analyses (like the loan propagation analysis) and backward analyses with the same infrastructure. --- clang/lib/Analysis/LifetimeSafety.cpp | 108 ++++++++++++++------------ 1 file changed, 59 insertions(+), 49 deletions(-) diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index 9c623e3a5693b..e3a03cf93880e 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -499,13 +499,16 @@ class FactGenerator : public ConstStmtVisitor { // ========================================================================= // // Generic Dataflow Analysis // ========================================================================= // -/// A generic, policy-based driver for forward dataflow analyses. It combines + +enum class Direction { Forward, Backward }; + +/// A generic, policy-based driver for dataflow analyses. It combines /// the dataflow runner and the transferer logic into a single class hierarchy. /// /// The derived class is expected to provide: /// - A `Lattice` type. /// - `StringRef getAnalysisName() const` -/// - `Lattice getInitialState();` The initial state at the function entry. +/// - `Lattice getInitialState();` The initial state of the analysis. /// - `Lattice join(Lattice, Lattice);` Merges states from multiple CFG paths. /// - `Lattice transfer(Lattice, const FactType&);` Defines how a single /// lifetime-relevant `Fact` transforms the lattice state. Only overloads @@ -514,18 +517,23 @@ class FactGenerator : public ConstStmtVisitor { /// \tparam Derived The CRTP derived class that implements the specific /// analysis. /// \tparam LatticeType The dataflow lattice used by the analysis. +/// \tparam Dir The direction of the analysis (Forward or Backward). /// TODO: Maybe use the dataflow framework! 
The framework might need changes /// to support the current comparison done at block-entry. -template class DataflowAnalysis { +template +class DataflowAnalysis { public: using Lattice = LatticeType; + using Base = DataflowAnalysis; private: const CFG &Cfg; AnalysisDeclContext &AC; - llvm::DenseMap BlockEntryStates; - llvm::DenseMap BlockExitStates; + llvm::DenseMap InStates; + llvm::DenseMap OutStates; + + static constexpr bool isForward() { return Dir == Direction::Forward; } protected: FactManager &AllFacts; @@ -539,75 +547,76 @@ template class DataflowAnalysis { Derived &D = static_cast(*this); llvm::TimeTraceScope Time(D.getAnalysisName()); - ForwardDataflowWorklist Worklist(Cfg, AC); - const CFGBlock *Entry = &Cfg.getEntry(); - BlockEntryStates[Entry] = D.getInitialState(); - Worklist.enqueueBlock(Entry); - llvm::SmallBitVector Visited; - Visited.resize(Cfg.getNumBlockIDs() + 1); - - while (const CFGBlock *B = Worklist.dequeue()) { - Lattice EntryState = getEntryState(B); - Lattice ExitState = transferBlock(B, EntryState); - BlockExitStates[B] = ExitState; - Visited.set(B->getBlockID()); + using Worklist = + std::conditional_t; + Worklist W(Cfg, AC); + + const CFGBlock *Start = isForward() ? &Cfg.getEntry() : &Cfg.getExit(); + InStates[Start] = D.getInitialState(); + W.enqueueBlock(Start); - for (const CFGBlock *Successor : B->succs()) { - Lattice OldSuccEntryState = getEntryState(Successor); - Lattice NewSuccEntryState = D.join(OldSuccEntryState, ExitState); + llvm::SmallBitVector Visited(Cfg.getNumBlockIDs() + 1); - // Enqueue the successor if its entry state has changed or if we have + while (const CFGBlock *B = W.dequeue()) { + Lattice StateIn = getInState(B); + Lattice StateOut = transferBlock(B, StateIn); + OutStates[B] = StateOut; + Visited.set(B->getBlockID()); + for (const CFGBlock *AdjacentB : isForward() ? 
B->succs() : B->preds()) { + Lattice OldInState = getInState(AdjacentB); + Lattice NewInState = D.join(OldInState, StateOut); + // Enqueue the adjacent block if its in-state has changed or if we have // never visited it. - if (!Visited.test(Successor->getBlockID()) || - NewSuccEntryState != OldSuccEntryState) { - BlockEntryStates[Successor] = NewSuccEntryState; - Worklist.enqueueBlock(Successor); + if (!Visited.test(AdjacentB->getBlockID()) || + NewInState != OldInState) { + InStates[AdjacentB] = NewInState; + W.enqueueBlock(AdjacentB); } } } } - Lattice getEntryState(const CFGBlock *B) const { - return BlockEntryStates.lookup(B); - } + Lattice getInState(const CFGBlock *B) const { return InStates.lookup(B); } - Lattice getExitState(const CFGBlock *B) const { - return BlockExitStates.lookup(B); - } + Lattice getOutState(const CFGBlock *B) const { return OutStates.lookup(B); } void dump() const { const Derived *D = static_cast(this); llvm::dbgs() << "==========================================\n"; llvm::dbgs() << D->getAnalysisName() << " results:\n"; llvm::dbgs() << "==========================================\n"; - const CFGBlock &B = Cfg.getExit(); - getExitState(&B).dump(llvm::dbgs()); + const CFGBlock &B = isForward() ? Cfg.getExit() : Cfg.getEntry(); + getOutState(&B).dump(llvm::dbgs()); } -private: - /// Computes the exit state of a block by applying all its facts sequentially - /// to a given entry state. + /// Computes the state at one end of a block by applying all its facts + /// sequentially to a given state from the other end. /// TODO: We might need to store intermediate states per-fact in the block for /// later analysis. 
- Lattice transferBlock(const CFGBlock *Block, Lattice EntryState) { - Lattice BlockState = EntryState; - for (const Fact *F : AllFacts.getFacts(Block)) { - BlockState = transferFact(BlockState, F); - } - return BlockState; + Lattice transferBlock(const CFGBlock *Block, Lattice State) { + auto Facts = AllFacts.getFacts(Block); + if constexpr (isForward()) + for (const Fact *F : Facts) + State = transferFact(State, F); + else + for (const Fact *F : llvm::reverse(Facts)) + State = transferFact(State, F); + return State; } Lattice transferFact(Lattice In, const Fact *F) { - Derived *d = static_cast(this); + assert(F); + Derived *D = static_cast(this); switch (F->getKind()) { case Fact::Kind::Issue: - return d->transfer(In, *F->getAs()); + return D->transfer(In, *F->getAs()); case Fact::Kind::Expire: - return d->transfer(In, *F->getAs()); + return D->transfer(In, *F->getAs()); case Fact::Kind::AssignOrigin: - return d->transfer(In, *F->getAs()); + return D->transfer(In, *F->getAs()); case Fact::Kind::ReturnOfOrigin: - return d->transfer(In, *F->getAs()); + return D->transfer(In, *F->getAs()); } llvm_unreachable("Unknown fact kind"); } @@ -715,7 +724,8 @@ struct LoanPropagationLattice { /// The analysis that tracks which loans belong to which origins. 
class LoanPropagationAnalysis - : public DataflowAnalysis { + : public DataflowAnalysis { LifetimeFactory &Factory; @@ -724,7 +734,7 @@ class LoanPropagationAnalysis LifetimeFactory &Factory) : DataflowAnalysis(C, AC, F), Factory(Factory) {} - using DataflowAnalysis::transfer; + using Base::transfer; StringRef getAnalysisName() const { return "LoanPropagation"; } From 01ab4238a874450a74a52c5c40a008de8a0e9a83 Mon Sep 17 00:00:00 2001 From: Thomas Fransham Date: Wed, 16 Jul 2025 15:26:26 +0100 Subject: [PATCH 043/813] [JITLink][AArch32] Add explicit visibility macros to functions needed by unittests (#116557) Avoid missing symbol errors when building JITLinkTests for windows shared library builds when explicit symbol visibility macros are enabled. This is part of the work to enable LLVM_BUILD_LLVM_DYLIB and LLVM plugins on Windows. --- .../ExecutionEngine/JITLink/AArch32Tests.cpp | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp index b1890d884d173..dfabb4ab76180 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp @@ -8,6 +8,7 @@ #include #include +#include #include "gtest/gtest.h" @@ -96,21 +97,21 @@ namespace llvm { namespace jitlink { namespace aarch32 { -HalfWords encodeImmBT4BlT1BlxT2(int64_t Value); -HalfWords encodeImmBT4BlT1BlxT2_J1J2(int64_t Value); -uint32_t encodeImmBA1BlA1BlxA2(int64_t Value); -HalfWords encodeImmMovtT1MovwT3(uint16_t Value); -HalfWords encodeRegMovtT1MovwT3(int64_t Value); -uint32_t encodeImmMovtA1MovwA2(uint16_t Value); -uint32_t encodeRegMovtA1MovwA2(int64_t Value); - -int64_t decodeImmBT4BlT1BlxT2(uint32_t Hi, uint32_t Lo); -int64_t decodeImmBT4BlT1BlxT2_J1J2(uint32_t Hi, uint32_t Lo); -int64_t decodeImmBA1BlA1BlxA2(int64_t Value); -uint16_t decodeImmMovtT1MovwT3(uint32_t Hi, uint32_t Lo); -int64_t 
decodeRegMovtT1MovwT3(uint32_t Hi, uint32_t Lo); -uint16_t decodeImmMovtA1MovwA2(uint64_t Value); -int64_t decodeRegMovtA1MovwA2(uint64_t Value); +LLVM_ABI HalfWords encodeImmBT4BlT1BlxT2(int64_t Value); +LLVM_ABI HalfWords encodeImmBT4BlT1BlxT2_J1J2(int64_t Value); +LLVM_ABI uint32_t encodeImmBA1BlA1BlxA2(int64_t Value); +LLVM_ABI HalfWords encodeImmMovtT1MovwT3(uint16_t Value); +LLVM_ABI HalfWords encodeRegMovtT1MovwT3(int64_t Value); +LLVM_ABI uint32_t encodeImmMovtA1MovwA2(uint16_t Value); +LLVM_ABI uint32_t encodeRegMovtA1MovwA2(int64_t Value); + +LLVM_ABI int64_t decodeImmBT4BlT1BlxT2(uint32_t Hi, uint32_t Lo); +LLVM_ABI int64_t decodeImmBT4BlT1BlxT2_J1J2(uint32_t Hi, uint32_t Lo); +LLVM_ABI int64_t decodeImmBA1BlA1BlxA2(int64_t Value); +LLVM_ABI uint16_t decodeImmMovtT1MovwT3(uint32_t Hi, uint32_t Lo); +LLVM_ABI int64_t decodeRegMovtT1MovwT3(uint32_t Hi, uint32_t Lo); +LLVM_ABI uint16_t decodeImmMovtA1MovwA2(uint64_t Value); +LLVM_ABI int64_t decodeRegMovtA1MovwA2(uint64_t Value); } // namespace aarch32 } // namespace jitlink From 10d4652144fb5dd93b1996c6805ba805574e2cd7 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 16 Jul 2025 15:27:41 +0100 Subject: [PATCH 044/813] [HashRecognize] Track visited in ValueEvolution (#147812) Require that all Instructions in the Loop are visited by ValueEvolution, as any stray instructions would complicate life for the optimization. 
--- llvm/lib/Analysis/HashRecognize.cpp | 42 +++++++++++--- .../HashRecognize/cyclic-redundancy-check.ll | 58 ++++++++++++++++++- 2 files changed, 90 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Analysis/HashRecognize.cpp b/llvm/lib/Analysis/HashRecognize.cpp index 2cc3ad5f18482..92c9e37dbb484 100644 --- a/llvm/lib/Analysis/HashRecognize.cpp +++ b/llvm/lib/Analysis/HashRecognize.cpp @@ -102,8 +102,8 @@ class ValueEvolution { public: // ValueEvolution is meant to be constructed with the TripCount of the loop, - // and whether the polynomial algorithm is big-endian, for the significant-bit - // check. + // and a boolean indicating whether the polynomial algorithm is big-endian + // (for the significant-bit check). ValueEvolution(unsigned TripCount, bool ByteOrderSwapped); // Given a list of PHI nodes along with their incoming value from within the @@ -115,6 +115,10 @@ class ValueEvolution { // precise error message. StringRef getError() const { return ErrStr; } + // A set of Instructions visited by ValueEvolution. The only unvisited + // instructions will be ones not on the use-def chain of the PHIs' evolutions. + SmallPtrSet Visited; + // The computed KnownBits for each PHI node, which is populated after // computeEvolutions is called. KnownPhiMap KnownPhis; @@ -177,6 +181,9 @@ KnownBits ValueEvolution::computeBinOp(const BinaryOperator *I) { KnownBits ValueEvolution::computeInstr(const Instruction *I) { unsigned BitWidth = I->getType()->getScalarSizeInBits(); + // computeInstr is the only entry-point that needs to update the Visited set. + Visited.insert(I); + // We look up in the map that contains the KnownBits of the PHI from the // previous iteration. if (const PHINode *P = dyn_cast(I)) @@ -185,9 +192,12 @@ KnownBits ValueEvolution::computeInstr(const Instruction *I) { // Compute the KnownBits for a Select(Cmp()), forcing it to take the branch // that is predicated on the (least|most)-significant-bit check. 
CmpPredicate Pred; - Value *L, *R, *TV, *FV; - if (match(I, m_Select(m_ICmp(Pred, m_Value(L), m_Value(R)), m_Value(TV), - m_Value(FV)))) { + Value *L, *R; + Instruction *TV, *FV; + if (match(I, m_Select(m_ICmp(Pred, m_Value(L), m_Value(R)), m_Instruction(TV), + m_Instruction(FV)))) { + Visited.insert(cast(I->getOperand(0))); + // We need to check LCR against [0, 2) in the little-endian case, because // the RCR check is insufficient: it is simply [0, 1). if (!ByteOrderSwapped) { @@ -209,10 +219,17 @@ KnownBits ValueEvolution::computeInstr(const Instruction *I) { ConstantRange CheckRCR(APInt::getZero(ICmpBW), ByteOrderSwapped ? APInt::getSignedMinValue(ICmpBW) : APInt(ICmpBW, 1)); - if (AllowedR == CheckRCR) + + // We only compute KnownBits of either TV or FV, as the other value would + // just be a bit-shift as checked by isBigEndianBitShift. + if (AllowedR == CheckRCR) { + Visited.insert(FV); return compute(TV); - if (AllowedR.inverse() == CheckRCR) + } + if (AllowedR.inverse() == CheckRCR) { + Visited.insert(TV); return compute(FV); + } ErrStr = "Bad RHS of significant-bit-check"; return {BitWidth}; @@ -634,6 +651,17 @@ HashRecognize::recognizeCRC() const { return VE.getError(); KnownBits ResultBits = VE.KnownPhis.at(ConditionalRecurrence.Phi); + // There must be exactly four unvisited instructions, corresponding to the + // IndVar PHI. Any other unvisited instructions from the KnownBits propagation + // can complicate the optimization, which replaces the entire loop with the + // table-lookup version of the hash algorithm. 
+ std::initializer_list AugmentVisited = { + IndVar, Latch->getTerminator(), L.getLatchCmpInst(), + cast(IndVar->getIncomingValueForBlock(Latch))}; + VE.Visited.insert_range(AugmentVisited); + if (std::distance(Latch->begin(), Latch->end()) != VE.Visited.size()) + return "Found stray unvisited instructions"; + unsigned N = std::min(TC, ResultBits.getBitWidth()); auto IsZero = [](const KnownBits &K) { return K.isZero(); }; if (!checkExtractBits(ResultBits, N, IsZero, *ByteOrderSwapped)) diff --git a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll index 247a105940e6e..fe140d01e8818 100644 --- a/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll +++ b/llvm/test/Analysis/HashRecognize/cyclic-redundancy-check.ll @@ -909,10 +909,10 @@ exit: ; preds = %loop ret i16 %crc.next } -define i16 @not.crc.bad.cast(i8 %msg, i16 %checksum) { -; CHECK-LABEL: 'not.crc.bad.cast' +define i16 @not.crc.bad.endian.swapped.sb.check(i8 %msg, i16 %checksum) { +; CHECK-LABEL: 'not.crc.bad.endian.swapped.sb.check' ; CHECK-NEXT: Did not find a hash algorithm -; CHECK-NEXT: Reason: Expected bottom 8 bits zero (????????00001011) +; CHECK-NEXT: Reason: Found stray unvisited instructions ; entry: br label %loop @@ -1189,3 +1189,55 @@ loop: ; preds = %loop, %entry exit: ; preds = %loop ret i16 %crc.next } + +define i16 @not.crc.stray.unvisited.call(i16 %crc.init) { +; CHECK-LABEL: 'not.crc.stray.unvisited.call' +; CHECK-NEXT: Did not find a hash algorithm +; CHECK-NEXT: Reason: Found stray unvisited instructions +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ] + %crc.shl = shl i16 %crc, 1 + %crc.xor = xor i16 %crc.shl, 4129 + %check.sb = icmp slt i16 %crc, 0 + %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl + call void @print(i16 %crc.next) + %iv.next = add nuw nsw i32 %iv, 1 + 
%exit.cond = icmp samesign ult i32 %iv, 7 + br i1 %exit.cond, label %loop, label %exit + +exit: ; preds = %loop + ret i16 %crc.next +} + +declare void @print(i16) + +define i16 @not.crc.call.sb.check(i16 %crc.init) { +; CHECK-LABEL: 'not.crc.call.sb.check' +; CHECK-NEXT: Did not find a hash algorithm +; CHECK-NEXT: Reason: Found stray unvisited instructions +; +entry: + br label %loop + +loop: ; preds = %loop, %entry + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] + %crc = phi i16 [ %crc.init, %entry ], [ %crc.next, %loop ] + %crc.shl = shl i16 %crc, 1 + %crc.xor = xor i16 %crc.shl, 4129 + %call = call i16 @side.effect() + %check.sb = icmp slt i16 %call, 0 + %crc.next = select i1 %check.sb, i16 %crc.xor, i16 %crc.shl + %iv.next = add nuw nsw i32 %iv, 1 + %exit.cond = icmp samesign ult i32 %iv, 7 + br i1 %exit.cond, label %loop, label %exit + +exit: ; preds = %loop + ret i16 %crc.next +} + +declare i16 @side.effect() From 8ef1a0ec1fd5663a1d85f9cc84bf6d86eb46980d Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 16 Jul 2025 15:27:58 +0100 Subject: [PATCH 045/813] [HashRecognize] Track visited in ValueEvolution (#147812) Require that all Instructions in the Loop are visited by ValueEvolution, as any stray instructions would complicate life for the optimization. From 584158f9aeac7df66ca08649b8c082883c66b360 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 16 Jul 2025 15:30:53 +0100 Subject: [PATCH 046/813] [LAA] Hoist check for SCEV-uncomputable dist (NFC) (#148841) Hoist the check for SCEVCouldNotCompute distance into getDependenceDistanceAndSize. 
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index f8f741575f87a..f3a32d3055edb 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2085,6 +2085,12 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( if (!isa(Dist)) FoundNonConstantDistanceDependence |= StrideAPtrInt == StrideBPtrInt; + // If distance is a SCEVCouldNotCompute, return Unknown immediately. + if (isa(Dist)) { + LLVM_DEBUG(dbgs() << "LAA: Uncomputable distance.\n"); + return Dependence::Unknown; + } + return DepDistanceStrideAndSizeInfo(Dist, MaxStride, CommonStride, TypeByteSize, AIsWrite, BIsWrite); } @@ -2122,13 +2128,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, std::get(Res); bool HasSameSize = TypeByteSize > 0; - if (isa(Dist)) { - if (CheckCompletelyBeforeOrAfter()) - return Dependence::NoDep; - LLVM_DEBUG(dbgs() << "LAA: Dependence because of uncomputable distance.\n"); - return Dependence::Unknown; - } - ScalarEvolution &SE = *PSE.getSE(); auto &DL = InnermostLoop->getHeader()->getDataLayout(); From f995bc802d06448a41846652d5e9a9f30f80a688 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 16 Jul 2025 15:41:51 +0100 Subject: [PATCH 047/813] [lldb] Improve setting of program for filtering disassembly (#148823) This changes the example command added in https://github.com/llvm/llvm-project/pull/145793 so that the fdis program does not have to be a single program name. Doing so also means we can run the test on Windows where the program needs to be "python.exe script_name". I've changed "fdis set" to treat the rest of the command as the program. Then store that as a list to be passed to subprocess. If we just use a string, Python will think that "python.exe foo" is the name of an actual program instead of a program and an argument to it. 
This will still break if the paths have spaces in, but I'm trying to do just enough to fix the test here without rewriting all the option handling. --- lldb/examples/python/filter_disasm.py | 26 ++++++++++++++----- .../command-disassemble-riscv32-bytes.s | 5 ++-- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/lldb/examples/python/filter_disasm.py b/lldb/examples/python/filter_disasm.py index de99d4031a7fd..46c9f794b25a2 100644 --- a/lldb/examples/python/filter_disasm.py +++ b/lldb/examples/python/filter_disasm.py @@ -11,8 +11,13 @@ import lldb import subprocess -filter_program = "crustfilt" +class Program(list): + def __str__(self): + return " ".join(self) + + +filter_program = Program(["crustfilt"]) def __lldb_init_module(debugger, dict): debugger.HandleCommand("command script add -f filter_disasm.fdis fdis") @@ -51,13 +56,20 @@ def fdis(debugger, args, exe_ctx, result, dict): result.Clear() if len(args_list) == 1 and args_list[0] == "get": - result.PutCString(filter_program) + result.PutCString(str(filter_program)) result.SetStatus(lldb.eReturnStatusSuccessFinishResult) return - if len(args_list) == 2 and args_list[0] == "set": - filter_program = args_list[1] - result.PutCString("Filter program set to %s" % filter_program) + if args_list[0] == "set": + # Assume the rest is a program to run and any arguments to be passed to + # it. 
+ if len(args_list) <= 1: + result.PutCString('"set" command requires a program argument') + result.SetStatus(lldb.eReturnStatusFailed) + return + + filter_program = Program(args_list[1:]) + result.PutCString('Filter program set to "{}"'.format(filter_program)) result.SetStatus(lldb.eReturnStatusSuccessFinishResult) return @@ -70,7 +82,9 @@ def fdis(debugger, args, exe_ctx, result, dict): output = res.GetOutput() try: - proc = subprocess.run([filter_program], capture_output=True, text=True, input=output) + proc = subprocess.run( + filter_program, capture_output=True, text=True, input=output + ) except (subprocess.SubprocessError, OSError) as e: result.PutCString("Error occurred. Original disassembly:\n\n" + output) result.SetError(str(e)) diff --git a/lldb/test/Shell/Commands/command-disassemble-riscv32-bytes.s b/lldb/test/Shell/Commands/command-disassemble-riscv32-bytes.s index bd40baf2643a0..78be614e3af15 100644 --- a/lldb/test/Shell/Commands/command-disassemble-riscv32-bytes.s +++ b/lldb/test/Shell/Commands/command-disassemble-riscv32-bytes.s @@ -1,6 +1,5 @@ # REQUIRES: riscv -# Unsupported until we fix launching the filter program on Windows. 
-# UNSUPPORTED: system-windows +# REQUIRES: python # This test verifies that disassemble -b prints out the correct bytes and # format for standard and unknown riscv instructions of various sizes, @@ -11,7 +10,7 @@ # RUN: llvm-mc -filetype=obj -mattr=+c --triple=riscv32-unknown-unknown %s -o %t # RUN: %lldb -b %t "-o" "disassemble -b -n main" | FileCheck %s -# RUN: %lldb -b %t -o "command script import %S/../../../examples/python/filter_disasm.py" -o "fdis set %S/Inputs/dis_filt.py" -o "fdis -n main" | FileCheck --check-prefix=FILTER %s +# RUN: %lldb -b %t -o "command script import %S/../../../examples/python/filter_disasm.py" -o "fdis set %python %S/Inputs/dis_filt.py" -o "fdis -n main" | FileCheck --check-prefix=FILTER %s main: addi sp, sp, -0x20 # 16 bit standard instruction From 616e4c43dd196450b11376971966d71e501c26b8 Mon Sep 17 00:00:00 2001 From: Martin Erhart Date: Wed, 16 Jul 2025 15:45:15 +0100 Subject: [PATCH 048/813] [mlir] Add Python bindings to enable default passmanager timing (#149087) --- mlir/include/mlir-c/Pass.h | 4 ++++ mlir/lib/Bindings/Python/Pass.cpp | 6 ++++++ mlir/lib/CAPI/IR/Pass.cpp | 4 ++++ 3 files changed, 14 insertions(+) diff --git a/mlir/include/mlir-c/Pass.h b/mlir/include/mlir-c/Pass.h index 8fd8e9956a65a..0d2e19ee7fb0a 100644 --- a/mlir/include/mlir-c/Pass.h +++ b/mlir/include/mlir-c/Pass.h @@ -88,6 +88,10 @@ MLIR_CAPI_EXPORTED void mlirPassManagerEnableIRPrinting( MLIR_CAPI_EXPORTED void mlirPassManagerEnableVerifier(MlirPassManager passManager, bool enable); +/// Enable pass timing. +MLIR_CAPI_EXPORTED void +mlirPassManagerEnableTiming(MlirPassManager passManager); + /// Nest an OpPassManager under the top-level PassManager, the nested /// passmanager will only run on operations matching the provided name. /// The returned OpPassManager will be destroyed when the parent is destroyed. 
diff --git a/mlir/lib/Bindings/Python/Pass.cpp b/mlir/lib/Bindings/Python/Pass.cpp index 858c3bd5745fe..8d84864b9db4d 100644 --- a/mlir/lib/Bindings/Python/Pass.cpp +++ b/mlir/lib/Bindings/Python/Pass.cpp @@ -112,6 +112,12 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) { mlirPassManagerEnableVerifier(passManager.get(), enable); }, "enable"_a, "Enable / disable verify-each.") + .def( + "enable_timing", + [](PyPassManager &passManager) { + mlirPassManagerEnableTiming(passManager.get()); + }, + "Enable pass timing.") .def_static( "parse", [](const std::string &pipeline, DefaultingPyMlirContext context) { diff --git a/mlir/lib/CAPI/IR/Pass.cpp b/mlir/lib/CAPI/IR/Pass.cpp index 883b7e8bb832d..3c499c3e4974d 100644 --- a/mlir/lib/CAPI/IR/Pass.cpp +++ b/mlir/lib/CAPI/IR/Pass.cpp @@ -75,6 +75,10 @@ void mlirPassManagerEnableVerifier(MlirPassManager passManager, bool enable) { unwrap(passManager)->enableVerifier(enable); } +void mlirPassManagerEnableTiming(MlirPassManager passManager) { + unwrap(passManager)->enableTiming(); +} + MlirOpPassManager mlirPassManagerGetNestedUnder(MlirPassManager passManager, MlirStringRef operationName) { return wrap(&unwrap(passManager)->nest(unwrap(operationName))); From 7eb65f470c5e5c6e4b3320c8bbe0eaf7705d8c4e Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 16 Jul 2025 15:37:07 +0100 Subject: [PATCH 049/813] [DebugInfo] Delete a now-unused function after 5328c732a4770 --- llvm/lib/Transforms/Utils/Local.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 72bc09431e9cb..b14bbeac97675 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1664,14 +1664,6 @@ static void insertDbgValueOrDbgVariableRecord(DIBuilder &Builder, Value *DV, Instr->getParent()->insertDbgRecordBefore(DVRec, Instr); } -static void insertDbgValueOrDbgVariableRecordAfter( - DIBuilder &Builder, Value *DV, DILocalVariable 
*DIVar, DIExpression *DIExpr, - const DebugLoc &NewLoc, Instruction *Instr) { - BasicBlock::iterator NextIt = std::next(Instr->getIterator()); - NextIt.setHeadBit(true); - insertDbgValueOrDbgVariableRecord(Builder, DV, DIVar, DIExpr, NewLoc, NextIt); -} - static DIExpression *dropInitialDeref(const DIExpression *DIExpr) { int NumEltDropped = DIExpr->getElements()[0] == dwarf::DW_OP_LLVM_arg ? 3 : 1; return DIExpression::get(DIExpr->getContext(), From 7d0a81c508f6268a85e4ad59e4fb78cde0d2e5ba Mon Sep 17 00:00:00 2001 From: Augie Fackler Date: Wed, 16 Jul 2025 10:47:12 -0400 Subject: [PATCH 050/813] [bazel] update mlir BUILD files for fc114e4d931ae25f74a15e42371dbead1387ad51 --- .../llvm-project-overlay/mlir/BUILD.bazel | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index f5f0d92685e0c..719b3da1822e2 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3961,6 +3961,7 @@ cc_library( ":ComplexToSPIRV", ":ComplexToStandard", ":ControlFlowToLLVM", + ":ComplexToROCDLLibraryCalls", ":ControlFlowToSCF", ":ControlFlowToSPIRV", ":ConversionPassIncGen", @@ -11900,6 +11901,25 @@ cc_library( ], ) +cc_library( + name = "ComplexToROCDLLibraryCalls", + srcs = glob([ + "lib/Conversion/ComplexToROCDLLibraryCalls/*.cpp", + ]), + hdrs = glob([ + "include/mlir/Conversion/ComplexToROCDLLibraryCalls/*.h", + ]), + includes = ["include"], + deps = [ + ":ComplexDialect", + ":ConversionPassIncGen", + ":FuncDialect", + ":IR", + ":Pass", + ":TransformUtils", + ], +) + cc_library( name = "ComplexToSPIRV", srcs = glob([ From d9190f8141661bd6120dea61d28ae8940fd775d0 Mon Sep 17 00:00:00 2001 From: Augie Fackler Date: Wed, 16 Jul 2025 10:48:45 -0400 Subject: [PATCH 051/813] [bazel] run buildifier on mlir/BUILD.bazel This reorders a couple of deps lists, but seems worth doing. 
--- .../bazel/llvm-project-overlay/mlir/BUILD.bazel | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 719b3da1822e2..683885e1d4123 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3958,10 +3958,10 @@ cc_library( ":BufferizationToMemRef", ":ComplexToLLVM", ":ComplexToLibm", + ":ComplexToROCDLLibraryCalls", ":ComplexToSPIRV", ":ComplexToStandard", ":ControlFlowToLLVM", - ":ComplexToROCDLLibraryCalls", ":ControlFlowToSCF", ":ControlFlowToSPIRV", ":ConversionPassIncGen", @@ -13677,17 +13677,17 @@ cc_library( ]), includes = ["include"], deps = [ - ":XeVMDialect", ":ConversionPassIncGen", - ":ConvertToLLVMInterface", - ":GPUDialect", + ":ConvertToLLVMInterface", + ":GPUDialect", ":IR", - ":LLVMCommonConversion", - ":LLVMDialect", + ":LLVMCommonConversion", + ":LLVMDialect", ":Pass", - ":Support", + ":Support", ":TransformUtils", ":VectorDialect", - "//llvm:Support", + ":XeVMDialect", + "//llvm:Support", ], ) From a13712ed88cf6fc37d3789d4c3b54ffdd1a05a1d Mon Sep 17 00:00:00 2001 From: royitaqi Date: Wed, 16 Jul 2025 07:57:13 -0700 Subject: [PATCH 052/813] [lldb] Fix a crash in lldb-server during RemoveSoftwareBreakpoint() (#148738) # Lldb-server crash We have seen stacks like the following in lldb-server core dumps: ``` [ "__GI___pthread_kill at pthread_kill.c:46", "__GI_raise at raise.c:26", "__GI_abort at abort.c:100", "__assert_fail_base at assert.c:92", "__GI___assert_fail at assert.c:101", "lldb_private::NativeProcessProtocol::RemoveSoftwareBreakpoint(unsigned long) at /redacted/lldb-server:0" ] ``` # Hypothesis of root cause In `NativeProcessProtocol::RemoveSoftwareBreakpoint()` ([code](https://github.com/llvm/llvm-project/blob/19b2dd9d798c124406b0124a1b8debb711675281/lldb/source/Host/common/NativeProcessProtocol.cpp#L359-L423)), a `ref_count` is asserted 
and reduced. If it becomes zero, the code first goes through a series of memory reads and writes to remove the breakpoint trap opcode and to restore the original process code, then, if everything goes fine, removes the entry from the map `m_software_breakpoints` at the end of the function. However, if any of the validations for the above reads and writes goes wrong, the code returns an error early, skipping the removal of the entry. This leaves the entry behind with a `ref_count` of zero. The next call to `NativeProcessProtocol::RemoveSoftwareBreakpoint()` for the same breakpoint[*] would violate the assertion about `ref_count > 0` ([here](https://github.com/llvm/llvm-project/blob/19b2dd9d798c124406b0124a1b8debb711675281/lldb/source/Host/common/NativeProcessProtocol.cpp#L365)), which would cause a crash. [*] We haven't found a *regular* way to repro such a next call in lldb or lldb-dap. This is because both of them remove the breakpoint from their internal list when they get any response from the lldb-server (OK or error). Asking the client to delete the breakpoint a second time doesn't trigger the client to send the `$z` gdb packet to lldb-server. We are able to trigger the crash by sending the `$z` packet directly, see "Manual test" below. # Fix Lift the removal of the map entry to be immediately after the decrement of `ref_count`, before the early returns. This ensures that the asserted case will never happen. The validation errors can still happen, and whether they happen or not, the breakpoint has been removed from the perspective of the lldb-server (same as that of lldb and lldb-dap). # Manual test & unit test See PR. 
--- .../Host/common/NativeProcessProtocol.cpp | 18 ++-- .../Host/NativeProcessProtocolTest.cpp | 93 ++++++++++++++++++- 2 files changed, 104 insertions(+), 7 deletions(-) diff --git a/lldb/source/Host/common/NativeProcessProtocol.cpp b/lldb/source/Host/common/NativeProcessProtocol.cpp index 405acbb5662d6..196f54b93538d 100644 --- a/lldb/source/Host/common/NativeProcessProtocol.cpp +++ b/lldb/source/Host/common/NativeProcessProtocol.cpp @@ -366,12 +366,19 @@ Status NativeProcessProtocol::RemoveSoftwareBreakpoint(lldb::addr_t addr) { if (--it->second.ref_count > 0) return Status(); + // Remove the entry from m_software_breakpoints rightaway, so that we don't + // leave behind an entry with ref_count == 0 in case one of the following + // conditions returns an error. The breakpoint is moved so that it can be + // accessed below. + SoftwareBreakpoint bkpt = std::move(it->second); + m_software_breakpoints.erase(it); + // This is the last reference. Let's remove the breakpoint. Status error; // Clear a software breakpoint instruction - llvm::SmallVector curr_break_op( - it->second.breakpoint_opcodes.size(), 0); + llvm::SmallVector curr_break_op(bkpt.breakpoint_opcodes.size(), + 0); // Read the breakpoint opcode size_t bytes_read = 0; @@ -382,10 +389,10 @@ Status NativeProcessProtocol::RemoveSoftwareBreakpoint(lldb::addr_t addr) { "addr=0x%" PRIx64 ": tried to read %zu bytes but only read %zu", addr, curr_break_op.size(), bytes_read); } - const auto &saved = it->second.saved_opcodes; + const auto &saved = bkpt.saved_opcodes; // Make sure the breakpoint opcode exists at this address - if (llvm::ArrayRef(curr_break_op) != it->second.breakpoint_opcodes) { - if (curr_break_op != it->second.saved_opcodes) + if (llvm::ArrayRef(curr_break_op) != bkpt.breakpoint_opcodes) { + if (curr_break_op != bkpt.saved_opcodes) return Status::FromErrorString( "Original breakpoint trap is no longer in memory."); LLDB_LOG(log, @@ -418,7 +425,6 @@ Status 
NativeProcessProtocol::RemoveSoftwareBreakpoint(lldb::addr_t addr) { llvm::make_range(saved.begin(), saved.end())); } - m_software_breakpoints.erase(it); return Status(); } diff --git a/lldb/unittests/Host/NativeProcessProtocolTest.cpp b/lldb/unittests/Host/NativeProcessProtocolTest.cpp index a48e67c9213da..91c4fd69d6e54 100644 --- a/lldb/unittests/Host/NativeProcessProtocolTest.cpp +++ b/lldb/unittests/Host/NativeProcessProtocolTest.cpp @@ -73,6 +73,97 @@ TEST(NativeProcessProtocolTest, SetBreakpointFailVerify) { llvm::Failed()); } +TEST(NativeProcessProtocolTest, RemoveSoftwareBreakpoint) { + NiceMock DummyDelegate; + MockProcess Process(DummyDelegate, + ArchSpec("x86_64-pc-linux")); + auto Trap = cantFail(Process.GetSoftwareBreakpointTrapOpcode(1)); + auto Original = std::vector{0xbb}; + + // Set up a breakpoint. + { + InSequence S; + EXPECT_CALL(Process, ReadMemory(0x47, 1)) + .WillOnce(Return(ByMove(Original))); + EXPECT_CALL(Process, WriteMemory(0x47, Trap)).WillOnce(Return(ByMove(1))); + EXPECT_CALL(Process, ReadMemory(0x47, 1)).WillOnce(Return(ByMove(Trap))); + EXPECT_THAT_ERROR(Process.SetBreakpoint(0x47, 0, false).ToError(), + llvm::Succeeded()); + } + + // Remove the breakpoint for the first time. This should remove the breakpoint + // from m_software_breakpoints. + // + // Should succeed. + { + InSequence S; + EXPECT_CALL(Process, ReadMemory(0x47, 1)).WillOnce(Return(ByMove(Trap))); + EXPECT_CALL(Process, WriteMemory(0x47, llvm::ArrayRef(Original))) + .WillOnce(Return(ByMove(1))); + EXPECT_CALL(Process, ReadMemory(0x47, 1)) + .WillOnce(Return(ByMove(Original))); + EXPECT_THAT_ERROR(Process.RemoveBreakpoint(0x47, false).ToError(), + llvm::Succeeded()); + } + + // Remove the breakpoint for the second time. + // + // Should fail. None of the ReadMemory() or WriteMemory() should be called, + // because the function should early return when seeing that the breakpoint + // isn't in m_software_breakpoints. 
+ { + EXPECT_CALL(Process, ReadMemory(_, _)).Times(0); + EXPECT_CALL(Process, WriteMemory(_, _)).Times(0); + EXPECT_THAT_ERROR(Process.RemoveBreakpoint(0x47, false).ToError(), + llvm::Failed()); + } +} + +TEST(NativeProcessProtocolTest, RemoveSoftwareBreakpointMemoryError) { + NiceMock DummyDelegate; + MockProcess Process(DummyDelegate, + ArchSpec("x86_64-pc-linux")); + auto Trap = cantFail(Process.GetSoftwareBreakpointTrapOpcode(1)); + auto Original = std::vector{0xbb}; + auto SomethingElse = std::vector{0xaa}; + + // Set up a breakpoint. + { + InSequence S; + EXPECT_CALL(Process, ReadMemory(0x47, 1)) + .WillOnce(Return(ByMove(Original))); + EXPECT_CALL(Process, WriteMemory(0x47, Trap)).WillOnce(Return(ByMove(1))); + EXPECT_CALL(Process, ReadMemory(0x47, 1)).WillOnce(Return(ByMove(Trap))); + EXPECT_THAT_ERROR(Process.SetBreakpoint(0x47, 0, false).ToError(), + llvm::Succeeded()); + } + + // Remove the breakpoint for the first time, with an unexpected value read by + // the first ReadMemory(). This should cause an early return, with the + // breakpoint removed from m_software_breakpoints. + // + // Should fail. + { + InSequence S; + EXPECT_CALL(Process, ReadMemory(0x47, 1)) + .WillOnce(Return(ByMove(SomethingElse))); + EXPECT_THAT_ERROR(Process.RemoveBreakpoint(0x47, false).ToError(), + llvm::Failed()); + } + + // Remove the breakpoint for the second time. + // + // Should fail. None of the ReadMemory() or WriteMemory() should be called, + // because the function should early return when seeing that the breakpoint + // isn't in m_software_breakpoints. 
+ { + EXPECT_CALL(Process, ReadMemory(_, _)).Times(0); + EXPECT_CALL(Process, WriteMemory(_, _)).Times(0); + EXPECT_THAT_ERROR(Process.RemoveBreakpoint(0x47, false).ToError(), + llvm::Failed()); + } +} + TEST(NativeProcessProtocolTest, ReadMemoryWithoutTrap) { NiceMock DummyDelegate; MockProcess Process(DummyDelegate, @@ -146,4 +237,4 @@ TEST(NativeProcessProtocolTest, ReadCStringFromMemory_CrossPageBoundary) { bytes_read), llvm::HasValue(llvm::StringRef("hello"))); EXPECT_EQ(bytes_read, 6UL); -} \ No newline at end of file +} From b8b99d83a6d891001d4070e3cac9f029cc57059e Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 16 Jul 2025 17:13:08 +0200 Subject: [PATCH 053/813] [libc++] Simplify __hash_table further (#148375) --- libcxx/include/__hash_table | 115 +++++------------- libcxx/include/unordered_map | 8 +- .../unord/key_value_traits.pass.cpp | 60 --------- 3 files changed, 34 insertions(+), 149 deletions(-) delete mode 100644 libcxx/test/libcxx/containers/unord/key_value_traits.pass.cpp diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index 78f2f3bfd2f4c..03f50d9f3f269 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -122,6 +122,19 @@ struct __get_hash_node_value_type<__hash_value_type<_Key, _Tp> > { template using __get_hash_node_value_type_t _LIBCPP_NODEBUG = typename __get_hash_node_value_type<_Tp>::type; +template +struct __get_hash_node_key_type { + using type _LIBCPP_NODEBUG = _Tp; +}; + +template +struct __get_hash_node_key_type<__hash_value_type<_Key, _Tp> > { + using type _LIBCPP_NODEBUG = _Key; +}; + +template +using __get_hash_node_key_type_t _LIBCPP_NODEBUG = typename __get_hash_node_key_type<_Tp>::type; + template struct __hash_node : public __hash_node_base< __rebind_pointer_t<_VoidPtr, __hash_node<_Tp, _VoidPtr> > > { using __node_value_type _LIBCPP_NODEBUG = __get_hash_node_value_type_t<_Tp>; @@ -182,69 +195,11 @@ class __hash_map_iterator; template class __hash_map_const_iterator; 
-template -struct __hash_key_value_types { - static_assert(!is_reference<_Tp>::value && !is_const<_Tp>::value, ""); - typedef _Tp key_type; - typedef _Tp __node_value_type; - typedef _Tp __container_value_type; - static const bool __is_map = false; - - _LIBCPP_HIDE_FROM_ABI static key_type const& __get_key(_Tp const& __v) { return __v; } - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(__node_value_type const& __v) { return __v; } - _LIBCPP_HIDE_FROM_ABI static __container_value_type* __get_ptr(__node_value_type& __n) { return std::addressof(__n); } - _LIBCPP_HIDE_FROM_ABI static __container_value_type&& __move(__node_value_type& __v) { return std::move(__v); } -}; - -template -struct __hash_key_value_types<__hash_value_type<_Key, _Tp> > { - typedef _Key key_type; - typedef _Tp mapped_type; - typedef __hash_value_type<_Key, _Tp> __node_value_type; - typedef pair __container_value_type; - typedef __container_value_type __map_value_type; - static const bool __is_map = true; - - _LIBCPP_HIDE_FROM_ABI static key_type const& __get_key(__container_value_type const& __v) { return __v.first; } - - template , __node_value_type>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(_Up& __t) { - return __t.__get_value(); - } - - template , __container_value_type>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI static __container_value_type const& __get_value(_Up& __t) { - return __t; - } - - _LIBCPP_HIDE_FROM_ABI static __container_value_type* __get_ptr(__container_value_type& __n) { - return std::addressof(__n); - } - _LIBCPP_HIDE_FROM_ABI static pair __move(__node_value_type& __v) { return __v.__move(); } -}; - -template , bool = _KVTypes::__is_map> -struct __hash_map_pointer_types {}; - -template -struct __hash_map_pointer_types<_Tp, _AllocPtr, _KVTypes, true> { - typedef typename _KVTypes::__map_value_type _Mv; - typedef __rebind_pointer_t<_AllocPtr, _Mv> __map_value_type_pointer; - typedef 
__rebind_pointer_t<_AllocPtr, const _Mv> __const_map_value_type_pointer; -}; - template ::element_type> struct __hash_node_types; template -struct __hash_node_types<_NodePtr, __hash_node<_Tp, _VoidPtr> > - : public __hash_key_value_types<_Tp>, - __hash_map_pointer_types<_Tp, _VoidPtr> - -{ - typedef __hash_key_value_types<_Tp> __base; - -public: +struct __hash_node_types<_NodePtr, __hash_node<_Tp, _VoidPtr> > { typedef ptrdiff_t difference_type; typedef size_t size_type; @@ -617,8 +572,6 @@ public: typedef typename __alloc_traits::pointer pointer; private: - typedef __hash_node_types _NodeTypes; - allocator_type& __na_; public: @@ -633,7 +586,7 @@ public: _LIBCPP_HIDE_FROM_ABI void operator()(pointer __p) _NOEXCEPT { if (__value_constructed) { - __alloc_traits::destroy(__na_, _NodeTypes::__get_ptr(__p->__get_value())); + __alloc_traits::destroy(__na_, std::addressof(__p->__get_value())); std::__destroy_at(std::addressof(*__p)); } if (__p) @@ -684,6 +637,8 @@ template class __hash_table { public: using value_type = __get_hash_node_value_type_t<_Tp>; + using key_type = __get_hash_node_key_type_t<_Tp>; + typedef _Hash hasher; typedef _Equal key_equal; typedef _Alloc allocator_type; @@ -694,8 +649,6 @@ private: public: typedef typename _NodeTypes::__node_value_type __node_value_type; - typedef typename _NodeTypes::__container_value_type __container_value_type; - typedef typename _NodeTypes::key_type key_type; typedef value_type& reference; typedef const value_type& const_reference; typedef typename __alloc_traits::pointer pointer; @@ -824,7 +777,7 @@ public: template ::value, int> = 0> + __enable_if_t<__can_extract_map_key<_First, key_type, value_type>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI pair __emplace_unique(_First&& __f, _Second&& __s) { return __emplace_unique_key_args(__f, std::forward<_First>(__f), std::forward<_Second>(__s)); } @@ -854,9 +807,7 @@ public: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI void __insert_unique_from_orphaned_node(value_type&& 
__value) { - using __key_type = typename _NodeTypes::key_type; - - __node_holder __h = __construct_node(const_cast<__key_type&&>(__value.first), std::move(__value.second)); + __node_holder __h = __construct_node(const_cast(__value.first), std::move(__value.second)); __node_insert_unique(__h.get()); __h.release(); } @@ -870,9 +821,7 @@ public: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI void __insert_multi_from_orphaned_node(value_type&& __value) { - using __key_type = typename _NodeTypes::key_type; - - __node_holder __h = __construct_node(const_cast<__key_type&&>(__value.first), std::move(__value.second)); + __node_holder __h = __construct_node(const_cast(__value.first), std::move(__value.second)); __node_insert_multi(__h.get()); __h.release(); } @@ -1047,12 +996,10 @@ private: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI void __assign_value(__get_hash_node_value_type_t<_Tp>& __lhs, _From&& __rhs) { - using __key_type = typename _NodeTypes::key_type; - // This is technically UB, since the object was constructed as `const`. // Clang doesn't optimize on this currently though. 
- const_cast<__key_type&>(__lhs.first) = const_cast<__copy_cvref_t<_From, __key_type>&&>(__rhs.first); - __lhs.second = std::forward<_From>(__rhs).second; + const_cast(__lhs.first) = const_cast<__copy_cvref_t<_From, key_type>&&>(__rhs.first); + __lhs.second = std::forward<_From>(__rhs).second; } template ::value, int> = 0> @@ -1201,7 +1148,7 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__deallocate_node(__next_pointer while (__np != nullptr) { __next_pointer __next = __np->__next_; __node_pointer __real_np = __np->__upcast(); - __node_traits::destroy(__na, _NodeTypes::__get_ptr(__real_np->__get_value())); + __node_traits::destroy(__na, std::addressof(__real_np->__get_value())); std::__destroy_at(std::addressof(*__real_np)); __node_traits::deallocate(__na, __real_np, 1); __np = __next; @@ -1290,8 +1237,8 @@ template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_unique(_InputIterator __first, _InputIterator __last) { typedef iterator_traits<_InputIterator> _ITraits; typedef typename _ITraits::value_type _ItValueType; - static_assert(is_same<_ItValueType, __container_value_type>::value, - "__assign_unique may only be called with the containers value type"); + static_assert( + is_same<_ItValueType, value_type>::value, "__assign_unique may only be called with the containers value type"); if (bucket_count() != 0) { __next_pointer __cache = __detach(); @@ -1321,10 +1268,8 @@ template void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_multi(_InputIterator __first, _InputIterator __last) { typedef iterator_traits<_InputIterator> _ITraits; typedef typename _ITraits::value_type _ItValueType; - static_assert( - (is_same<_ItValueType, __container_value_type>::value || is_same<_ItValueType, __node_value_type>::value), - "__assign_multi may only be called with the containers value type" - " or the nodes value type"); + static_assert(is_same<_ItValueType, value_type>::value, + "__assign_multi may only be called with the containers value type or the nodes value 
type"); if (bucket_count() != 0) { __next_pointer __cache = __detach(); #if _LIBCPP_HAS_EXCEPTIONS @@ -1345,7 +1290,7 @@ void __hash_table<_Tp, _Hash, _Equal, _Alloc>::__assign_multi(_InputIterator __f __deallocate_node(__cache); } for (; __first != __last; ++__first) - __emplace_multi(_NodeTypes::__get_value(*__first)); + __emplace_multi(*__first); } template @@ -1863,7 +1808,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node(_Args&&... __args) { std::__construct_at(std::addressof(*__h), /* next = */ nullptr, /* hash = */ 0); // Now construct the value_type using the allocator's construct() method. - __node_traits::construct(__na, _NodeTypes::__get_ptr(__h->__get_value()), std::forward<_Args>(__args)...); + __node_traits::construct(__na, std::addressof(__h->__get_value()), std::forward<_Args>(__args)...); __h.get_deleter().__value_constructed = true; __h->__hash_ = hash_function()(__h->__get_value()); @@ -1879,7 +1824,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__construct_node_hash(size_t __hash, _ __node_holder __h(__node_traits::allocate(__na, 1), _Dp(__na)); std::__construct_at(std::addressof(*__h), /* next = */ nullptr, /* hash = */ __hash); __node_traits::construct( - __na, _NodeTypes::__get_ptr(__h->__get_value()), std::forward<_First>(__f), std::forward<_Rest>(__rest)...); + __na, std::addressof(__h->__get_value()), std::forward<_First>(__f), std::forward<_Rest>(__rest)...); __h.get_deleter().__value_constructed = true; return __h; } diff --git a/libcxx/include/unordered_map b/libcxx/include/unordered_map index 5b70cdeae11a5..97c2c52eba337 100644 --- a/libcxx/include/unordered_map +++ b/libcxx/include/unordered_map @@ -844,10 +844,10 @@ class __hash_map_iterator { public: typedef forward_iterator_tag iterator_category; - typedef typename _NodeTypes::__map_value_type value_type; + using value_type = typename _HashIterator::value_type; typedef typename _NodeTypes::difference_type difference_type; typedef value_type& reference; - typedef 
typename _NodeTypes::__map_value_type_pointer pointer; + using pointer = typename _HashIterator::pointer; _LIBCPP_HIDE_FROM_ABI __hash_map_iterator() _NOEXCEPT {} @@ -895,10 +895,10 @@ class __hash_map_const_iterator { public: typedef forward_iterator_tag iterator_category; - typedef typename _NodeTypes::__map_value_type value_type; + using value_type = typename _HashIterator::value_type; typedef typename _NodeTypes::difference_type difference_type; typedef const value_type& reference; - typedef typename _NodeTypes::__const_map_value_type_pointer pointer; + using pointer = typename _HashIterator::pointer; _LIBCPP_HIDE_FROM_ABI __hash_map_const_iterator() _NOEXCEPT {} diff --git a/libcxx/test/libcxx/containers/unord/key_value_traits.pass.cpp b/libcxx/test/libcxx/containers/unord/key_value_traits.pass.cpp deleted file mode 100644 index e00a028489a72..0000000000000 --- a/libcxx/test/libcxx/containers/unord/key_value_traits.pass.cpp +++ /dev/null @@ -1,60 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// XFAIL: FROZEN-CXX03-HEADERS-FIXME - -#include <__hash_table> -#include -#include -#include - -#include "test_macros.h" -#include "min_allocator.h" - -void testKeyValueTrait() { - { - typedef int Tp; - typedef std::__hash_key_value_types Traits; - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert(Traits::__is_map == false, ""); - } - { - typedef std::pair Tp; - typedef std::__hash_key_value_types Traits; - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert(Traits::__is_map == false, ""); - } - { - typedef std::pair Tp; - typedef std::__hash_key_value_types Traits; - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert(Traits::__is_map == false, ""); - } - { - typedef std::__hash_value_type Tp; - typedef std::__hash_key_value_types Traits; - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert((std::is_same::value), ""); - static_assert((std::is_same >::value), ""); - static_assert((std::is_same >::value), ""); - static_assert(Traits::__is_map == true, ""); - } -} - -int main(int, char**) { - testKeyValueTrait(); - - return 0; -} From a89e6f66722713addd3a7a60377ddbb38add4702 Mon Sep 17 00:00:00 2001 From: Steve O'Brien Date: Wed, 16 Jul 2025 11:19:54 -0400 Subject: [PATCH 054/813] Minor formatting fix in 'generate_feature_test_macro_components' (#148889) Fixes a small annoyance where generated files have a format which does not agree with the one checked during `code-formatter` in CI. For example `libcxx-generate-files` updates (among possibly others) the `*.version.compile.pass.cpp` files. 
Previously these files contained an extra newline which would fail the code format check. If you update that file manually to remove just that extra trailing newline, then `check-generated-output` will fail due to the file's contents differing from what's expected. Contains a number of changes: one actual change to the py script, and lots of resulting whitespace changes. My process for this was: * Update `generate_feature_test_macro_components`: just remove an extra newline which causes the code-format step to fail * Run `$NINJA libcxx-generate-files` to rebuild all these `.version.pass.cpp`'s * Watch this PR's CI run to ensure things pass (i.e. this didn't break things worse) --- .../support.limits.general/algorithm.version.compile.pass.cpp | 1 - .../support.limits.general/any.version.compile.pass.cpp | 1 - .../support.limits.general/array.version.compile.pass.cpp | 1 - .../support.limits.general/atomic.version.compile.pass.cpp | 1 - .../support.limits.general/barrier.version.compile.pass.cpp | 1 - .../support.limits.general/bit.version.compile.pass.cpp | 1 - .../support.limits.general/bitset.version.compile.pass.cpp | 1 - .../support.limits.general/charconv.version.compile.pass.cpp | 1 - .../support.limits.general/chrono.version.compile.pass.cpp | 1 - .../support.limits.general/cmath.version.compile.pass.cpp | 1 - .../support.limits.general/compare.version.compile.pass.cpp | 1 - .../support.limits.general/complex.version.compile.pass.cpp | 1 - .../support.limits.general/concepts.version.compile.pass.cpp | 1 - .../support.limits.general/coroutine.version.compile.pass.cpp | 1 - .../support.limits.general/cstddef.version.compile.pass.cpp | 1 - .../support.limits.general/cstdlib.version.compile.pass.cpp | 1 - .../support.limits.general/cstring.version.compile.pass.cpp | 1 - .../support.limits.general/deque.version.compile.pass.cpp | 1 - .../support.limits.general/exception.version.compile.pass.cpp | 1 - .../support.limits.general/execution.version.compile.pass.cpp | 
1 - .../support.limits.general/expected.version.compile.pass.cpp | 1 - .../support.limits.general/filesystem.version.compile.pass.cpp | 1 - .../support.limits.general/flat_map.version.compile.pass.cpp | 1 - .../support.limits.general/flat_set.version.compile.pass.cpp | 1 - .../support.limits.general/format.version.compile.pass.cpp | 1 - .../support.limits.general/forward_list.version.compile.pass.cpp | 1 - .../support.limits.general/fstream.version.compile.pass.cpp | 1 - .../support.limits.general/functional.version.compile.pass.cpp | 1 - .../support.limits.general/iomanip.version.compile.pass.cpp | 1 - .../support.limits.general/ios.version.compile.pass.cpp | 1 - .../support.limits.general/istream.version.compile.pass.cpp | 1 - .../support.limits.general/iterator.version.compile.pass.cpp | 1 - .../support.limits.general/latch.version.compile.pass.cpp | 1 - .../support.limits.general/limits.version.compile.pass.cpp | 1 - .../support.limits.general/list.version.compile.pass.cpp | 1 - .../support.limits.general/locale.version.compile.pass.cpp | 1 - .../support.limits.general/map.version.compile.pass.cpp | 1 - .../support.limits.general/mdspan.version.compile.pass.cpp | 1 - .../support.limits.general/memory.version.compile.pass.cpp | 1 - .../memory_resource.version.compile.pass.cpp | 1 - .../support.limits.general/mutex.version.compile.pass.cpp | 1 - .../support.limits.general/new.version.compile.pass.cpp | 1 - .../support.limits.general/numbers.version.compile.pass.cpp | 1 - .../support.limits.general/numeric.version.compile.pass.cpp | 1 - .../support.limits.general/optional.version.compile.pass.cpp | 1 - .../support.limits.general/ostream.version.compile.pass.cpp | 1 - .../support.limits.general/print.version.compile.pass.cpp | 1 - .../support.limits.general/queue.version.compile.pass.cpp | 1 - .../support.limits.general/random.version.compile.pass.cpp | 1 - .../support.limits.general/ranges.version.compile.pass.cpp | 1 - 
.../support.limits.general/ratio.version.compile.pass.cpp | 1 - .../support.limits.general/regex.version.compile.pass.cpp | 1 - .../scoped_allocator.version.compile.pass.cpp | 1 - .../support.limits.general/semaphore.version.compile.pass.cpp | 1 - .../support.limits.general/set.version.compile.pass.cpp | 1 - .../support.limits.general/shared_mutex.version.compile.pass.cpp | 1 - .../source_location.version.compile.pass.cpp | 1 - .../support.limits.general/span.version.compile.pass.cpp | 1 - .../support.limits.general/sstream.version.compile.pass.cpp | 1 - .../support.limits.general/stack.version.compile.pass.cpp | 1 - .../support.limits.general/stdatomic.h.version.compile.pass.cpp | 1 - .../support.limits.general/stop_token.version.compile.pass.cpp | 1 - .../support.limits.general/string.version.compile.pass.cpp | 1 - .../support.limits.general/string_view.version.compile.pass.cpp | 1 - .../support.limits.general/syncstream.version.compile.pass.cpp | 1 - .../support.limits.general/thread.version.compile.pass.cpp | 1 - .../support.limits.general/tuple.version.compile.pass.cpp | 1 - .../support.limits.general/type_traits.version.compile.pass.cpp | 1 - .../support.limits.general/typeinfo.version.compile.pass.cpp | 1 - .../unordered_map.version.compile.pass.cpp | 1 - .../unordered_set.version.compile.pass.cpp | 1 - .../support.limits.general/utility.version.compile.pass.cpp | 1 - .../support.limits.general/variant.version.compile.pass.cpp | 1 - .../support.limits.general/vector.version.compile.pass.cpp | 1 - .../support.limits.general/version.version.compile.pass.cpp | 1 - libcxx/utils/generate_feature_test_macro_components.py | 1 - 76 files changed, 76 deletions(-) diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp index 488bc468bce79..bc479f1bcb1e0 100644 --- 
a/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/algorithm.version.compile.pass.cpp @@ -459,4 +459,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/any.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/any.version.compile.pass.cpp index 7f3d6394749b4..fe0e6bd17f94d 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/any.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/any.version.compile.pass.cpp @@ -69,4 +69,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.compile.pass.cpp index 9e50976e5cc2c..30efb61893a1b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/array.version.compile.pass.cpp @@ -171,4 +171,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp index e6145bbed5af9..3470e2b28bc40 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp @@ -420,4 +420,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git 
a/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.compile.pass.cpp index 0d025923728b7..a908c417df48b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/barrier.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/bit.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/bit.version.compile.pass.cpp index 35033419ac440..cad025eee3373 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/bit.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/bit.version.compile.pass.cpp @@ -195,4 +195,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/bitset.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/bitset.version.compile.pass.cpp index ea61d99736208..8799a1f7d14e5 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/bitset.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/bitset.version.compile.pass.cpp @@ -90,4 +90,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/charconv.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/charconv.version.compile.pass.cpp index 52b02562dc5ab..6ec3037c9ea45 100644 --- 
a/libcxx/test/std/language.support/support.limits/support.limits.general/charconv.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/charconv.version.compile.pass.cpp @@ -123,4 +123,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp index 1453938b01da0..d5d7a5da4a64d 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp @@ -108,4 +108,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/cmath.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/cmath.version.compile.pass.cpp index 507c7ab6084f8..26ebe1e3ad6b1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/cmath.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/cmath.version.compile.pass.cpp @@ -204,4 +204,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp index 56759a88a7348..907535a087de2 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/compare.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git 
a/libcxx/test/std/language.support/support.limits/support.limits.general/complex.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/complex.version.compile.pass.cpp index b5efa984b456a..9a3a644ca5d64 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/complex.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/complex.version.compile.pass.cpp @@ -105,4 +105,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.compile.pass.cpp index d9b2c43ecbd12..e4058c2348f9b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/concepts.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/coroutine.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/coroutine.version.compile.pass.cpp index b472b205f89d5..24a9eca1e2346 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/coroutine.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/coroutine.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/cstddef.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/cstddef.version.compile.pass.cpp index ccc034418cde0..bc65a7f3cae00 100644 --- 
a/libcxx/test/std/language.support/support.limits/support.limits.general/cstddef.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/cstddef.version.compile.pass.cpp @@ -69,4 +69,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/cstdlib.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/cstdlib.version.compile.pass.cpp index f250798c129ea..600fa2eb2e4f5 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/cstdlib.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/cstdlib.version.compile.pass.cpp @@ -75,4 +75,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/cstring.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/cstring.version.compile.pass.cpp index 675c918cac417..8445aa3cf0c48 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/cstring.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/cstring.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp index eff8689be9fb8..b634f3253093e 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/deque.version.compile.pass.cpp @@ -201,4 +201,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git 
a/libcxx/test/std/language.support/support.limits/support.limits.general/exception.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/exception.version.compile.pass.cpp index 60d6418c7459a..11d5735007f5b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/exception.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/exception.version.compile.pass.cpp @@ -69,4 +69,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.compile.pass.cpp index b843aab42e6eb..77a6455e23302 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/execution.version.compile.pass.cpp @@ -126,4 +126,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp index 9c7a84f145dde..74cf85ea9029f 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/expected.version.compile.pass.cpp @@ -129,4 +129,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp index 98acf8bb602ca..9c28db3bb0869 100644 --- 
a/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/filesystem.version.compile.pass.cpp @@ -179,4 +179,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp index 19e2fd79a4295..9c06eee27e0c8 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp @@ -63,4 +63,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp index d078f9bda23c9..5985bdc2d7d4f 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_set.version.compile.pass.cpp @@ -63,4 +63,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp index 3fa4334143b1a..77730f17fd9c6 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/format.version.compile.pass.cpp @@ -147,4 +147,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git 
a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp index 05f903dccafe7..d2082946597cb 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp @@ -297,4 +297,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp index ee32346d61080..f67adb0de1ded 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/fstream.version.compile.pass.cpp @@ -68,4 +68,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp index 8c0820681188d..b7b7d0334830a 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/functional.version.compile.pass.cpp @@ -579,4 +579,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp index 37deba7c9661a..4de327cbfa26b 100644 --- 
a/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/iomanip.version.compile.pass.cpp @@ -104,4 +104,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/ios.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/ios.version.compile.pass.cpp index 179c3ce066b6f..68816936c55e9 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/ios.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/ios.version.compile.pass.cpp @@ -65,4 +65,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/istream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/istream.version.compile.pass.cpp index 46238896f79c3..a1178b22776f1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/istream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/istream.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.compile.pass.cpp index 75dcb18a5428c..e9805ed4b1542 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/iterator.version.compile.pass.cpp @@ -315,4 +315,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git 
a/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.compile.pass.cpp index 6857c54460650..8e105648becef 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/latch.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/limits.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/limits.version.compile.pass.cpp index 0b3d6f5d2bd9c..f4cc8db0f54cb 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/limits.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/limits.version.compile.pass.cpp @@ -84,4 +84,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp index d10c61c0e9cf4..1407d74e03aa2 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/list.version.compile.pass.cpp @@ -297,4 +297,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/locale.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/locale.version.compile.pass.cpp index e1a04d1b0e087..f516881651b23 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/locale.version.compile.pass.cpp +++ 
b/libcxx/test/std/language.support/support.limits/support.limits.general/locale.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp index 4044c2b1b2e0f..3db3861c72b5c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/map.version.compile.pass.cpp @@ -396,4 +396,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp index e6b4adac20efb..fad0e5b9777dd 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/mdspan.version.compile.pass.cpp @@ -156,4 +156,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp index bf02dba0da773..f287e1ad9b3ad 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/memory.version.compile.pass.cpp @@ -678,4 +678,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/memory_resource.version.compile.pass.cpp 
b/libcxx/test/std/language.support/support.limits/support.limits.general/memory_resource.version.compile.pass.cpp index 52fc2d1854fec..dddf473f86a42 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/memory_resource.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/memory_resource.version.compile.pass.cpp @@ -144,4 +144,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp index fb3734fff10e5..5ffa5df8841c9 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/mutex.version.compile.pass.cpp @@ -95,4 +95,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/new.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/new.version.compile.pass.cpp index b1de3f7629e9d..3797e0966ec31 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/new.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/new.version.compile.pass.cpp @@ -213,4 +213,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.compile.pass.cpp index e8f109610a3e5..27170d1ea0ce7 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.compile.pass.cpp +++ 
b/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp index 687c343e34e08..cafbd2cac2ccf 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp @@ -252,4 +252,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp index 32685972d6019..148a6dbc0d3e4 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp @@ -168,4 +168,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp index de0520af18e2a..163ea5b5514e4 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/ostream.version.compile.pass.cpp @@ -128,4 +128,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp 
b/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp index 263d20ace2fd9..0382d93cb40c9 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/print.version.compile.pass.cpp @@ -77,4 +77,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp index 0ebfc6de84104..db32433ff518e 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/queue.version.compile.pass.cpp @@ -120,4 +120,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/random.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/random.version.compile.pass.cpp index d40d115443977..d0ede1168dfa1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/random.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/random.version.compile.pass.cpp @@ -99,4 +99,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp index 4cf5178dd7b8f..df19f03e7dba1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp +++ 
b/libcxx/test/std/language.support/support.limits/support.limits.general/ranges.version.compile.pass.cpp @@ -450,4 +450,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/ratio.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/ratio.version.compile.pass.cpp index 6507e1c683f24..b7c08fe0de42c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/ratio.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/ratio.version.compile.pass.cpp @@ -60,4 +60,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/regex.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/regex.version.compile.pass.cpp index d6acf35d63ab0..dc27dc91851a5 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/regex.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/regex.version.compile.pass.cpp @@ -71,4 +71,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/scoped_allocator.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/scoped_allocator.version.compile.pass.cpp index 4246f2515dc09..9dc2d8b876640 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/scoped_allocator.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/scoped_allocator.version.compile.pass.cpp @@ -69,4 +69,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.compile.pass.cpp 
b/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.compile.pass.cpp index fd0f0c51e72b2..c9cae7340e215 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/semaphore.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp index 80eae6e1fd274..5dc69f29d0ecd 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/set.version.compile.pass.cpp @@ -318,4 +318,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.compile.pass.cpp index 4392173ebbb3a..51feff2195c3d 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/shared_mutex.version.compile.pass.cpp @@ -164,4 +164,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/source_location.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/source_location.version.compile.pass.cpp index 2b326e2b37832..9495e319521c1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/source_location.version.compile.pass.cpp +++ 
b/libcxx/test/std/language.support/support.limits/support.limits.general/source_location.version.compile.pass.cpp @@ -66,4 +66,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp index 3c550e0fa676e..826471a65f691 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp @@ -120,4 +120,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/sstream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/sstream.version.compile.pass.cpp index b7650c436128e..992e31ed602e3 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/sstream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/sstream.version.compile.pass.cpp @@ -62,4 +62,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp index 1e530ccc3043d..61c5ed476228c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/stack.version.compile.pass.cpp @@ -93,4 +93,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/stdatomic.h.version.compile.pass.cpp 
b/libcxx/test/std/language.support/support.limits/support.limits.general/stdatomic.h.version.compile.pass.cpp index 113ffce2a5d12..c07d935106ea6 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/stdatomic.h.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/stdatomic.h.version.compile.pass.cpp @@ -65,4 +65,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp index ac70b0c21e018..6f6c4bbbde808 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/stop_token.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp index 40a6c07081008..7236d5d7f2aca 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/string.version.compile.pass.cpp @@ -486,4 +486,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp index bda523614106c..c7bafb0bf059c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp +++ 
b/libcxx/test/std/language.support/support.limits/support.limits.general/string_view.version.compile.pass.cpp @@ -249,4 +249,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/syncstream.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/syncstream.version.compile.pass.cpp index 0eaf9f1aff4fe..589b9ba5a75df 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/syncstream.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/syncstream.version.compile.pass.cpp @@ -86,4 +86,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/thread.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/thread.version.compile.pass.cpp index e6c44a223ee89..a2a81a619d93c 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/thread.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/thread.version.compile.pass.cpp @@ -128,4 +128,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp index b583edfc43ad0..b10441fee5eb9 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/tuple.version.compile.pass.cpp @@ -333,4 +333,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp 
b/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp index e6c0940ab7fd5..0074f3bf4cc57 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/type_traits.version.compile.pass.cpp @@ -996,4 +996,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/typeinfo.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/typeinfo.version.compile.pass.cpp index 0729b0b37ee6a..cf29080ea75b4 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/typeinfo.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/typeinfo.version.compile.pass.cpp @@ -63,4 +63,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp index 74b3c8fff69b3..221d8aaebc14b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_map.version.compile.pass.cpp @@ -390,4 +390,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp index 9c400ddd2f657..d1c1335df7c80 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp +++ 
b/libcxx/test/std/language.support/support.limits/support.limits.general/unordered_set.version.compile.pass.cpp @@ -312,4 +312,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp index 7dd3478576331..02e7febf5c5a1 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/utility.version.compile.pass.cpp @@ -492,4 +492,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp index 4a7b9f7431a81..dea2f293f4c49 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/variant.version.compile.pass.cpp @@ -135,4 +135,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp index c2513ecad8d08..e34800a89c950 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/vector.version.compile.pass.cpp @@ -270,4 +270,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp 
b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 222d562a19d63..962688e06188a 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -8154,4 +8154,3 @@ #endif // TEST_STD_VER > 23 // clang-format on - diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 18eb8a8623748..fe175fd758726 100644 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -1882,7 +1882,6 @@ def produce_tests(): {cxx_tests} // clang-format on - """.format( script_name=script_name, header=h, From cfc20ea15eea5704a79dd4e8d0eaeef4cf053268 Mon Sep 17 00:00:00 2001 From: Doug Wyatt Date: Wed, 16 Jul 2025 08:24:28 -0700 Subject: [PATCH 055/813] [Clang] FunctionEffects: Make a separate diagnostic group for redeclarations/overrides where effects are implicit. (#148690) The current function effect diagnostics include these behaviors: When you declare a function `nonblocking` (typically in a header) and then omit the attribute on the implementation (or any other redeclaration), Clang warns: attribute 'nonblocking' on function does not match previous declaration. But if a `nonblocking` function is a C++ virtual method, then overrides are implicitly nonblocking; the attribute doesn't need to be explicitly stated. These behaviors are arguably inconsistent -- and also, both, more pedantic than the rest of the function effect diagnostics. This PR accomplishes two things: - Separates the diagnostic on a redeclaration into a new group, `-Wfunction-effect-redeclarations`, so it can be disabled independently. - Adds a second diagnostic to this new group, for the case of an override method missing the attribute. 
(This helps in a situation where I'm trying to add `nonblocking` via a macro that does other things and I want to know that the macro is missing on an override declaration.) --------- Co-authored-by: Doug Wyatt Co-authored-by: Sirraide --- clang/docs/ReleaseNotes.rst | 6 +++++ clang/include/clang/Basic/DiagnosticGroups.td | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 23 ++++++++++++++----- clang/lib/Sema/SemaDeclCXX.cpp | 10 +++++++- clang/test/Sema/attr-nonblocking-sema.cpp | 22 +++++++++++------- 5 files changed, 47 insertions(+), 15 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 2f1705ba7db06..fcd3887ec7a09 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -710,6 +710,12 @@ Improvements to Clang's diagnostics pointer, provided it can be proven that the pointer only points to ``[[noreturn]]`` functions. +- Added a separate diagnostic group ``-Wfunction-effect-redeclarations``, for the more pedantic + diagnostics for function effects (``[[clang::nonblocking]]`` and ``[[clang::nonallocating]]``). + Moved the warning for a missing (though implied) attribute on a redeclaration into this group. + Added a new warning in this group for the case where the attribute is missing/implicit on + an override of a virtual method. + Improvements to Clang's time-trace ---------------------------------- diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index c28a919e35d08..ccb18aa37447e 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1293,6 +1293,7 @@ def ThreadSafetyBeta : DiagGroup<"thread-safety-beta">; // Warnings and notes related to the function effects system which underlies // the nonblocking and nonallocating attributes. 
def FunctionEffects : DiagGroup<"function-effects">; +def FunctionEffectRedeclarations : DiagGroup<"function-effect-redeclarations">; def PerfConstraintImpliesNoexcept : DiagGroup<"perf-constraint-implies-noexcept">; // Uniqueness Analysis warnings diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 2781ff81ab4cf..e94e91cbd56d9 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -11530,17 +11530,28 @@ def note_in_evaluating_default_argument : Note< def warn_invalid_add_func_effects : Warning< "attribute '%0' should not be added via type conversion">, InGroup, DefaultIgnore; -def warn_mismatched_func_effect_override : Warning< - "attribute '%0' on overriding function does not match base declaration">, - InGroup, DefaultIgnore; -def warn_mismatched_func_effect_redeclaration : Warning< - "attribute '%0' on function does not match previous declaration">, - InGroup, DefaultIgnore; +def warn_conflicting_func_effect_override + : Warning<"attribute '%0' on overriding function conflicts with base " + "declaration">, + InGroup, + DefaultIgnore; def warn_conflicting_func_effects : Warning< "effects conflict when merging declarations; kept '%0', discarded '%1'">, InGroup, DefaultIgnore; def err_func_with_effects_no_prototype : Error< "'%0' function must have a prototype">; +// These are more pedantic: in redeclarations and virtual method overrides, +// the effect attribute(s) should be restated. 
+def warn_mismatched_func_effect_override + : Warning<"overriding function is missing '%0' attribute from base " + "declaration">, + InGroup, + DefaultIgnore; +def warn_mismatched_func_effect_redeclaration + : Warning< + "redeclaration is missing '%0' attribute from previous declaration">, + InGroup, + DefaultIgnore; } // end of sema category diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index f60ab4f0da7a0..c8638420aebb5 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -18682,7 +18682,7 @@ bool Sema::CheckOverridingFunctionAttributes(CXXMethodDecl *New, case FunctionEffectDiff::OverrideResult::NoAction: break; case FunctionEffectDiff::OverrideResult::Warn: - Diag(New->getLocation(), diag::warn_mismatched_func_effect_override) + Diag(New->getLocation(), diag::warn_conflicting_func_effect_override) << Diff.effectName(); Diag(Old->getLocation(), diag::note_overridden_virtual_function) << Old->getReturnTypeSourceRange(); @@ -18695,6 +18695,14 @@ bool Sema::CheckOverridingFunctionAttributes(CXXMethodDecl *New, QualType ModQT = Context.getFunctionType(NewFT->getReturnType(), NewFT->getParamTypes(), EPI); New->setType(ModQT); + if (Errs.empty()) { + // A warning here is somewhat pedantic. Skip this if there was + // already a merge conflict, which is more serious. 
+ Diag(New->getLocation(), diag::warn_mismatched_func_effect_override) + << Diff.effectName(); + Diag(Old->getLocation(), diag::note_overridden_virtual_function) + << Old->getReturnTypeSourceRange(); + } break; } } diff --git a/clang/test/Sema/attr-nonblocking-sema.cpp b/clang/test/Sema/attr-nonblocking-sema.cpp index f13cc783dfc33..c8fb40693eec0 100644 --- a/clang/test/Sema/attr-nonblocking-sema.cpp +++ b/clang/test/Sema/attr-nonblocking-sema.cpp @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -fsyntax-only -fblocks -fcxx-exceptions -verify -Wfunction-effects %s -// RUN: %clang_cc1 -fsyntax-only -fblocks -verify -x c -std=c23 -Wfunction-effects %s +// RUN: %clang_cc1 -fsyntax-only -fblocks -fcxx-exceptions -verify -Wfunction-effects -Wfunction-effect-redeclarations %s +// RUN: %clang_cc1 -fsyntax-only -fblocks -verify -x c -std=c23 -Wfunction-effects -Wfunction-effect-redeclarations %s #if !__has_attribute(nonblocking) #error "the 'nonblocking' attribute is not available" @@ -127,29 +127,35 @@ void type_conversions_2() #endif // --- VIRTUAL METHODS --- -// Attributes propagate to overridden methods, so no diagnostics except for conflicts. +// Attributes propagate to overridden methods. // Check this in the syntax tests too. 
#ifdef __cplusplus struct Base { virtual void f1(); - virtual void nonblocking() noexcept [[clang::nonblocking]]; - virtual void nonallocating() noexcept [[clang::nonallocating]]; + virtual void nonblocking() noexcept [[clang::nonblocking]]; // expected-note {{overridden virtual function is here}} + virtual void nonallocating() noexcept [[clang::nonallocating]]; // expected-note {{overridden virtual function is here}} virtual void f2() [[clang::nonallocating]]; // expected-note {{previous declaration is here}} + virtual void f3() [[clang::nonblocking]]; // expected-note {{overridden virtual function is here}} }; struct Derived : public Base { void f1() [[clang::nonblocking]] override; - void nonblocking() noexcept override; - void nonallocating() noexcept override; + void nonblocking() noexcept override; // expected-warning {{overriding function is missing 'nonblocking' attribute from base declaration}} + void nonallocating() noexcept override; // expected-warning {{overriding function is missing 'nonallocating' attribute from base declaration}} void f2() [[clang::allocating]] override; // expected-warning {{effects conflict when merging declarations; kept 'allocating', discarded 'nonallocating'}} }; + +template +struct TDerived : public Base { + void f3() [[clang::nonblocking(B)]] override; // expected-warning {{attribute 'nonblocking' on overriding function conflicts with base declaration}} +}; #endif // __cplusplus // --- REDECLARATIONS --- void f2(); void f2() [[clang::nonblocking]]; // expected-note {{previous declaration is here}} -void f2(); // expected-warning {{attribute 'nonblocking' on function does not match previous declaration}} +void f2(); // expected-warning {{redeclaration is missing 'nonblocking' attribute from previous declaration}} // Note: we verify that the attribute is actually seen during the constraints tests. 
void f3() [[clang::blocking]]; // expected-note {{previous declaration is here}} From 372e99938f53bada5788a063e539ccc477ec3af9 Mon Sep 17 00:00:00 2001 From: Serge Pavlov Date: Wed, 16 Jul 2025 22:28:57 +0700 Subject: [PATCH 056/813] Remove unused variable (#149115) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 0cee5c87e999d..4845a9c84e01f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -14017,7 +14017,6 @@ const uint32_t ModeMask32 = ~RISCVExceptFlags::ALL; SDValue RISCVTargetLowering::lowerGET_FPMODE(SDValue Op, SelectionDAG &DAG) const { const MVT XLenVT = Subtarget.getXLenVT(); - const uint64_t ModeMaskValue = Subtarget.is64Bit() ? ModeMask64 : ModeMask32; SDLoc DL(Op); SDValue Chain = Op->getOperand(0); SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); From 33396d71e9aef624670380e11348a11b8ff74246 Mon Sep 17 00:00:00 2001 From: cmtice Date: Wed, 16 Jul 2025 08:29:33 -0700 Subject: [PATCH 057/813] [LLDB] Update release note about DIL as requested. (#149117) A post-commit review on PR #147887 requested a minor update to the formatting of the LLDB DIL implementation release note. --- llvm/docs/ReleaseNotes.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 4def8725c1bf9..68d653b9b53d6 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -315,10 +315,7 @@ Changes to LLDB [DIL](https://discourse.llvm.org/t/rfc-data-inspection-language/69893) as the default implementation for 'frame variable'. This should not change the behavior of 'frame variable' at all, at this time. 
To revert to using the - old implementation use - ``` - settings set target.experimental.use-DIL false - ``` + old implementation use: `settings set target.experimental.use-DIL false`. * Disassembly of unknown instructions now produces `` instead of nothing at all * Changed the format of opcode bytes to match llvm-objdump when disassembling From 2206c7d4afc28d78c23db48de29d600ce1e8791b Mon Sep 17 00:00:00 2001 From: jjasmine Date: Wed, 16 Jul 2025 08:35:13 -0700 Subject: [PATCH 058/813] [InstSimplify] Fold trig functions call of poison to poison (#148969) Fold trig functions call of poison to poison. This includes sin, cos, asin, acos, atan, atan2, sinh, cosh, sincos, sincospi. Test cases are fixed and also added to llvm/test/Transforms/InstSimplify/fold-intrinsics.ll just like in https://github.com/llvm/llvm-project/pull/146750 --- llvm/lib/Analysis/ValueTracking.cpp | 12 ++ .../InstSimplify/ConstProp/atan-intrinsic.ll | 3 +- .../ConstProp/sinh-cosh-intrinsics.ll | 6 +- .../InstSimplify/fold-intrinsics.ll | 199 ++++++++++++++++++ llvm/test/Transforms/InstSimplify/sincos.ll | 9 +- llvm/unittests/Analysis/ValueTrackingTest.cpp | 4 +- 6 files changed, 219 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 21f844c4d2f45..61a322be03da1 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7914,6 +7914,18 @@ bool llvm::intrinsicPropagatesPoison(Intrinsic::ID IID) { case Intrinsic::smul_fix_sat: case Intrinsic::pow: case Intrinsic::powi: + case Intrinsic::sin: + case Intrinsic::sinh: + case Intrinsic::cos: + case Intrinsic::cosh: + case Intrinsic::sincos: + case Intrinsic::sincospi: + case Intrinsic::tan: + case Intrinsic::tanh: + case Intrinsic::asin: + case Intrinsic::acos: + case Intrinsic::atan: + case Intrinsic::atan2: case Intrinsic::canonicalize: case Intrinsic::sqrt: return true; diff --git 
a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll index d824d6d35643d..3cb6290b1a808 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll @@ -35,8 +35,7 @@ define double @test_atan_neg0() { define double @test_atan_poison() { ; CHECK-LABEL: define double @test_atan_poison() { -; CHECK-NEXT: [[RES:%.*]] = call double @llvm.atan.f64(double poison) -; CHECK-NEXT: ret double [[RES]] +; CHECK-NEXT: ret double poison ; %res = call double @llvm.atan.f64(double poison) ret double %res diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll b/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll index a4f318bbc834c..96419382c7b7f 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/sinh-cosh-intrinsics.ll @@ -35,8 +35,7 @@ define double @test_sinh_neg0() { define double @test_sinh_poison() { ; CHECK-LABEL: define double @test_sinh_poison() { -; CHECK-NEXT: [[RES:%.*]] = call double @llvm.sinh.f64(double poison) -; CHECK-NEXT: ret double [[RES]] +; CHECK-NEXT: ret double poison ; %res = call double @llvm.sinh.f64(double poison) ret double %res @@ -121,8 +120,7 @@ define double @test_cosh_neg0() { define double @test_cosh_poison() { ; CHECK-LABEL: define double @test_cosh_poison() { -; CHECK-NEXT: [[RES:%.*]] = call double @llvm.cosh.f64(double poison) -; CHECK-NEXT: ret double [[RES]] +; CHECK-NEXT: ret double poison ; %res = call double @llvm.cosh.f64(double poison) ret double %res diff --git a/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll b/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll index 8578aa9fa84b3..e4cfa4673a979 100644 --- a/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll +++ b/llvm/test/Transforms/InstSimplify/fold-intrinsics.ll @@ -87,3 +87,202 @@ define void 
@pow_poison(i16 %arg_int,float %arg_flt, ptr %P) { ret void } + +define void @sin_poison(ptr %P) { +; CHECK-LABEL: @sin_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %sin_f32 = call float @llvm.sin(float poison) + store volatile float %sin_f32, ptr %P + + %sin_2xf32 = call <2 x float> @llvm.sin(<2 x float> poison) + store volatile <2 x float> %sin_2xf32, ptr %P + + %sin_4xf64 = call <4 x double> @llvm.sin(<4 x double> poison) + store volatile <4 x double> %sin_4xf64, ptr %P + + %asin_f32 = call float @llvm.asin(float poison) + store volatile float %asin_f32, ptr %P + + %asin_2xf32 = call <2 x float> @llvm.asin(<2 x float> poison) + store volatile <2 x float> %asin_2xf32, ptr %P + + %asin_4xf64 = call <4 x double> @llvm.asin(<4 x double> poison) + store volatile <4 x double> %asin_4xf64, ptr %P + + ret void +} + + +define void @cos_poison(ptr %P) { +; CHECK-LABEL: @cos_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %cos_f32 = call float @llvm.cos(float poison) + store volatile float %cos_f32, ptr %P + + %cos_2xf32 = call <2 x float> @llvm.cos(<2 x float> poison) + store volatile <2 x float> %cos_2xf32, ptr %P + + %cos_4xf64 = call <4 x double> @llvm.cos(<4 x double> poison) + 
store volatile <4 x double> %cos_4xf64, ptr %P + + %acos_f32 = call float @llvm.acos(float poison) + store volatile float %acos_f32, ptr %P + + %acos_2xf32 = call <2 x float> @llvm.acos(<2 x float> poison) + store volatile <2 x float> %acos_2xf32, ptr %P + + %acos_4xf64 = call <4 x double> @llvm.acos(<4 x double> poison) + store volatile <4 x double> %acos_4xf64, ptr %P + + ret void +} + + +define void @tan_poison(ptr %P) { +; CHECK-LABEL: @tan_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile float poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %tan_f32 = call float @llvm.tan(float poison) + store volatile float %tan_f32, ptr %P + + %tan_2xf32 = call <2 x float> @llvm.tan(<2 x float> poison) + store volatile <2 x float> %tan_2xf32, ptr %P + + %tan_4xf64 = call <4 x double> @llvm.tan(<4 x double> poison) + store volatile <4 x double> %tan_4xf64, ptr %P + + %atan_f32 = call float @llvm.atan(float poison) + store volatile float %atan_f32, ptr %P + + %atan_2xf32 = call <2 x float> @llvm.atan(<2 x float> poison) + store volatile <2 x float> %atan_2xf32, ptr %P + + %atan_4xf64 = call <4 x double> @llvm.atan(<4 x double> poison) + store volatile <4 x double> %atan_4xf64, ptr %P + + %atan2_f32 = call float @llvm.atan2(float poison, float poison) + store volatile float %atan2_f32, ptr %P + + %atan2_2xf32 = call <2 x float> @llvm.atan2(<2 x float> poison, <2 x float> poison) + store volatile <2 x float> %atan2_2xf32, ptr %P + + 
%atan2_4xf64 = call <4 x double> @llvm.atan2(<4 x double> poison, <4 x double> poison) + store volatile <4 x double> %atan2_4xf64, ptr %P + + ret void +} + + +define void @sincos_poison(ptr %P) { +; CHECK-LABEL: @sincos_poison( +; CHECK-NEXT: store volatile { float, float } poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile { <2 x float>, <2 x float> } poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile { <4 x double>, <4 x double> } poison, ptr [[P]], align 32 +; CHECK-NEXT: store volatile { float, float } poison, ptr [[P]], align 4 +; CHECK-NEXT: store volatile { <2 x float>, <2 x float> } poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile { <4 x double>, <4 x double> } poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %sincos_f32 = call { float, float } @llvm.sincos(float poison) + store volatile { float, float } %sincos_f32, ptr %P + + %sincos_2xf32 = call { <2 x float>, <2 x float> } @llvm.sincos(<2 x float> poison) + store volatile { <2 x float>, <2 x float> } %sincos_2xf32, ptr %P + + %sincos_4xf64 = call { <4 x double>, <4 x double> } @llvm.sincos(<4 x double> poison) + store volatile { <4 x double>, <4 x double> } %sincos_4xf64, ptr %P + + %sincospi_f32 = call { float, float } @llvm.sincospi(float poison) + store volatile { float, float } %sincospi_f32, ptr %P + + %sincospi_2xf32 = call { <2 x float>, <2 x float> } @llvm.sincospi(<2 x float> poison) + store volatile { <2 x float>, <2 x float> } %sincospi_2xf32, ptr %P + + %sincospi_4xf64 = call { <4 x double>, <4 x double> } @llvm.sincospi(<4 x double> poison) + store volatile { <4 x double>, <4 x double> } %sincospi_4xf64, ptr %P + + ret void +} + + +define void @sinh_poison(ptr %P) { +; CHECK-LABEL: @sinh_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %sinh_f32 = call float 
@llvm.sinh(float poison) + store volatile float %sinh_f32, ptr %P + + %sinh_2xf32 = call <2 x float> @llvm.sinh(<2 x float> poison) + store volatile <2 x float> %sinh_2xf32, ptr %P + + %sinh_4xf64 = call <4 x double> @llvm.sinh(<4 x double> poison) + store volatile <4 x double> %sinh_4xf64, ptr %P + + ret void +} + + +define void @cosh_poison(ptr %P) { +; CHECK-LABEL: @cosh_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %cosh_f32 = call float @llvm.cosh(float poison) + store volatile float %cosh_f32, ptr %P + + %cosh_2xf32 = call <2 x float> @llvm.cosh(<2 x float> poison) + store volatile <2 x float> %cosh_2xf32, ptr %P + + %cosh_4xf64 = call <4 x double> @llvm.cosh(<4 x double> poison) + store volatile <4 x double> %cosh_4xf64, ptr %P + + ret void +} + + +define void @tanh_poison(ptr %P) { +; CHECK-LABEL: @tanh_poison( +; CHECK-NEXT: store volatile float poison, ptr [[P:%.*]], align 4 +; CHECK-NEXT: store volatile <2 x float> poison, ptr [[P]], align 8 +; CHECK-NEXT: store volatile <4 x double> poison, ptr [[P]], align 32 +; CHECK-NEXT: ret void +; + %tanh_f32 = call float @llvm.tanh(float poison) + store volatile float %tanh_f32, ptr %P + + %tanh_2xf32 = call <2 x float> @llvm.tanh(<2 x float> poison) + store volatile <2 x float> %tanh_2xf32, ptr %P + + %tanh_4xf64 = call <4 x double> @llvm.tanh(<4 x double> poison) + store volatile <4 x double> %tanh_4xf64, ptr %P + + ret void +} diff --git a/llvm/test/Transforms/InstSimplify/sincos.ll b/llvm/test/Transforms/InstSimplify/sincos.ll index e0f81ee45af05..144da53c6917b 100644 --- a/llvm/test/Transforms/InstSimplify/sincos.ll +++ b/llvm/test/Transforms/InstSimplify/sincos.ll @@ -50,8 +50,7 @@ define { <2 x float>, <2 x float> } @sincos_zero_vector() { define { float, float } @sincos_poison() { ; CHECK-LABEL: define { float, 
float } @sincos_poison() { -; CHECK-NEXT: [[RET:%.*]] = call { float, float } @llvm.sincos.f32(float poison) -; CHECK-NEXT: ret { float, float } [[RET]] +; CHECK-NEXT: ret { float, float } poison ; %ret = call { float, float } @llvm.sincos.f32(float poison) ret { float, float } %ret @@ -59,8 +58,7 @@ define { float, float } @sincos_poison() { define { <2 x float>, <2 x float> } @sincos_poison_vector() { ; CHECK-LABEL: define { <2 x float>, <2 x float> } @sincos_poison_vector() { -; CHECK-NEXT: [[RET:%.*]] = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> poison) -; CHECK-NEXT: ret { <2 x float>, <2 x float> } [[RET]] +; CHECK-NEXT: ret { <2 x float>, <2 x float> } poison ; %ret = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> poison) ret { <2 x float>, <2 x float> } %ret @@ -68,8 +66,7 @@ define { <2 x float>, <2 x float> } @sincos_poison_vector() { define { , } @sincos_poison_scalable_vector() { ; CHECK-LABEL: define { , } @sincos_poison_scalable_vector() { -; CHECK-NEXT: [[RET:%.*]] = call { , } @llvm.sincos.nxv2f32( poison) -; CHECK-NEXT: ret { , } [[RET]] +; CHECK-NEXT: ret { , } poison ; %ret = call { , } @llvm.sincos.nxv2f32( poison) ret { , } %ret diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index 7a48105a1dc99..4b476551f63d9 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -912,8 +912,8 @@ TEST(ValueTracking, propagatesPoison) { {false, "call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %shamt)", 2}, {true, "call float @llvm.sqrt.f32(float %fx)", 0}, {true, "call float @llvm.powi.f32.i32(float %fx, i32 %x)", 0}, - {false, "call float @llvm.sin.f32(float %fx)", 0}, - {false, "call float @llvm.cos.f32(float %fx)", 0}, + {true, "call float @llvm.sin.f32(float %fx)", 0}, + {true, "call float @llvm.cos.f32(float %fx)", 0}, {true, "call float @llvm.pow.f32(float %fx, float %fy)", 0}, {false, "call float 
@llvm.exp.f32(float %fx)", 0}, {false, "call float @llvm.exp2.f32(float %fx)", 0}, From 9f2039755fc8a872d0a768581b5a09340a4f30be Mon Sep 17 00:00:00 2001 From: Eugene Epshteyn Date: Wed, 16 Jul 2025 11:37:14 -0400 Subject: [PATCH 059/813] [flang] Add missing symbol names to the error message (#148888) Fixes #140485 --- flang/lib/Semantics/resolve-names.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 96faa5fd954cd..b3268605e7c0c 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -8574,8 +8574,10 @@ bool ResolveNamesVisitor::Pre(const parser::ImportStmt &x) { } else { Say(name, "A distinct '%s' is already present in this scope"_err_en_US) - .Attach(symbol->name(), "Previous declaration of '%s'"_en_US) - .Attach(outer->name(), "Declaration of '%s' in host scope"_en_US); + .Attach(symbol->name(), "Previous declaration of '%s'"_en_US, + symbol->name().ToString()) + .Attach(outer->name(), "Declaration of '%s' in host scope"_en_US, + outer->name().ToString()); } } } else { From dd3d26bc8973ed18a527d61a1d8e5961060f138a Mon Sep 17 00:00:00 2001 From: Marina Taylor Date: Wed, 16 Jul 2025 16:48:59 +0100 Subject: [PATCH 060/813] Revert "[Support] Error if SocketPath is too long" (#149096) Reverts llvm/llvm-project#148903 due to bot failure https://lab.llvm.org/buildbot/#/builders/187/builds/8162 --- llvm/lib/Support/raw_socket_stream.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llvm/lib/Support/raw_socket_stream.cpp b/llvm/lib/Support/raw_socket_stream.cpp index cf51de889deaf..fd1c681672138 100644 --- a/llvm/lib/Support/raw_socket_stream.cpp +++ b/llvm/lib/Support/raw_socket_stream.cpp @@ -119,14 +119,6 @@ ListeningSocket::ListeningSocket(ListeningSocket &&LS) Expected ListeningSocket::createUnix(StringRef SocketPath, int MaxBacklog) { - // If SocketPath is too long, the path will be truncated, and there 
may be - // collisions with other truncated addresses that the fs::exists check below - // will be unable to detect. - if (SocketPath.size() >= sizeof(sockaddr_un::sun_path)) - return llvm::make_error( - std::make_error_code(std::errc::filename_too_long), - "SocketPath too long"); - // Handle instances where the target socket address already exists and // differentiate between a preexisting file with and without a bound socket // From 1754a7d5733d5305e4ec25ef0945b39d6882bb28 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 16 Jul 2025 17:49:19 +0200 Subject: [PATCH 061/813] [Support][BLAKE3] Restore static on blake3_hash4_neon (#149046) This was dropped in #147948 and causes symbol conflicts if libblake3 is also linked. --- llvm/lib/Support/BLAKE3/blake3_neon.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Support/BLAKE3/blake3_neon.c b/llvm/lib/Support/BLAKE3/blake3_neon.c index 9629e10836864..ee36721f87573 100644 --- a/llvm/lib/Support/BLAKE3/blake3_neon.c +++ b/llvm/lib/Support/BLAKE3/blake3_neon.c @@ -245,10 +245,11 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter, counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3))); } -void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, - const uint32_t key[8], uint64_t counter, - bool increment_counter, uint8_t flags, - uint8_t flags_start, uint8_t flags_end, uint8_t *out) { +static void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks, + const uint32_t key[8], uint64_t counter, + bool increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, + uint8_t *out) { uint32x4_t h_vecs[8] = { set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]), set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]), From 82404e3c69168b9fdb779174d3499f5f87f818d2 Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee Date: Wed, 16 Jul 2025 08:55:01 -0700 Subject: [PATCH 062/813] [CGData][GMF] Skip 
merging unnamed functions (#148995) Skip merging unnamed functions to fix an assertion issue, since unnamed functions would otherwise receive the same merged name -- https://github.com/llvm/llvm-project/blob/main/llvm/lib/CodeGen/GlobalMergeFunctions.cpp#L191 --- llvm/lib/CodeGen/GlobalMergeFunctions.cpp | 4 +++ .../AArch64/cgdata-no-merge-unnamed.ll | 32 +++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll diff --git a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp index 92ecfadf97c99..73f11c1345daf 100644 --- a/llvm/lib/CodeGen/GlobalMergeFunctions.cpp +++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp @@ -95,6 +95,10 @@ bool isEligibleFunction(Function *F) { if (F->getCallingConv() == CallingConv::SwiftTail) return false; + // Unnamed functions are skipped for simplicity. + if (!F->hasName()) + return false; + // If function contains callsites with musttail, if we merge // it, the merged function will have the musttail callsite, but // the number of parameters can change, thus the parameter count diff --git a/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll new file mode 100644 index 0000000000000..9986af7eb231c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/cgdata-no-merge-unnamed.ll @@ -0,0 +1,32 @@ +; This test checks if two similar functions, @0 and @1, are not merged as they are unnamed. 
+ +; RUN: opt -mtriple=arm64-apple-darwin -S --passes=global-merge-func %s | FileCheck %s +; RUN: llc -mtriple=arm64-apple-darwin -enable-global-merge-func=true < %s | FileCheck %s + +; CHECK-NOT: .Tgm + +@g = external local_unnamed_addr global [0 x i32], align 4 +@g1 = external global i32, align 4 +@g2 = external global i32, align 4 + +define i32 @0(i32 %a) { +entry: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %1 = load volatile i32, i32* @g1, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, 1 + ret i32 %add +} + +define i32 @1(i32 %a) { +entry: + %idxprom = sext i32 %a to i64 + %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom + %0 = load i32, i32* %arrayidx, align 4 + %1 = load volatile i32, i32* @g2, align 4 + %mul = mul nsw i32 %1, %0 + %add = add nsw i32 %mul, 1 + ret i32 %add +} From 64c273a6191bd0036deed7847d39440d36bbc604 Mon Sep 17 00:00:00 2001 From: Erick Velez Date: Wed, 16 Jul 2025 09:03:44 -0700 Subject: [PATCH 063/813] [clang-doc] fix ASan complaints from passing RepositoryURL as reference (#148923) Passing RepositoryURL around as an optional reference triggered stack-use-after-return complaints. 
--- clang-tools-extra/clang-doc/JSONGenerator.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp index 6fdc7196e9095..cc4c68346ec53 100644 --- a/clang-tools-extra/clang-doc/JSONGenerator.cpp +++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp @@ -45,7 +45,7 @@ static auto SerializeReferenceLambda = [](const auto &Ref, Object &Object) { static json::Object serializeLocation(const Location &Loc, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { Object LocationObj = Object(); LocationObj["LineNumber"] = Loc.StartLineNumber; LocationObj["Filename"] = Loc.Filename; @@ -169,7 +169,7 @@ static json::Value serializeComment(const CommentInfo &I) { static void serializeCommonAttributes(const Info &I, json::Object &Obj, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { Obj["Name"] = I.Name; Obj["USR"] = toHex(toStringRef(I.USR)); @@ -211,9 +211,9 @@ static void serializeReference(const Reference &Ref, Object &ReferenceObj) { // differently. Only enums, records, and typedefs are handled here. 
static void serializeCommonChildren(const ScopeChildren &Children, json::Object &Obj, - const std::optional &RepositoryUrl) { - static auto SerializeInfo = [&RepositoryUrl](const auto &Info, - Object &Object) { + const std::optional RepositoryUrl) { + static auto SerializeInfo = [RepositoryUrl](const auto &Info, + Object &Object) { serializeInfo(Info, Object, RepositoryUrl); }; @@ -304,7 +304,7 @@ static void serializeInfo(const FieldTypeInfo &I, Object &Obj) { } static void serializeInfo(const FunctionInfo &F, json::Object &Obj, - const std::optional &RepositoryURL) { + const std::optional RepositoryURL) { serializeCommonAttributes(F, Obj, RepositoryURL); Obj["IsStatic"] = F.IsStatic; @@ -459,7 +459,7 @@ static void serializeInfo(const RecordInfo &I, json::Object &Obj, } static void serializeInfo(const VarInfo &I, json::Object &Obj, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { serializeCommonAttributes(I, Obj, RepositoryUrl); Obj["IsStatic"] = I.IsStatic; auto TypeObj = Object(); @@ -468,15 +468,15 @@ static void serializeInfo(const VarInfo &I, json::Object &Obj, } static void serializeInfo(const NamespaceInfo &I, json::Object &Obj, - const std::optional &RepositoryUrl) { + const std::optional RepositoryUrl) { serializeCommonAttributes(I, Obj, RepositoryUrl); if (!I.Children.Namespaces.empty()) serializeArray(I.Children.Namespaces, Obj, "Namespaces", SerializeReferenceLambda); - static auto SerializeInfo = [&RepositoryUrl](const auto &Info, - Object &Object) { + static auto SerializeInfo = [RepositoryUrl](const auto &Info, + Object &Object) { serializeInfo(Info, Object, RepositoryUrl); }; From 78b9128250c9fe5c7f9e460a27cc28c6450fd8fd Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Wed, 16 Jul 2025 12:04:53 -0400 Subject: [PATCH 064/813] [LangRef] Document the difference between `` and `` (#147929) Document how LLVM expects to use `` and ``, as well as the `pref >= abi` requirement. 
--- llvm/docs/LangRef.rst | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index a5a5070a43a36..2759e18301d58 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -3240,12 +3240,24 @@ as follows: as :ref:`Non-Integral Pointer Type ` s. The ``0`` address space cannot be specified as non-integral. -Unless explicitly stated otherwise, on every specification that specifies -an alignment, the value of the alignment must be in the range [1,2^16) -and must be a power of two times the width of a byte. -On every specification that takes a ``:``, specifying the -```` alignment is optional. If omitted, the preceding ``:`` -should be omitted too and ```` will be equal to ````. +```` is a lower bound on what is required for a type to be considered +aligned. This is used in various places, such as: + +- The alignment for loads and stores if none is explicitly given. +- The alignment used to compute struct layout. +- The alignment used to compute allocation sizes and thus ``getelementptr`` + offsets. +- The alignment below which accesses are considered underaligned. + +```` allows providing a more optimal alignment that should be used when +possible, primarily for ``alloca`` and the alignment of global variables. It is +an optional value that must be greater than or equal to ````. If omitted, +the preceding ``:`` should also be omitted and ```` will be equal to +````. + +Unless explicitly stated otherwise, every alignment specification is provided in +bits and must be in the range [1,2^16). The value must be a power of two times +the width of a byte (i.e. ``align = 8 * 2^N``). 
When constructing the data layout for a given target, LLVM starts with a default set of specifications which are then (possibly) overridden by From 6e0b0ec66ac8de046cc95080166e2012819f7d93 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 16 Jul 2025 09:09:05 -0700 Subject: [PATCH 065/813] [flang] Fix crash in Semantics (#148706) Allow for renaming in USE association of Cray pointers. Fixes https://github.com/llvm/llvm-project/issues/148559. --- flang/lib/Semantics/tools.cpp | 6 +++--- flang/test/Semantics/bug148559.f90 | 12 ++++++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 flang/test/Semantics/bug148559.f90 diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 5e5b43f26c791..5a5b02e1ac3ce 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -348,9 +348,9 @@ const Symbol &BypassGeneric(const Symbol &symbol) { const Symbol &GetCrayPointer(const Symbol &crayPointee) { const Symbol *found{nullptr}; - for (const auto &[pointee, pointer] : - crayPointee.GetUltimate().owner().crayPointers()) { - if (pointee == crayPointee.name()) { + const Symbol &ultimate{crayPointee.GetUltimate()}; + for (const auto &[pointee, pointer] : ultimate.owner().crayPointers()) { + if (pointee == ultimate.name()) { found = &pointer.get(); break; } diff --git a/flang/test/Semantics/bug148559.f90 b/flang/test/Semantics/bug148559.f90 new file mode 100644 index 0000000000000..d7b959ac8f191 --- /dev/null +++ b/flang/test/Semantics/bug148559.f90 @@ -0,0 +1,12 @@ +!RUN: %flang_fc1 -fsyntax-only %s +!Regression test for crash in semantics on Cray pointers + +module m + pointer(ptr,pp) +end module m + +program main + use m, only:renamea=>pp + use m, only:pp + print *, renamea +end From fc7f9d795d37cd119831e77e475e4690e4120bdb Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 16 Jul 2025 09:09:29 -0700 Subject: [PATCH 066/813] [flang] Better error message for ambiguous ASSIGNMENT(=) (#148720) When a 
type-bound generic ASSIGNMENT(=) procedure is ambiguous for a particular reference, say so, rather than claiming that no specific procedure matched the types and ranks of the LHS and RHS. Fixes https://github.com/llvm/llvm-project/issues/148675. --- flang/lib/Semantics/expression.cpp | 66 +++++++++++++++++------------- flang/test/Semantics/bug148675.f90 | 21 ++++++++++ 2 files changed, 58 insertions(+), 29 deletions(-) create mode 100644 flang/test/Semantics/bug148675.f90 diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index 53ec3827893d0..14473724f0f40 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -178,7 +178,7 @@ class ArgumentAnalyzer { } // Find and return a user-defined assignment std::optional TryDefinedAssignment(); - std::optional GetDefinedAssignmentProc(); + std::optional GetDefinedAssignmentProc(bool &isAmbiguous); std::optional GetType(std::size_t) const; void Dump(llvm::raw_ostream &); @@ -191,7 +191,7 @@ class ArgumentAnalyzer { MaybeExpr AnalyzeExprOrWholeAssumedSizeArray(const parser::Expr &); bool AreConformable() const; const Symbol *FindBoundOp(parser::CharBlock, int passIndex, - const Symbol *&generic, bool isSubroutine); + const Symbol *&generic, bool isSubroutine, bool *isAmbiguous = nullptr); void AddAssignmentConversion( const DynamicType &lhsType, const DynamicType &rhsType); bool OkLogicalIntegerAssignment(TypeCategory lhs, TypeCategory rhs); @@ -199,7 +199,8 @@ class ArgumentAnalyzer { bool IsBOZLiteral(std::size_t i) const { return evaluate::IsBOZLiteral(GetExpr(i)); } - void SayNoMatch(const std::string &, bool isAssignment = false); + void SayNoMatch( + const std::string &, bool isAssignment = false, bool isAmbiguous = false); std::string TypeAsFortran(std::size_t); bool AnyUntypedOrMissingOperand(); @@ -4781,7 +4782,9 @@ std::optional ArgumentAnalyzer::TryDefinedAssignment() { return std::nullopt; // user-defined assignment not allowed for these args } 
auto restorer{context_.GetContextualMessages().SetLocation(source_)}; - if (std::optional procRef{GetDefinedAssignmentProc()}) { + bool isAmbiguous{false}; + if (std::optional procRef{ + GetDefinedAssignmentProc(isAmbiguous)}) { if (context_.inWhereBody() && !procRef->proc().IsElemental()) { // C1032 context_.Say( "Defined assignment in WHERE must be elemental, but '%s' is not"_err_en_US, @@ -4791,9 +4794,11 @@ std::optional ArgumentAnalyzer::TryDefinedAssignment() { return std::move(*procRef); } if (isDefined == Tristate::Yes) { - if (!lhsType || !rhsType || (lhsRank != rhsRank && rhsRank != 0) || + if (isAmbiguous || !lhsType || !rhsType || + (lhsRank != rhsRank && rhsRank != 0) || !OkLogicalIntegerAssignment(lhsType->category(), rhsType->category())) { - SayNoMatch("ASSIGNMENT(=)", true); + SayNoMatch( + "ASSIGNMENT(=)", /*isAssignment=*/true, /*isAmbiguous=*/isAmbiguous); } } else if (!fatalErrors_) { CheckAssignmentConformance(); @@ -4822,13 +4827,15 @@ bool ArgumentAnalyzer::OkLogicalIntegerAssignment( return true; } -std::optional ArgumentAnalyzer::GetDefinedAssignmentProc() { +std::optional ArgumentAnalyzer::GetDefinedAssignmentProc( + bool &isAmbiguous) { const Symbol *proc{nullptr}; bool isProcElemental{false}; std::optional passedObjectIndex; std::string oprNameString{"assignment(=)"}; parser::CharBlock oprName{oprNameString}; const auto &scope{context_.context().FindScope(source_)}; + isAmbiguous = false; { auto restorer{context_.GetContextualMessages().DiscardMessages()}; if (const Symbol *symbol{scope.FindSymbol(oprName)}) { @@ -4842,8 +4849,8 @@ std::optional ArgumentAnalyzer::GetDefinedAssignmentProc() { for (std::size_t i{0}; (!proc || isProcElemental) && i < actuals_.size(); ++i) { const Symbol *generic{nullptr}; - if (const Symbol * - binding{FindBoundOp(oprName, i, generic, /*isSubroutine=*/true)}) { + if (const Symbol *binding{FindBoundOp(oprName, i, generic, + /*isSubroutine=*/true, /*isAmbiguous=*/&isAmbiguous)}) { // ignore inaccessible 
type-bound ASSIGNMENT(=) generic if (!CheckAccessibleSymbol(scope, DEREF(generic))) { const Symbol *resolution{GetBindingResolution(GetType(i), *binding)}; @@ -4967,7 +4974,8 @@ bool ArgumentAnalyzer::AreConformable() const { // Look for a type-bound operator in the type of arg number passIndex. const Symbol *ArgumentAnalyzer::FindBoundOp(parser::CharBlock oprName, - int passIndex, const Symbol *&generic, bool isSubroutine) { + int passIndex, const Symbol *&generic, bool isSubroutine, + bool *isAmbiguous) { const auto *type{GetDerivedTypeSpec(GetType(passIndex))}; const semantics::Scope *scope{type ? type->scope() : nullptr}; if (scope) { @@ -4989,6 +4997,9 @@ const Symbol *ArgumentAnalyzer::FindBoundOp(parser::CharBlock oprName, // Use the most recent override of the binding, if any return scope->FindComponent(binding->name()); } else { + if (isAmbiguous) { + *isAmbiguous = pair.second; + } context_.EmitGenericResolutionError(*generic, pair.second, isSubroutine); } } @@ -5072,40 +5083,37 @@ void ArgumentAnalyzer::ConvertBOZAssignmentRHS(const DynamicType &lhsType) { } // Report error resolving opr when there is a user-defined one available -void ArgumentAnalyzer::SayNoMatch(const std::string &opr, bool isAssignment) { +void ArgumentAnalyzer::SayNoMatch( + const std::string &opr, bool isAssignment, bool isAmbiguous) { std::string type0{TypeAsFortran(0)}; auto rank0{actuals_[0]->Rank()}; + std::string prefix{"No intrinsic or user-defined "s + opr + " matches"}; + if (isAmbiguous) { + prefix = "Multiple specific procedures for the generic "s + opr + " match"; + } if (actuals_.size() == 1) { if (rank0 > 0) { - context_.Say("No intrinsic or user-defined %s matches " - "rank %d array of %s"_err_en_US, - opr, rank0, type0); + context_.Say("%s rank %d array of %s"_err_en_US, prefix, rank0, type0); } else { - context_.Say("No intrinsic or user-defined %s matches " - "operand type %s"_err_en_US, - opr, type0); + context_.Say("%s operand type %s"_err_en_US, prefix, type0); } 
} else { std::string type1{TypeAsFortran(1)}; auto rank1{actuals_[1]->Rank()}; if (rank0 > 0 && rank1 > 0 && rank0 != rank1) { - context_.Say("No intrinsic or user-defined %s matches " - "rank %d array of %s and rank %d array of %s"_err_en_US, - opr, rank0, type0, rank1, type1); + context_.Say("%s rank %d array of %s and rank %d array of %s"_err_en_US, + prefix, rank0, type0, rank1, type1); } else if (isAssignment && rank0 != rank1) { if (rank0 == 0) { - context_.Say("No intrinsic or user-defined %s matches " - "scalar %s and rank %d array of %s"_err_en_US, - opr, type0, rank1, type1); + context_.Say("%s scalar %s and rank %d array of %s"_err_en_US, prefix, + type0, rank1, type1); } else { - context_.Say("No intrinsic or user-defined %s matches " - "rank %d array of %s and scalar %s"_err_en_US, - opr, rank0, type0, type1); + context_.Say("%s rank %d array of %s and scalar %s"_err_en_US, prefix, + rank0, type0, type1); } } else { - context_.Say("No intrinsic or user-defined %s matches " - "operand types %s and %s"_err_en_US, - opr, type0, type1); + context_.Say( + "%s operand types %s and %s"_err_en_US, prefix, type0, type1); } } } diff --git a/flang/test/Semantics/bug148675.f90 b/flang/test/Semantics/bug148675.f90 new file mode 100644 index 0000000000000..5ce117e7bb3df --- /dev/null +++ b/flang/test/Semantics/bug148675.f90 @@ -0,0 +1,21 @@ +! 
RUN: %python %S/test_errors.py %s %flang_fc1 +module m + type t + integer n + contains + procedure :: assign1 => myassign, assign2 => myassign + generic :: ASSIGNMENT(=) => assign1 + generic :: ASSIGNMENT(=) => assign2 + end type + contains + subroutine myassign(to, from) + class(t), intent(out) :: to + integer, intent(in) :: from + to%n = from + end + subroutine test + type(t) x + !ERROR: Multiple specific procedures for the generic ASSIGNMENT(=) match operand types TYPE(t) and INTEGER(4) + x = 5 + end +end From 52a46dc57f29e87a5a298ce325562fa2e3d057c9 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 16 Jul 2025 09:09:49 -0700 Subject: [PATCH 067/813] [flang] Allow -fdefault-integer-8 with defined I/O (#148927) Defined I/O subroutines have UNIT= and IOSTAT= dummy arguments that are required to have type INTEGER with its default kind. When that default kind is modified via -fdefault-integer-8, calls to defined I/O subroutines from the runtime don't work. Add a flag to the two data structures shared between the compiler and the runtime support library to indicate that a defined I/O subroutine was compiled under -fdefault-integer-8. This has been done in a compatible manner, so that existing binaries are compatible with the new library and new binaries are compatible with the old library, unless of course -fdefault-integer-8 is used. Fixes https://github.com/llvm/llvm-project/issues/148638. 
--- .../include/flang-rt/runtime/non-tbp-dio.h | 7 +- flang-rt/include/flang-rt/runtime/type-info.h | 16 ++-- flang-rt/lib/runtime/derived.cpp | 2 +- flang-rt/lib/runtime/descriptor-io.cpp | 88 ++++++++++++++----- flang-rt/lib/runtime/non-tbp-dio.cpp | 2 +- flang-rt/lib/runtime/type-info.cpp | 2 +- .../flang/Semantics/runtime-type-info.h | 7 +- flang/lib/Lower/IO.cpp | 8 +- flang/lib/Semantics/runtime-type-info.cpp | 28 ++++-- flang/module/__fortran_type_info.f90 | 2 +- flang/test/Lower/io-derived-type.f90 | 58 ++++++------ flang/test/Lower/namelist.f90 | 8 +- flang/test/Lower/volatile-openmp.f90 | 8 +- flang/test/Semantics/typeinfo01.f90 | 18 ++-- flang/test/Semantics/typeinfo02.f90 | 4 +- flang/test/Semantics/typeinfo09.f90 | 2 +- flang/test/Semantics/typeinfo13.f90 | 2 +- 17 files changed, 164 insertions(+), 98 deletions(-) diff --git a/flang-rt/include/flang-rt/runtime/non-tbp-dio.h b/flang-rt/include/flang-rt/runtime/non-tbp-dio.h index 99d4113b6c7a8..26849298ec959 100644 --- a/flang-rt/include/flang-rt/runtime/non-tbp-dio.h +++ b/flang-rt/include/flang-rt/runtime/non-tbp-dio.h @@ -34,11 +34,16 @@ namespace Fortran::runtime::io { RT_OFFLOAD_API_GROUP_BEGIN +enum NonTbpDefinedIoFlags { + IsDtvArgPolymorphic = 1 << 0, // first dummy arg is CLASS(T) + DefinedIoInteger8 = 1 << 1, // -fdefault-integer-8 affected UNIT= & IOSTAT= +}; + struct NonTbpDefinedIo { const typeInfo::DerivedType &derivedType; void (*subroutine)(); // null means no non-TBP defined I/O here common::DefinedIo definedIo; - bool isDtvArgPolymorphic; // first dummy arg is CLASS(T) + std::uint8_t flags; }; struct NonTbpDefinedIoTable { diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h index a8d39f4f8a1a3..93bca24a602b4 100644 --- a/flang-rt/include/flang-rt/runtime/type-info.h +++ b/flang-rt/include/flang-rt/runtime/type-info.h @@ -143,9 +143,9 @@ class SpecialBinding { // I/O procedures that are not type-bound. 
RT_API_ATTRS SpecialBinding(Which which, ProcedurePointer proc, std::uint8_t isArgDescSet, std::uint8_t isTypeBound, - std::uint8_t isArgContiguousSet) + std::uint8_t specialCaseFlag) : which_{which}, isArgDescriptorSet_{isArgDescSet}, - isTypeBound_{isTypeBound}, isArgContiguousSet_{isArgContiguousSet}, + isTypeBound_{isTypeBound}, specialCaseFlag_{specialCaseFlag}, proc_{proc} {} static constexpr RT_API_ATTRS Which RankFinal(int rank) { @@ -153,13 +153,11 @@ class SpecialBinding { } RT_API_ATTRS Which which() const { return which_; } + RT_API_ATTRS bool specialCaseFlag() const { return specialCaseFlag_; } RT_API_ATTRS bool IsArgDescriptor(int zeroBasedArg) const { return (isArgDescriptorSet_ >> zeroBasedArg) & 1; } RT_API_ATTRS bool IsTypeBound() const { return isTypeBound_ != 0; } - RT_API_ATTRS bool IsArgContiguous(int zeroBasedArg) const { - return (isArgContiguousSet_ >> zeroBasedArg) & 1; - } template RT_API_ATTRS PROC GetProc(const Binding *bindings = nullptr) const { if (bindings && isTypeBound_ > 0) { @@ -203,10 +201,10 @@ class SpecialBinding { // When a special binding is type-bound, this is its binding's index (plus 1, // so that 0 signifies that it's not type-bound). std::uint8_t isTypeBound_{0}; - // True when a FINAL subroutine has a dummy argument that is an array that - // is CONTIGUOUS or neither assumed-rank nor assumed-shape. - std::uint8_t isArgContiguousSet_{0}; - + // For a FINAL subroutine, set when it has a dummy argument that is an array + // that is CONTIGUOUS or neither assumed-rank nor assumed-shape. + // For a defined I/O subroutine, set when UNIT= and IOSTAT= are INTEGER(8). 
+ std::uint8_t specialCaseFlag_{0}; ProcedurePointer proc_{nullptr}; }; diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp index bb9a68abef2a7..4ed0baaa3d108 100644 --- a/flang-rt/lib/runtime/derived.cpp +++ b/flang-rt/lib/runtime/derived.cpp @@ -270,7 +270,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, StaticDescriptor statDesc; Descriptor ©{statDesc.descriptor()}; const Descriptor *argDescriptor{&descriptor}; - if (descriptor.rank() > 0 && special->IsArgContiguous(0) && + if (descriptor.rank() > 0 && special->specialCaseFlag() && !descriptor.IsContiguous()) { // The FINAL subroutine demands a contiguous array argument, but // this INTENT(OUT) or intrinsic assignment LHS isn't contiguous. diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp index b208cb2c397b3..3868c8ddce19f 100644 --- a/flang-rt/lib/runtime/descriptor-io.cpp +++ b/flang-rt/lib/runtime/descriptor-io.cpp @@ -67,13 +67,29 @@ static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( ioType, io.mutableModes().inNamelist ? 
"NAMELIST" : "LISTDIRECTED"); ioTypeLen = runtime::strlen(ioType); } + // V_LIST= argument StaticDescriptor<1, true> vListStatDesc; Descriptor &vListDesc{vListStatDesc.descriptor()}; - vListDesc.Establish(TypeCategory::Integer, sizeof(int), nullptr, 1); - vListDesc.set_base_addr(edit.vList); - vListDesc.GetDimension(0).SetBounds(1, edit.vListEntries); - vListDesc.GetDimension(0).SetByteStride( - static_cast(sizeof(int))); + bool integer8{special.specialCaseFlag()}; + std::int64_t vList64[edit.maxVListEntries]; + if (integer8) { + // Convert v_list values to INTEGER(8) + for (int j{0}; j < edit.vListEntries; ++j) { + vList64[j] = edit.vList[j]; + } + vListDesc.Establish( + TypeCategory::Integer, sizeof(std::int64_t), nullptr, 1); + vListDesc.set_base_addr(vList64); + vListDesc.GetDimension(0).SetBounds(1, edit.vListEntries); + vListDesc.GetDimension(0).SetByteStride( + static_cast(sizeof(std::int64_t))); + } else { + vListDesc.Establish(TypeCategory::Integer, sizeof(int), nullptr, 1); + vListDesc.set_base_addr(edit.vList); + vListDesc.GetDimension(0).SetBounds(1, edit.vListEntries); + vListDesc.GetDimension(0).SetByteStride( + static_cast(sizeof(int))); + } ExternalFileUnit *actualExternal{io.GetExternalFileUnit()}; ExternalFileUnit *external{actualExternal}; if (!external) { @@ -84,8 +100,8 @@ static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( ChildIo &child{external->PushChildIo(io)}; // Child formatted I/O is nonadvancing by definition (F'2018 12.6.2.4). 
auto restorer{common::ScopedSet(io.mutableModes().nonAdvancing, true)}; - int unit{external->unitNumber()}; - int ioStat{IostatOk}; + std::int32_t unit{external->unitNumber()}; + std::int32_t ioStat{IostatOk}; char ioMsg[100]; Fortran::common::optional startPos; if (edit.descriptor == DataEdit::DefinedDerivedType && @@ -98,23 +114,45 @@ static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( derived.binding().OffsetElement()}; if (special.IsArgDescriptor(0)) { // "dtv" argument is "class(t)", pass a descriptor - auto *p{special.GetProc( - bindings)}; StaticDescriptor<1, true, 10 /*?*/> elementStatDesc; Descriptor &elementDesc{elementStatDesc.descriptor()}; elementDesc.Establish( derived, nullptr, 0, nullptr, CFI_attribute_pointer); elementDesc.set_base_addr(descriptor.Element(subscripts)); - p(elementDesc, unit, ioType, vListDesc, ioStat, ioMsg, ioTypeLen, - sizeof ioMsg); + if (integer8) { // 64-bit UNIT=/IOSTAT= + std::int64_t unit64{unit}; + std::int64_t ioStat64{ioStat}; + auto *p{special.GetProc(bindings)}; + p(elementDesc, unit64, ioType, vListDesc, ioStat64, ioMsg, ioTypeLen, + sizeof ioMsg); + ioStat = ioStat64; + } else { // 32-bit UNIT=/IOSTAT= + auto *p{special.GetProc(bindings)}; + p(elementDesc, unit, ioType, vListDesc, ioStat, ioMsg, ioTypeLen, + sizeof ioMsg); + } } else { // "dtv" argument is "type(t)", pass a raw pointer - auto *p{special.GetProc( - bindings)}; - p(descriptor.Element(subscripts), unit, ioType, vListDesc, ioStat, - ioMsg, ioTypeLen, sizeof ioMsg); + if (integer8) { // 64-bit UNIT= and IOSTAT= + std::int64_t unit64{unit}; + std::int64_t ioStat64{ioStat}; + auto *p{special.GetProc(bindings)}; + p(descriptor.Element(subscripts), unit64, ioType, vListDesc, + ioStat64, ioMsg, ioTypeLen, sizeof ioMsg); + ioStat = ioStat64; + } else { // 32-bit UNIT= and IOSTAT= + auto *p{special.GetProc(bindings)}; + p(descriptor.Element(subscripts), unit, ioType, vListDesc, ioStat, + ioMsg, ioTypeLen, sizeof ioMsg); + } } 
handler.Forward(ioStat, ioMsg, sizeof ioMsg); external->PopChildIo(child); @@ -458,11 +496,16 @@ RT_API_ATTRS int DescriptorIoTicket::Begin(WorkQueue &workQueue) { ? common::DefinedIo::ReadUnformatted : common::DefinedIo::WriteUnformatted)}) { if (definedIo->subroutine) { + std::uint8_t isArgDescriptorSet{0}; + if (definedIo->flags & IsDtvArgPolymorphic) { + isArgDescriptorSet = 1; + } typeInfo::SpecialBinding special{DIR == Direction::Input ? typeInfo::SpecialBinding::Which::ReadUnformatted : typeInfo::SpecialBinding::Which::WriteUnformatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false}; + definedIo->subroutine, isArgDescriptorSet, + /*IsTypeBound=*/false, + /*specialCaseFlag=*/!!(definedIo->flags & DefinedIoInteger8)}; if (DefinedUnformattedIo(io_, instance_, *type, special)) { anyIoTookPlace_ = true; return StatOk; @@ -719,8 +762,11 @@ RT_API_ATTRS int DescriptorIoTicket::Begin(WorkQueue &workQueue) { nonTbpSpecial_.emplace(DIR == Direction::Input ? typeInfo::SpecialBinding::Which::ReadFormatted : typeInfo::SpecialBinding::Which::WriteFormatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false); + definedIo->subroutine, + /*isArgDescriptorSet=*/ + (definedIo->flags & IsDtvArgPolymorphic) ? 
1 : 0, + /*isTypeBound=*/false, + /*specialCaseFlag=*/!!(definedIo->flags & DefinedIoInteger8)); special_ = &*nonTbpSpecial_; } } diff --git a/flang-rt/lib/runtime/non-tbp-dio.cpp b/flang-rt/lib/runtime/non-tbp-dio.cpp index 72101b06e0c6e..d516526033c27 100644 --- a/flang-rt/lib/runtime/non-tbp-dio.cpp +++ b/flang-rt/lib/runtime/non-tbp-dio.cpp @@ -17,7 +17,7 @@ const NonTbpDefinedIo *NonTbpDefinedIoTable::Find( for (const auto *p{item}; j-- > 0; ++p) { if (&p->derivedType == &type && p->definedIo == definedIo) { return p; - } else if (p->isDtvArgPolymorphic) { + } else if (p->flags & IsDtvArgPolymorphic) { for (const typeInfo::DerivedType *t{type.GetParentType()}; t; t = t->GetParentType()) { if (&p->derivedType == t && p->definedIo == definedIo) { diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp index 3e1d7c9c3c788..50123f4cf321c 100644 --- a/flang-rt/lib/runtime/type-info.cpp +++ b/flang-rt/lib/runtime/type-info.cpp @@ -330,7 +330,7 @@ FILE *SpecialBinding::Dump(FILE *f) const { } std::fprintf(f, " isArgDescriptorSet: 0x%x\n", isArgDescriptorSet_); std::fprintf(f, " isTypeBound: %d\n", isTypeBound_); - std::fprintf(f, " isArgContiguousSet: 0x%x\n", isArgContiguousSet_); + std::fprintf(f, " specialCaseFlag 0x%x\n", specialCaseFlag_); std::fprintf(f, " proc: %p\n", reinterpret_cast(proc_)); return f; } diff --git a/flang/include/flang/Semantics/runtime-type-info.h b/flang/include/flang/Semantics/runtime-type-info.h index 6c5a061d1c1a2..94e8293b14643 100644 --- a/flang/include/flang/Semantics/runtime-type-info.h +++ b/flang/include/flang/Semantics/runtime-type-info.h @@ -52,10 +52,15 @@ constexpr char procCompName[]{"proc"}; SymbolVector CollectBindings(const Scope &dtScope); +enum NonTbpDefinedIoFlags { + IsDtvArgPolymorphic = 1 << 0, + DefinedIoInteger8 = 1 << 1, +}; + struct NonTbpDefinedIo { const Symbol *subroutine; common::DefinedIo definedIo; - bool isDtvArgPolymorphic; + std::uint8_t flags; }; std::multimap diff --git 
a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp index 63a612d7ead61..69d72d9d63b68 100644 --- a/flang/lib/Lower/IO.cpp +++ b/flang/lib/Lower/IO.cpp @@ -269,10 +269,12 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, mlir::Type sizeTy = fir::runtime::getModel()(builder.getContext()); mlir::Type intTy = fir::runtime::getModel()(builder.getContext()); + mlir::Type byteTy = + fir::runtime::getModel()(builder.getContext()); mlir::Type boolTy = fir::runtime::getModel()(builder.getContext()); mlir::Type listTy = fir::SequenceType::get( definedIoProcMap.size(), - mlir::TupleType::get(context, {refTy, refTy, intTy, boolTy})); + mlir::TupleType::get(context, {refTy, refTy, intTy, byteTy})); mlir::Type tableTy = mlir::TupleType::get( context, {sizeTy, fir::ReferenceType::get(listTy), boolTy}); @@ -339,9 +341,9 @@ getNonTbpDefinedIoTableAddr(Fortran::lower::AbstractConverter &converter, insert(builder.createIntegerConstant( loc, intTy, static_cast(iface.second.definedIo))); // polymorphic flag is set if first defined IO dummy arg is CLASS(T) + // defaultInt8 flag is set if -fdefined-integer-8 // [bool isDtvArgPolymorphic] - insert(builder.createIntegerConstant(loc, boolTy, - iface.second.isDtvArgPolymorphic)); + insert(builder.createIntegerConstant(loc, byteTy, iface.second.flags)); } if (tableIsLocal) builder.create(loc, list, listAddr); diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 51ba21a9e5edf..5916a07df7744 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -1131,7 +1131,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( if (auto proc{evaluate::characteristics::Procedure::Characterize( specific, context_.foldingContext())}) { std::uint8_t isArgDescriptorSet{0}; - std::uint8_t isArgContiguousSet{0}; + bool specialCaseFlag{0}; int argThatMightBeDescriptor{0}; MaybeExpr which; if (isAssignment) { @@ -1197,7 +1197,7 @@ void 
RuntimeTableBuilder::DescribeSpecialProc( TypeAndShape::Attr::AssumedShape) || dummyData.attrs.test(evaluate::characteristics:: DummyDataObject::Attr::Contiguous)) { - isArgContiguousSet |= 1; + specialCaseFlag = true; } } } @@ -1216,7 +1216,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( return; } if (ddo->type.type().IsPolymorphic()) { - isArgDescriptorSet |= 1; + argThatMightBeDescriptor = 1; } switch (io.value()) { case common::DefinedIo::ReadFormatted: @@ -1232,6 +1232,9 @@ void RuntimeTableBuilder::DescribeSpecialProc( which = writeUnformattedEnum_; break; } + if (context_.defaultKinds().GetDefaultKind(TypeCategory::Integer) == 8) { + specialCaseFlag = true; // UNIT= & IOSTAT= INTEGER(8) + } } if (argThatMightBeDescriptor != 0) { if (const auto *dummyData{ @@ -1262,8 +1265,8 @@ void RuntimeTableBuilder::DescribeSpecialProc( } CHECK(bindingIndex <= 255); AddValue(values, specialSchema_, "istypebound"s, IntExpr<1>(bindingIndex)); - AddValue(values, specialSchema_, "isargcontiguousset"s, - IntExpr<1>(isArgContiguousSet)); + AddValue(values, specialSchema_, "specialcaseflag"s, + IntExpr<1>(specialCaseFlag)); AddValue(values, specialSchema_, procCompName, SomeExpr{evaluate::ProcedureDesignator{specific}}); // index might already be present in the case of an override @@ -1383,19 +1386,26 @@ CollectNonTbpDefinedIoGenericInterfaces( } else { // Local scope's specific overrides host's for this type bool updated{false}; + std::uint8_t flags{0}; + if (declType->IsPolymorphic()) { + flags |= IsDtvArgPolymorphic; + } + if (scope.context().GetDefaultKind(TypeCategory::Integer) == + 8) { + flags |= DefinedIoInteger8; + } for (auto [iter, end]{result.equal_range(dtDesc)}; iter != end; ++iter) { NonTbpDefinedIo &nonTbp{iter->second}; if (nonTbp.definedIo == which) { nonTbp.subroutine = &*specific; - nonTbp.isDtvArgPolymorphic = declType->IsPolymorphic(); + nonTbp.flags = flags; updated = true; } } if (!updated) { - result.emplace(dtDesc, - NonTbpDefinedIo{ - &*specific, 
which, declType->IsPolymorphic()}); + result.emplace( + dtDesc, NonTbpDefinedIo{&*specific, which, flags}); } } } diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90 index 8dd27d6e4c01b..6af2a5a5e30ff 100644 --- a/flang/module/__fortran_type_info.f90 +++ b/flang/module/__fortran_type_info.f90 @@ -118,7 +118,7 @@ integer(1) :: which ! SpecialBinding::Which integer(1) :: isArgDescriptorSet integer(1) :: isTypeBound ! binding index + 1, if any - integer(1) :: isArgContiguousSet + integer(1) :: specialCaseFlag integer(1) :: __padding0(4) type(__builtin_c_funptr) :: proc end type diff --git a/flang/test/Lower/io-derived-type.f90 b/flang/test/Lower/io-derived-type.f90 index 7d2fef3faa2b7..7c289ce261678 100644 --- a/flang/test/Lower/io-derived-type.f90 +++ b/flang/test/Lower/io-derived-type.f90 @@ -37,16 +37,16 @@ subroutine test1 import, all ! CHECK: %[[V_16:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_17:[0-9]+]] = fir.convert %[[V_16]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_18:[0-9]+]] = fir.address_of(@_QQMmFtest1.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_18]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_18:[0-9]+]] = fir.address_of(@_QQMmFtest1.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_18]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_20:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_17]], %[[V_19]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, 'test1 outer, should call wft: ', t(1) block import, only: t ! CHECK: %[[V_37:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_38:[0-9]+]] = fir.convert %[[V_37]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_39:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! 
CHECK: %[[V_40:[0-9]+]] = fir.convert %[[V_39]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_39:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_40:[0-9]+]] = fir.convert %[[V_39]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_41:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_38]], %[[V_40]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, 'test1 block, should not call wft: ', t(2) end block @@ -56,8 +56,8 @@ subroutine test1 subroutine test2 ! CHECK: %[[V_15:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_16:[0-9]+]] = fir.convert %[[V_15]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_17:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_18:[0-9]+]] = fir.convert %[[V_17]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_17:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_18:[0-9]+]] = fir.convert %[[V_17]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_19:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_16]], %[[V_18]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 import, only: t @@ -74,23 +74,23 @@ subroutine test3(p, x) ! CHECK: %[[V_3:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_4:[0-9]+]] = fir.convert %[[V_3]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_5:[0-9]+]] = fir.alloca !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: %[[V_6:[0-9]+]] = fir.undefined !fir.array<1xtuple, !fir.ref, i32, i1>> + ! CHECK: %[[V_5:[0-9]+]] = fir.alloca !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: %[[V_6:[0-9]+]] = fir.undefined !fir.array<1xtuple, !fir.ref, i32, i8>> ! CHECK: %[[V_7:[0-9]+]] = fir.address_of(@_QMmE.dt.t) ! CHECK: %[[V_8:[0-9]+]] = fir.convert %[[V_7]] : {{.*}} -> !fir.ref - ! 
CHECK: %[[V_9:[0-9]+]] = fir.insert_value %[[V_6]], %[[V_8]], [0 : index, 0 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i1>> + ! CHECK: %[[V_9:[0-9]+]] = fir.insert_value %[[V_6]], %[[V_8]], [0 : index, 0 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i8>> ! CHECK: %[[V_10:[0-9]+]] = fir.box_addr %arg0 : (!fir.boxproc<() -> ()>) -> !fir.ref - ! CHECK: %[[V_11:[0-9]+]] = fir.insert_value %[[V_9]], %[[V_10]], [0 : index, 1 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: %[[V_12:[0-9]+]] = fir.insert_value %[[V_11]], %c2{{.*}}, [0 : index, 2 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, i32) -> !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: %[[V_13:[0-9]+]] = fir.insert_value %[[V_12]], %true, [0 : index, 3 : index] : (!fir.array<1xtuple, !fir.ref, i32, i1>>, i1) -> !fir.array<1xtuple, !fir.ref, i32, i1>> - ! CHECK: fir.store %[[V_13]] to %[[V_5]] : !fir.ref, !fir.ref, i32, i1>>> - ! CHECK: %[[V_14:[0-9]+]] = fir.alloca tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_15:[0-9]+]] = fir.undefined tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_16:[0-9]+]] = fir.insert_value %[[V_15]], %c1{{.*}}, [0 : index] : (tuple, !fir.ref, i32, i1>>>, i1>, i64) -> tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_17:[0-9]+]] = fir.insert_value %[[V_16]], %[[V_5]], [1 : index] : (tuple, !fir.ref, i32, i1>>>, i1>, !fir.ref, !fir.ref, i32, i1>>>) -> tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: %[[V_18:[0-9]+]] = fir.insert_value %[[V_17]], %true_0, [2 : index] : (tuple, !fir.ref, i32, i1>>>, i1>, i1) -> tuple, !fir.ref, i32, i1>>>, i1> - ! CHECK: fir.store %[[V_18]] to %[[V_14]] : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_14]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! 
CHECK: %[[V_11:[0-9]+]] = fir.insert_value %[[V_9]], %[[V_10]], [0 : index, 1 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, !fir.ref) -> !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: %[[V_12:[0-9]+]] = fir.insert_value %[[V_11]], %c2{{.*}}, [0 : index, 2 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, i32) -> !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: %[[V_13:[0-9]+]] = fir.insert_value %[[V_12]], %c1_i8, [0 : index, 3 : index] : (!fir.array<1xtuple, !fir.ref, i32, i8>>, i8) -> !fir.array<1xtuple, !fir.ref, i32, i8>> + ! CHECK: fir.store %[[V_13]] to %[[V_5]] : !fir.ref, !fir.ref, i32, i8>>> + ! CHECK: %[[V_14:[0-9]+]] = fir.alloca tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_15:[0-9]+]] = fir.undefined tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_16:[0-9]+]] = fir.insert_value %[[V_15]], %c1{{.*}}, [0 : index] : (tuple, !fir.ref, i32, i8>>>, i1>, i64) -> tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_17:[0-9]+]] = fir.insert_value %[[V_16]], %[[V_5]], [1 : index] : (tuple, !fir.ref, i32, i8>>>, i1>, !fir.ref, !fir.ref, i32, i8>>>) -> tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: %[[V_18:[0-9]+]] = fir.insert_value %[[V_17]], %true, [2 : index] : (tuple, !fir.ref, i32, i8>>>, i1>, i1) -> tuple, !fir.ref, i32, i8>>>, i1> + ! CHECK: fir.store %[[V_18]] to %[[V_14]] : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_19:[0-9]+]] = fir.convert %[[V_14]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_20:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_4]], %[[V_19]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, x end subroutine @@ -112,8 +112,8 @@ program p ! CHECK: %[[V_97:[0-9]+]] = fir.embox %{{.*}} : (!fir.ref>) -> !fir.box> ! CHECK: %[[V_98:[0-9]+]] = fir.convert %[[V_97]] : (!fir.box>) -> !fir.box - ! CHECK: %[[V_99:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! 
CHECK: %[[V_100:[0-9]+]] = fir.convert %[[V_99]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_99:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_100:[0-9]+]] = fir.convert %[[V_99]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_101:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_98]], %[[V_100]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, 'main, should call wft: ', t(4) @@ -122,14 +122,14 @@ program p ! CHECK: %[[V_35:[0-9]+]] = fir.shape %c2{{.*}} : (index) -> !fir.shape<1> ! CHECK: %[[V_36:[0-9]+]] = fir.embox %[[V_34]](%[[V_35]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> ! CHECK: %[[V_37:[0-9]+]] = fir.convert %[[V_36]] : (!fir.box>>) -> !fir.box - ! CHECK: %[[V_38:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_39:[0-9]+]] = fir.convert %[[V_38]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_38:[0-9]+]] = fir.address_of(@_QQF.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_39:[0-9]+]] = fir.convert %[[V_38]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_40:[0-9]+]] = fir.call @_FortranAioOutputDerivedType(%{{.*}}, %[[V_37]], %[[V_39]]) fastmath : (!fir.ref, !fir.box, !fir.ref) -> i1 print *, y(2:3) end -! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i1>> -! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i1>>>, i1> -! CHECK: fir.global linkonce @_QQdefault.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i1>>>, i1> -! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i1>> -! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i1>>>, i1> +! 
CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i8>> +! CHECK: fir.global linkonce @_QQMmFtest1.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i8>>>, i1> +! CHECK: fir.global linkonce @_QQdefault.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i8>>>, i1> +! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable.list constant : !fir.array<1xtuple, !fir.ref, i32, i8>> +! CHECK: fir.global linkonce @_QQF.nonTbpDefinedIoTable constant : tuple, !fir.ref, i32, i8>>>, i1> diff --git a/flang/test/Lower/namelist.f90 b/flang/test/Lower/namelist.f90 index 94b0ef11cb102..770af46eea744 100644 --- a/flang/test/Lower/namelist.f90 +++ b/flang/test/Lower/namelist.f90 @@ -42,8 +42,8 @@ program p ! CHECK: %[[V_42:[0-9]+]] = fir.insert_value %[[V_39]], %[[V_41]], [0 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_43:[0-9]+]] = fir.insert_value %[[V_42]], %c2{{.*}}, [1 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, i64) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_44:[0-9]+]] = fir.insert_value %[[V_43]], %[[V_24]], [2 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref, !fir.ref>>>>) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> - ! CHECK: %[[V_45:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_46:[0-9]+]] = fir.convert %[[V_45]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_45:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_46:[0-9]+]] = fir.convert %[[V_45]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_47:[0-9]+]] = fir.insert_value %[[V_44]], %[[V_46]], [3 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! 
CHECK: fir.store %[[V_47]] to %[[V_38]] : !fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>> ! CHECK: %[[V_48:[0-9]+]] = fir.convert %[[V_38]] : (!fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>>) -> !fir.ref> @@ -100,8 +100,8 @@ subroutine sss ! CHECK: %[[V_20:[0-9]+]] = fir.insert_value %[[V_17]], %[[V_19]], [0 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_21:[0-9]+]] = fir.insert_value %[[V_20]], %c1{{.*}}, [1 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, i64) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: %[[V_22:[0-9]+]] = fir.insert_value %[[V_21]], %[[V_8]], [2 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref, !fir.ref>>>>) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> - ! CHECK: %[[V_23:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i1>>>, i1>> - ! CHECK: %[[V_24:[0-9]+]] = fir.convert %[[V_23]] : (!fir.ref, !fir.ref, i32, i1>>>, i1>>) -> !fir.ref + ! CHECK: %[[V_23:[0-9]+]] = fir.address_of(@_QQdefault.nonTbpDefinedIoTable) : !fir.ref, !fir.ref, i32, i8>>>, i1>> + ! CHECK: %[[V_24:[0-9]+]] = fir.convert %[[V_23]] : (!fir.ref, !fir.ref, i32, i8>>>, i1>>) -> !fir.ref ! CHECK: %[[V_25:[0-9]+]] = fir.insert_value %[[V_22]], %[[V_24]], [3 : index] : (tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref>, !fir.ref) -> tuple, i64, !fir.ref, !fir.ref>>>>, !fir.ref> ! CHECK: fir.store %[[V_25]] to %[[V_16]] : !fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>> ! CHECK: %[[V_26:[0-9]+]] = fir.convert %[[V_16]] : (!fir.ref, i64, !fir.ref, !fir.ref>>>>, !fir.ref>>) -> !fir.ref> diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90 index 2e05b652822b5..d1a844eddd106 100644 --- a/flang/test/Lower/volatile-openmp.f90 +++ b/flang/test/Lower/volatile-openmp.f90 @@ -23,11 +23,11 @@ ! CHECK: %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref>>}>> ! 
CHECK: %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref>>}>>) -> !fir.ref>>}>, volatile> ! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEcontainer"} : (!fir.ref>>}>, volatile>) -> (!fir.ref>>}>, volatile>, !fir.ref>>}>, volatile>) -! CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> +! 
CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> ! CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1> -! 
CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, 
!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) -! CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>> -! 
CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, 
!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) +! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> 
(!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) +! 
CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>> +! CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) 
-> (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) ! CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>, volatile>) -> !fir.ref>>, volatile> ! 
CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref>>, volatile> ! CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box>>, index) -> (index, index, index) diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90 index bb20c546e0261..aeec336ea58ea 100644 --- a/flang/test/Semantics/typeinfo01.f90 +++ b/flang/test/Semantics/typeinfo01.f90 @@ -87,8 +87,8 @@ subroutine s2(x, y) !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] -!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] 
+!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s1)] +!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] !CHECK: .v.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s2,name=.n.s1)] end module @@ -115,8 +115,8 @@ subroutine s2(x, y) !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) 
shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] -!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s1)] +!CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] !CHECK: .v.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s2,name=.n.s1)] end module @@ -133,7 +133,7 @@ impure elemental subroutine s1(x, y) class(t), intent(in) :: y end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 
init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=s1)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] end module @@ -156,7 +156,7 @@ subroutine s4(x) type(t), contiguous :: x(:,:,:) end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=0_1,isargcontiguousset=1_1,proc=s4)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=0_1,specialcaseflag=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=0_1,specialcaseflag=1_1,proc=s4)] end module module m09 @@ -198,7 +198,7 @@ subroutine wu(x,u,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: 
TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=3_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=4_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=2_1,specialcaseflag=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=3_1,specialcaseflag=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=4_1,specialcaseflag=0_1,proc=wu)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)] end module @@ -247,7 +247,7 @@ subroutine wu(x,u,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine !CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) -!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] +!CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,specialcaseflag=0_1,proc=wu)] end module module m11 @@ -290,7 +290,7 @@ module m13 contains procedure :: assign1, assign2 generic :: assignment(=) => assign1, assign2 - ! CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=assign1)] + ! 
CHECK: .s.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,specialcaseflag=0_1,proc=assign1)] end type contains impure elemental subroutine assign1(to, from) diff --git a/flang/test/Semantics/typeinfo02.f90 b/flang/test/Semantics/typeinfo02.f90 index 29d14c7a0f196..07293627ab492 100644 --- a/flang/test/Semantics/typeinfo02.f90 +++ b/flang/test/Semantics/typeinfo02.f90 @@ -29,5 +29,5 @@ subroutine wf2(x,u,iot,v,iostat,iomsg) character(len=*), intent(inout) :: iomsg end subroutine end module -!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf1)] -!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf2)] +!CHECK: .s.base, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=wf1)] +!CHECK: .s.extended, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=wf2)] diff --git a/flang/test/Semantics/typeinfo09.f90 b/flang/test/Semantics/typeinfo09.f90 index 3527ee6058ad8..8daa6a5f420d7 100644 --- a/flang/test/Semantics/typeinfo09.f90 +++ b/flang/test/Semantics/typeinfo09.f90 @@ -17,4 +17,4 @@ subroutine copy_impl(this, x) end interface end module -!CHECK: .s.sometype, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 
init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=copy_impl)] +!CHECK: .s.sometype, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=1_1,istypebound=1_1,specialcaseflag=0_1,proc=copy_impl)] diff --git a/flang/test/Semantics/typeinfo13.f90 b/flang/test/Semantics/typeinfo13.f90 index ad824ad3590a2..facc280815722 100644 --- a/flang/test/Semantics/typeinfo13.f90 +++ b/flang/test/Semantics/typeinfo13.f90 @@ -22,5 +22,5 @@ impure elemental subroutine override(to, from) end end -!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=2_1,isargcontiguousset=0_1,proc=override)] +!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=2_1,specialcaseflag=0_1,proc=override)] !CHECK: .v.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=baseassign,name=.n.baseassign),binding(proc=override,name=.n.override)] From bbcdad1f8eab303a9d56c76a0bced7b17c6d2656 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 16 Jul 2025 09:10:07 -0700 Subject: [PATCH 068/813] [flang][runtime] MCLOCK library routine (#148960) Add MCLOCK as an interface to std::clock(). 
--- flang-rt/lib/runtime/extensions.cpp | 8 ++++---- flang/docs/Intrinsics.md | 3 ++- flang/include/flang/Runtime/extensions.h | 9 +++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/flang-rt/lib/runtime/extensions.cpp b/flang-rt/lib/runtime/extensions.cpp index e70dff3997233..f6c39468d5655 100644 --- a/flang-rt/lib/runtime/extensions.cpp +++ b/flang-rt/lib/runtime/extensions.cpp @@ -27,10 +27,7 @@ #include #ifdef _WIN32 -#define WIN32_LEAN_AND_MEAN -#define NOMINMAX -#include - +#include "flang/Common/windows-include.h" #include inline void CtimeBuffer(char *buffer, size_t bufsize, const time_t cur_time, @@ -309,6 +306,9 @@ void RTNAME(Perror)(const char *str) { perror(str); } // GNU extension function TIME() std::int64_t RTNAME(time)() { return time(nullptr); } +// MCLOCK: returns accumulated CPU time in ticks +std::int32_t FORTRAN_PROCEDURE_NAME(mclock)() { return std::clock(); } + // Extension procedures related to I/O namespace io { diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 0118f8eb7d913..f7da6c889d413 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -709,8 +709,9 @@ CACHESIZE, EOF, FP_CLASS, INT_PTR_KIND, ISNAN, LOC MALLOC, FREE ``` -### Library subroutine +### Library subroutines and functions ``` +ticks = MCLOCK() CALL BACKTRACE() CALL FDATE(TIME) CALL GETLOG(USRNAME) diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index 06ae7f35d9b5b..b350204714431 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -12,14 +12,12 @@ #ifndef FORTRAN_RUNTIME_EXTENSIONS_H_ #define FORTRAN_RUNTIME_EXTENSIONS_H_ -#include "flang/Runtime/entry-names.h" - -#define FORTRAN_PROCEDURE_NAME(name) name##_ - #include "flang/Runtime/entry-names.h" #include #include +#define FORTRAN_PROCEDURE_NAME(name) name##_ + #ifdef _WIN32 // UID and GID don't exist on Windows, these exist to avoid errors. 
typedef std::uint32_t uid_t; @@ -89,5 +87,8 @@ int FORTRAN_PROCEDURE_NAME(ierrno)(); // GNU extension subroutine PERROR(STRING) void RTNAME(Perror)(const char *str); +// MCLOCK -- returns accumulated time in ticks +int FORTRAN_PROCEDURE_NAME(mclock)(); + } // extern "C" #endif // FORTRAN_RUNTIME_EXTENSIONS_H_ From 3de11b70620d911613a48d493048cb48bb76ec19 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 16 Jul 2025 09:10:26 -0700 Subject: [PATCH 069/813] [flang] Catch bad members of BIND(C) COMMON block (#148971) Variables that can't be BIND(C), like pointers, can't be in a BIND(C) common block, either. Fixes https://github.com/llvm/llvm-project/issues/148922. --- flang/lib/Semantics/check-declarations.cpp | 48 +++++++++++++++++----- flang/test/Semantics/bind-c18.f90 | 7 ++++ 2 files changed, 44 insertions(+), 11 deletions(-) create mode 100644 flang/test/Semantics/bind-c18.f90 diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index f9d64485f1407..a2f2906af10b8 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -151,8 +151,8 @@ class CheckHelper { void CheckProcedureAssemblyName(const Symbol &symbol); void CheckExplicitSave(const Symbol &); parser::Messages WhyNotInteroperableDerivedType(const Symbol &); - parser::Messages WhyNotInteroperableObject( - const Symbol &, bool allowNonInteroperableType = false); + parser::Messages WhyNotInteroperableObject(const Symbol &, + bool allowNonInteroperableType = false, bool forCommonBlock = false); parser::Messages WhyNotInteroperableFunctionResult(const Symbol &); parser::Messages WhyNotInteroperableProcedure(const Symbol &, bool isError); void CheckBindC(const Symbol &); @@ -519,11 +519,35 @@ void CheckHelper::Check(const Symbol &symbol) { } void CheckHelper::CheckCommonBlock(const Symbol &symbol) { + auto restorer{messages_.SetLocation(symbol.name())}; CheckGlobalName(symbol); if 
(symbol.attrs().test(Attr::BIND_C)) { CheckBindC(symbol); + for (auto ref : symbol.get().objects()) { + if (ref->has()) { + if (auto msgs{WhyNotInteroperableObject(*ref, + /*allowInteroperableType=*/false, /*forCommonBlock=*/true)}; + !msgs.empty()) { + parser::Message &reason{msgs.messages().front()}; + parser::Message *msg{nullptr}; + if (reason.IsFatal()) { + msg = messages_.Say(symbol.name(), + "'%s' may not be a member of BIND(C) COMMON block /%s/"_err_en_US, + ref->name(), symbol.name()); + } else { + msg = messages_.Say(symbol.name(), + "'%s' should not be a member of BIND(C) COMMON block /%s/"_warn_en_US, + ref->name(), symbol.name()); + } + if (msg) { + msg->Attach( + std::move(reason.set_severity(parser::Severity::Because))); + } + } + } + } } - for (MutableSymbolRef ref : symbol.get().objects()) { + for (auto ref : symbol.get().objects()) { if (ref->test(Symbol::Flag::CrayPointee)) { messages_.Say(ref->name(), "Cray pointee '%s' may not be a member of a COMMON block"_err_en_US, @@ -3154,14 +3178,16 @@ parser::Messages CheckHelper::WhyNotInteroperableDerivedType( } parser::Messages CheckHelper::WhyNotInteroperableObject( - const Symbol &symbol, bool allowNonInteroperableType) { + const Symbol &symbol, bool allowNonInteroperableType, bool forCommonBlock) { parser::Messages msgs; - if (examinedByWhyNotInteroperable_.find(symbol) != - examinedByWhyNotInteroperable_.end()) { - return msgs; + if (!forCommonBlock) { + if (examinedByWhyNotInteroperable_.find(symbol) != + examinedByWhyNotInteroperable_.end()) { + return msgs; + } + examinedByWhyNotInteroperable_.insert(symbol); } bool isExplicitBindC{symbol.attrs().test(Attr::BIND_C)}; - examinedByWhyNotInteroperable_.insert(symbol); CHECK(symbol.has()); if (isExplicitBindC && !symbol.owner().IsModule()) { msgs.Say(symbol.name(), @@ -3258,7 +3284,7 @@ parser::Messages CheckHelper::WhyNotInteroperableObject( msgs.Say(symbol.name(), "An interoperable pointer must not be CONTIGUOUS"_err_en_US); } - if 
(msgs.AnyFatalError()) { + if (!forCommonBlock && msgs.AnyFatalError()) { examinedByWhyNotInteroperable_.erase(symbol); } return msgs; @@ -3338,8 +3364,8 @@ parser::Messages CheckHelper::WhyNotInteroperableProcedure( // on the C side by either a cdesc_t * or a void *. F'2023 18.3.7 (5) bool allowNonInteroperableType{!dummy->attrs().test(Attr::VALUE) && (IsDescriptor(*dummy) || IsAssumedType(*dummy))}; - dummyMsgs = - WhyNotInteroperableObject(*dummy, allowNonInteroperableType); + dummyMsgs = WhyNotInteroperableObject( + *dummy, allowNonInteroperableType, /*forCommonBlock=*/false); } else { CheckBindC(*dummy); } diff --git a/flang/test/Semantics/bind-c18.f90 b/flang/test/Semantics/bind-c18.f90 new file mode 100644 index 0000000000000..f61111458c6d9 --- /dev/null +++ b/flang/test/Semantics/bind-c18.f90 @@ -0,0 +1,7 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 +bind(c) :: /blk/ +!ERROR: 'x' may not be a member of BIND(C) COMMON block /blk/ +common /blk/ x +!BECAUSE: A scalar interoperable variable may not be ALLOCATABLE or POINTER +integer, pointer :: x +end From 4f8597f071bab5113a945bd653bec84bd820d4a3 Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Wed, 16 Jul 2025 21:46:12 +0530 Subject: [PATCH 070/813] [LoopInterchange] Add test for floating point math flags (#149090) Adding a test where both `ninf` and `reassoc` flags are present on the instruction. We don't know yet if it is legal to interchange. Prima facie, it does not look like it should be legal but more analysis is needed. 
--- .../reductions-non-wrapped-operations.ll | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll index 3ed69485bc8f2..0eb6fe98b8bb7 100644 --- a/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll +++ b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll @@ -411,6 +411,44 @@ exit: ret void } +; FIXME: Is it really legal to interchange the loops when +; both reassoc and ninf are set? +; Check that the interchange is legal if the floating-point addition is marked +; as reassoc. +; +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_reassoc_ninf_fadd +define void @reduction_reassoc_ninf_fadd(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %sum.j.next = fadd reassoc ninf float %sum.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %sum.i.lcssa = phi float [ %sum.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + ; Check that the loops aren't exchanged if there is a reduction of ; non-reassociative floating-point multiplication. 
; From e8dc96d9de14c4b2317b11b8bc6e9310113697b3 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 16 Jul 2025 09:31:43 -0700 Subject: [PATCH 071/813] [lldb] Document MCP tools & resources (#148708) Add documentation for the tools and resources exposed by LLDB's MCP server. --- lldb/docs/use/mcp.md | 64 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/lldb/docs/use/mcp.md b/lldb/docs/use/mcp.md index 375c164fe771c..b7474246b54f3 100644 --- a/lldb/docs/use/mcp.md +++ b/lldb/docs/use/mcp.md @@ -75,7 +75,69 @@ Configuration example for [Visual Studio Code](https://code.visualstudio.com/doc } ``` -### Troubleshooting +## Tools + +Tools are a primitive in the Model Context Protocol that enable servers to +expose functionality to clients. + +LLDB's MCP integration exposes one tool, named `lldb_command` which allows the +model to run the same commands a user would type in the LLDB command +interpreter. It takes two arguments: + +1. The unique debugger ID as a number. +2. The command and its arguments as a string. + +## Resources + +Resources are a primitive in the Model Context Protocol that allow servers to +expose content that can be read by clients. + +LLDB's MCP integration exposes a resource for each debugger and target +instance. 
Debugger resources are accessible using the following URI: + +``` +lldb://debugger/ +``` + +Example output: + +```json +{ + "contents": [ + { + "uri": "lldb://debugger/1", + "mimeType": "application/json", + "text": "{\"debugger_id\":1,\"name\":\"debugger_1\",\"num_targets\":1}" + } + ] +} +``` + +Debuggers can contain one or more targets, which are accessible using the +following URI: + +``` +lldb://debugger//target/ +``` + +Example output: + +```json +{ + "contents": [ + { + "uri": "lldb://debugger/1/target/0", + "mimeType": "application/json", + "text": "{\"arch\":\"arm64-apple-macosx26.0.0\",\"debugger_id\":1,\"dummy\":false,\"path\":\"/bin/count\",\"platform\":\"host\",\"selected\":true,\"target_idx\":0}" + } + ] +} +``` + +Note that unlike the debugger id, which is unique, the target index is not +stable and may be reused when a target is removed and a new target is added. + +## Troubleshooting The MCP server uses the `Host` log channel. You can enable logging with the `log enable` command. From b470ac419d7e8eec6c8a27539096e38a1751ee12 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 16 Jul 2025 17:32:27 +0100 Subject: [PATCH 072/813] [DebugInfo] Delete debug-intrinsic verifier checks (#149066) We no longer produce debug-intrinsics, and whenever they're spotted in bitcode or textual IR they get autoupgraded. We could quite reasonably reject them out of hand as a construct that shouldn't be present. However, the DXIL folks are likely to be converting records back to intrinsics for years to come, and there's no need to make that an error. There's no value in verifying them IMO. 
--- llvm/lib/IR/Verifier.cpp | 197 ++------------------------------------- 1 file changed, 8 insertions(+), 189 deletions(-) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 48688453b6986..8c8ed3c5e47ba 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -597,7 +597,6 @@ class Verifier : public InstVisitor, VerifierSupport { void visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call); void visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI); void visitVPIntrinsic(VPIntrinsic &VPI); - void visitDbgIntrinsic(StringRef Kind, DbgVariableIntrinsic &DII); void visitDbgLabelIntrinsic(StringRef Kind, DbgLabelInst &DLI); void visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI); void visitAtomicRMWInst(AtomicRMWInst &RMWI); @@ -636,15 +635,12 @@ class Verifier : public InstVisitor, VerifierSupport { void verifyFrameRecoverIndices(); void verifySiblingFuncletUnwinds(); - void verifyFragmentExpression(const DbgVariableIntrinsic &I); void verifyFragmentExpression(const DbgVariableRecord &I); template void verifyFragmentExpression(const DIVariable &V, DIExpression::FragmentInfo Fragment, ValueOrMetadata *Desc); - void verifyFnArgs(const DbgVariableIntrinsic &I); void verifyFnArgs(const DbgVariableRecord &DVR); - void verifyNotEntryValue(const DbgVariableIntrinsic &I); void verifyNotEntryValue(const DbgVariableRecord &I); /// Module-level debug info verification... 
@@ -5497,11 +5493,6 @@ void Verifier::visitInstruction(Instruction &I) { } } - if (auto *DII = dyn_cast(&I)) { - verifyFragmentExpression(*DII); - verifyNotEntryValue(*DII); - } - SmallVector, 4> MDs; I.getAllMetadata(MDs); for (auto Attachment : MDs) { @@ -5706,18 +5697,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { visitConstrainedFPIntrinsic(cast(Call)); break; case Intrinsic::dbg_declare: // llvm.dbg.declare - Check(isa(Call.getArgOperand(0)), - "invalid llvm.dbg.declare intrinsic call 1", Call); - visitDbgIntrinsic("declare", cast(Call)); - break; - case Intrinsic::dbg_value: // llvm.dbg.value - visitDbgIntrinsic("value", cast(Call)); - break; - case Intrinsic::dbg_assign: // llvm.dbg.assign - visitDbgIntrinsic("assign", cast(Call)); - break; - case Intrinsic::dbg_label: // llvm.dbg.label - visitDbgLabelIntrinsic("label", cast(Call)); + case Intrinsic::dbg_value: // llvm.dbg.value + case Intrinsic::dbg_assign: // llvm.dbg.assign + case Intrinsic::dbg_label: // llvm.dbg.label + // We no longer interpret debug intrinsics (the old variable-location + // design). They're meaningless as far as LLVM is concerned we could make + // it an error for them to appear, but it's possible we'll have users + // converting back to intrinsics for the forseeable future (such as DXIL), + // so tolerate their existance. break; case Intrinsic::memcpy: case Intrinsic::memcpy_inline: @@ -7126,123 +7113,6 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { } } -void Verifier::visitDbgIntrinsic(StringRef Kind, DbgVariableIntrinsic &DII) { - auto *MD = DII.getRawLocation(); - CheckDI(isa(MD) || isa(MD) || - (isa(MD) && !cast(MD)->getNumOperands()), - "invalid llvm.dbg." + Kind + " intrinsic address/value", &DII, MD); - CheckDI(isa(DII.getRawVariable()), - "invalid llvm.dbg." + Kind + " intrinsic variable", &DII, - DII.getRawVariable()); - CheckDI(isa(DII.getRawExpression()), - "invalid llvm.dbg." 
+ Kind + " intrinsic expression", &DII, - DII.getRawExpression()); - - if (auto *DAI = dyn_cast(&DII)) { - CheckDI(isa(DAI->getRawAssignID()), - "invalid llvm.dbg.assign intrinsic DIAssignID", &DII, - DAI->getRawAssignID()); - const auto *RawAddr = DAI->getRawAddress(); - CheckDI( - isa(RawAddr) || - (isa(RawAddr) && !cast(RawAddr)->getNumOperands()), - "invalid llvm.dbg.assign intrinsic address", &DII, - DAI->getRawAddress()); - CheckDI(isa(DAI->getRawAddressExpression()), - "invalid llvm.dbg.assign intrinsic address expression", &DII, - DAI->getRawAddressExpression()); - // All of the linked instructions should be in the same function as DII. - for (Instruction *I : at::getAssignmentInsts(DAI)) - CheckDI(DAI->getFunction() == I->getFunction(), - "inst not in same function as dbg.assign", I, DAI); - } - - // Ignore broken !dbg attachments; they're checked elsewhere. - if (MDNode *N = DII.getDebugLoc().getAsMDNode()) - if (!isa(N)) - return; - - BasicBlock *BB = DII.getParent(); - Function *F = BB ? BB->getParent() : nullptr; - - // The scopes for variables and !dbg attachments must agree. - DILocalVariable *Var = DII.getVariable(); - DILocation *Loc = DII.getDebugLoc(); - CheckDI(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment", - &DII, BB, F); - - DISubprogram *VarSP = getSubprogram(Var->getRawScope()); - DISubprogram *LocSP = getSubprogram(Loc->getRawScope()); - if (!VarSP || !LocSP) - return; // Broken scope chains are checked elsewhere. - - CheckDI(VarSP == LocSP, - "mismatched subprogram between llvm.dbg." + Kind + - " variable and !dbg attachment", - &DII, BB, F, Var, Var->getScope()->getSubprogram(), Loc, - Loc->getScope()->getSubprogram()); - - // This check is redundant with one in visitLocalVariable(). 
- CheckDI(isType(Var->getRawType()), "invalid type ref", Var, - Var->getRawType()); - verifyFnArgs(DII); -} - -void Verifier::visitDbgLabelIntrinsic(StringRef Kind, DbgLabelInst &DLI) { - CheckDI(isa(DLI.getRawLabel()), - "invalid llvm.dbg." + Kind + " intrinsic variable", &DLI, - DLI.getRawLabel()); - - // Ignore broken !dbg attachments; they're checked elsewhere. - if (MDNode *N = DLI.getDebugLoc().getAsMDNode()) - if (!isa(N)) - return; - - BasicBlock *BB = DLI.getParent(); - Function *F = BB ? BB->getParent() : nullptr; - - // The scopes for variables and !dbg attachments must agree. - DILabel *Label = DLI.getLabel(); - DILocation *Loc = DLI.getDebugLoc(); - Check(Loc, "llvm.dbg." + Kind + " intrinsic requires a !dbg attachment", &DLI, - BB, F); - - DISubprogram *LabelSP = getSubprogram(Label->getRawScope()); - DISubprogram *LocSP = getSubprogram(Loc->getRawScope()); - if (!LabelSP || !LocSP) - return; - - CheckDI(LabelSP == LocSP, - "mismatched subprogram between llvm.dbg." + Kind + - " label and !dbg attachment", - &DLI, BB, F, Label, Label->getScope()->getSubprogram(), Loc, - Loc->getScope()->getSubprogram()); -} - -void Verifier::verifyFragmentExpression(const DbgVariableIntrinsic &I) { - DILocalVariable *V = dyn_cast_or_null(I.getRawVariable()); - DIExpression *E = dyn_cast_or_null(I.getRawExpression()); - - // We don't know whether this intrinsic verified correctly. - if (!V || !E || !E->isValid()) - return; - - // Nothing to do if this isn't a DW_OP_LLVM_fragment expression. - auto Fragment = E->getFragmentInfo(); - if (!Fragment) - return; - - // The frontend helps out GDB by emitting the members of local anonymous - // unions as artificial local variables with shared storage. When SROA splits - // the storage for artificial local variables that are smaller than the entire - // union, the overhang piece will be outside of the allotted space for the - // variable and this check fails. 
- // FIXME: Remove this check as soon as clang stops doing this; it hides bugs. - if (V->isArtificial()) - return; - - verifyFragmentExpression(*V, *Fragment, &I); -} void Verifier::verifyFragmentExpression(const DbgVariableRecord &DVR) { DILocalVariable *V = dyn_cast_or_null(DVR.getRawVariable()); DIExpression *E = dyn_cast_or_null(DVR.getRawExpression()); @@ -7285,34 +7155,6 @@ void Verifier::verifyFragmentExpression(const DIVariable &V, CheckDI(FragSize != *VarSize, "fragment covers entire variable", Desc, &V); } -void Verifier::verifyFnArgs(const DbgVariableIntrinsic &I) { - // This function does not take the scope of noninlined function arguments into - // account. Don't run it if current function is nodebug, because it may - // contain inlined debug intrinsics. - if (!HasDebugInfo) - return; - - // For performance reasons only check non-inlined ones. - if (I.getDebugLoc()->getInlinedAt()) - return; - - DILocalVariable *Var = I.getVariable(); - CheckDI(Var, "dbg intrinsic without variable"); - - unsigned ArgNo = Var->getArg(); - if (!ArgNo) - return; - - // Verify there are no duplicate function argument debug info entries. - // These will cause hard-to-debug assertions in the DWARF backend. - if (DebugFnArgs.size() < ArgNo) - DebugFnArgs.resize(ArgNo, nullptr); - - auto *Prev = DebugFnArgs[ArgNo - 1]; - DebugFnArgs[ArgNo - 1] = Var; - CheckDI(!Prev || (Prev == Var), "conflicting debug info for argument", &I, - Prev, Var); -} void Verifier::verifyFnArgs(const DbgVariableRecord &DVR) { // This function does not take the scope of noninlined function arguments into // account. Don't run it if current function is nodebug, because it may @@ -7342,29 +7184,6 @@ void Verifier::verifyFnArgs(const DbgVariableRecord &DVR) { Prev, Var); } -void Verifier::verifyNotEntryValue(const DbgVariableIntrinsic &I) { - DIExpression *E = dyn_cast_or_null(I.getRawExpression()); - - // We don't know whether this intrinsic verified correctly. 
- if (!E || !E->isValid()) - return; - - if (isa(I.getRawLocation())) { - Value *VarValue = I.getVariableLocationOp(0); - if (isa(VarValue) || isa(VarValue)) - return; - // We allow EntryValues for swift async arguments, as they have an - // ABI-guarantee to be turned into a specific register. - if (auto *ArgLoc = dyn_cast_or_null(VarValue); - ArgLoc && ArgLoc->hasAttribute(Attribute::SwiftAsync)) - return; - } - - CheckDI(!E->isEntryValue(), - "Entry values are only allowed in MIR unless they target a " - "swiftasync Argument", - &I); -} void Verifier::verifyNotEntryValue(const DbgVariableRecord &DVR) { DIExpression *E = dyn_cast_or_null(DVR.getRawExpression()); From 6f660e269242d51a8d36a9a1f98a2244e8311a1a Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Wed, 16 Jul 2025 09:38:53 -0700 Subject: [PATCH 073/813] Fix MSVC warning in benchmark (#147357) Building LLVM with MSVC is raising the following warning: ``` llvm\third-party\benchmark\src\sysinfo.cc(375): warning C4062: enumerator 'CacheUnknown' in switch of enum '_PROCESSOR_CACHE_TYPE' is not handled ``` This change resolves the warning by moving the `Unknown` type into a case block for `CacheUnknown`. Not sure how this code flows back into the original source. 
--- third-party/benchmark/src/sysinfo.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc index 3993ae17f7fc4..837be8f9cf891 100644 --- a/third-party/benchmark/src/sysinfo.cc +++ b/third-party/benchmark/src/sysinfo.cc @@ -358,7 +358,6 @@ std::vector GetCacheSizesWindows() { C.num_sharing = static_cast(b.count()); C.level = cache.Level; C.size = cache.Size; - C.type = "Unknown"; switch (cache.Type) { case CacheUnified: C.type = "Unified"; @@ -372,6 +371,9 @@ std::vector GetCacheSizesWindows() { case CacheTrace: C.type = "Trace"; break; + case CacheUnknown: + C.type = "Unknown"; + break; } res.push_back(C); } From 8519143a9fd368e7cfcf61582683c4e48e7d67d0 Mon Sep 17 00:00:00 2001 From: Sirui Mu Date: Thu, 17 Jul 2025 00:41:28 +0800 Subject: [PATCH 074/813] [CIR] Add rotate operation (#148426) This patch adds `cir.rotate` operation for the `__builtin_rotateleft` and `__builtin_rotateright` families of builtin calls. 
--- clang/include/clang/CIR/Dialect/IR/CIROps.td | 39 +++++ clang/include/clang/CIR/MissingFeatures.h | 1 + clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 26 ++++ clang/lib/CIR/CodeGen/CIRGenFunction.h | 2 + .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 16 ++ .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h | 10 ++ clang/test/CIR/CodeGen/builtin_bit.cpp | 138 ++++++++++++++++++ 7 files changed, 232 insertions(+) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 2ce23dbb27ec6..d19cd83d78b40 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -2934,6 +2934,45 @@ def CIR_ByteSwapOp : CIR_BitOpBase<"byte_swap", }]; } +//===----------------------------------------------------------------------===// +// RotateOp +//===----------------------------------------------------------------------===// + +def CIR_RotateOp : CIR_Op<"rotate", [Pure, SameOperandsAndResultType]> { + let summary = "Rotate the bits in the operand integer"; + let description = [{ + The `cir.rotate` rotates the bits in `input` by the given amount `amount`. + The rotate direction is specified by the `left` and `right` keyword. + + `input` must be an unsigned integer and its width must be either 8, 16, 32, + or 64. The types of `input`, `amount`, and the result must all match. + + Example: + + ```mlir + %r = cir.rotate left %0, %1 : !u32i + %r = cir.rotate right %0, %1 : !u32i + ``` + }]; + + let results = (outs CIR_IntType:$result); + let arguments = (ins + CIR_UIntOfWidths<[8, 16, 32, 64]>:$input, + CIR_IntType:$amount, + UnitAttr:$rotateLeft + ); + + let assemblyFormat = [{ + (`left` $rotateLeft^) : (`right`)? 
+ $input `,` $amount `:` type($result) attr-dict + }]; + + let extraClassDeclaration = [{ + bool isRotateLeft() { return getRotateLeft(); } + bool isRotateRight() { return !getRotateLeft(); } + }]; +} + //===----------------------------------------------------------------------===// // Assume Operations //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 48e309063d38b..182e4b6784d2f 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -254,6 +254,7 @@ struct MissingFeatures { static bool dtorCleanups() { return false; } static bool completeDtors() { return false; } static bool vtableInitialization() { return false; } + static bool msvcBuiltins() { return false; } // Missing types static bool dataMemberType() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index 72e8d71c366d8..476f994959285 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -57,6 +57,20 @@ static RValue emitBuiltinBitOp(CIRGenFunction &cgf, const CallExpr *e, return RValue::get(result); } +RValue CIRGenFunction::emitRotate(const CallExpr *e, bool isRotateLeft) { + mlir::Value input = emitScalarExpr(e->getArg(0)); + mlir::Value amount = emitScalarExpr(e->getArg(1)); + + // TODO(cir): MSVC flavor bit rotate builtins use different types for input + // and amount, but cir.rotate requires them to have the same type. Cast amount + // to the type of input when necessary. 
+ assert(!cir::MissingFeatures::msvcBuiltins()); + + auto r = builder.create(getLoc(e->getSourceRange()), input, + amount, isRotateLeft); + return RValue::get(r); +} + RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, const CallExpr *e, ReturnValueSlot returnValue) { @@ -219,6 +233,18 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, mlir::Value arg = emitScalarExpr(e->getArg(0)); return RValue::get(builder.create(loc, arg)); } + + case Builtin::BI__builtin_rotateleft8: + case Builtin::BI__builtin_rotateleft16: + case Builtin::BI__builtin_rotateleft32: + case Builtin::BI__builtin_rotateleft64: + return emitRotate(e, /*isRotateLeft=*/true); + + case Builtin::BI__builtin_rotateright8: + case Builtin::BI__builtin_rotateright16: + case Builtin::BI__builtin_rotateright32: + case Builtin::BI__builtin_rotateright64: + return emitRotate(e, /*isRotateLeft=*/false); } // If this is an alias for a lib function (e.g. __builtin_sin), emit diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 1346333739bc1..3baabba5adfe1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1030,6 +1030,8 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitReturnStmt(const clang::ReturnStmt &s); + RValue emitRotate(const CallExpr *e, bool isRotateLeft); + mlir::Value emitScalarConstant(const ConstantEmission &constant, Expr *e); /// Emit a conversion from the specified type to the specified destination diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 7dcea0c8eb529..840e856ba0cf8 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -872,6 +872,21 @@ mlir::LogicalResult CIRToLLVMReturnOpLowering::matchAndRewrite( return mlir::LogicalResult::success(); } +mlir::LogicalResult 
CIRToLLVMRotateOpLowering::matchAndRewrite( + cir::RotateOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + // Note that LLVM intrinsic calls to @llvm.fsh{r,l}.i* have the same type as + // the operand. + mlir::Value input = adaptor.getInput(); + if (op.isRotateLeft()) + rewriter.replaceOpWithNewOp(op, input, input, + adaptor.getAmount()); + else + rewriter.replaceOpWithNewOp(op, input, input, + adaptor.getAmount()); + return mlir::LogicalResult::success(); +} + static mlir::LogicalResult rewriteCallOrInvoke(mlir::Operation *op, mlir::ValueRange callOperands, mlir::ConversionPatternRewriter &rewriter, @@ -2077,6 +2092,7 @@ void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMGetBitfieldOpLowering, CIRToLLVMGetGlobalOpLowering, CIRToLLVMGetMemberOpLowering, + CIRToLLVMRotateOpLowering, CIRToLLVMSelectOpLowering, CIRToLLVMSetBitfieldOpLowering, CIRToLLVMShiftOpLowering, diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index 3c30b1bc5b072..3faf1e900848e 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -160,6 +160,16 @@ class CIRToLLVMReturnOpLowering mlir::ConversionPatternRewriter &) const override; }; +class CIRToLLVMRotateOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::RotateOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + class CIRToLLVMCallOpLowering : public mlir::OpConversionPattern { public: using mlir::OpConversionPattern::OpConversionPattern; diff --git a/clang/test/CIR/CodeGen/builtin_bit.cpp b/clang/test/CIR/CodeGen/builtin_bit.cpp index f017b6eb51971..4ac82bd749e8a 100644 --- a/clang/test/CIR/CodeGen/builtin_bit.cpp +++ b/clang/test/CIR/CodeGen/builtin_bit.cpp @@ -416,3 +416,141 @@ unsigned long long test_builtin_bswap64(unsigned long long x) { // 
OGCG-LABEL: @_Z20test_builtin_bswap64y // OGCG: %{{.+}} = call i64 @llvm.bswap.i64(i64 %{{.+}}) + +unsigned char test_builtin_rotateleft8(unsigned char x, unsigned char y) { + return __builtin_rotateleft8(x, y); +} + +// CIR-LABEL: @_Z24test_builtin_rotateleft8hh +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u8i + +// LLVM-LABEL: @_Z24test_builtin_rotateleft8hh +// LLVM: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %{{.+}} = call i8 @llvm.fshl.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z24test_builtin_rotateleft8hh +// OGCG: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %{{.+}} = call i8 @llvm.fshl.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +unsigned short test_builtin_rotateleft16(unsigned short x, unsigned short y) { + return __builtin_rotateleft16(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateleft16tt +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u16i + +// LLVM-LABEL: @_Z25test_builtin_rotateleft16tt +// LLVM: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %{{.+}} = call i16 @llvm.fshl.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateleft16tt +// OGCG: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %{{.+}} = call i16 @llvm.fshl.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +unsigned test_builtin_rotateleft32(unsigned x, unsigned y) { + return __builtin_rotateleft32(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateleft32jj +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u32i + +// LLVM-LABEL: @_Z25test_builtin_rotateleft32jj +// LLVM: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, 
align 4 +// LLVM-NEXT: %{{.+}} = call i32 @llvm.fshl.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateleft32jj +// OGCG: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %{{.+}} = call i32 @llvm.fshl.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +unsigned long long test_builtin_rotateleft64(unsigned long long x, + unsigned long long y) { + return __builtin_rotateleft64(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateleft64yy +// CIR: %{{.+}} = cir.rotate left %{{.+}}, %{{.+}} : !u64i + +// LLVM-LABEL: @_Z25test_builtin_rotateleft64yy +// LLVM: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %{{.+}} = call i64 @llvm.fshl.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateleft64yy +// OGCG: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %{{.+}} = call i64 @llvm.fshl.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) + +unsigned char test_builtin_rotateright8(unsigned char x, unsigned char y) { + return __builtin_rotateright8(x, y); +} + +// CIR-LABEL: @_Z25test_builtin_rotateright8hh +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u8i + +// LLVM-LABEL: @_Z25test_builtin_rotateright8hh +// LLVM: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// LLVM-NEXT: %{{.+}} = call i8 @llvm.fshr.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z25test_builtin_rotateright8hh +// OGCG: %[[INPUT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i8, ptr %{{.+}}, align 1 +// OGCG-NEXT: %{{.+}} = call i8 @llvm.fshr.i8(i8 %[[INPUT]], i8 %[[INPUT]], i8 %[[AMOUNT]]) + +unsigned short test_builtin_rotateright16(unsigned short x, unsigned short y) { 
+ return __builtin_rotateright16(x, y); +} + +// CIR-LABEL: @_Z26test_builtin_rotateright16tt +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u16i + +// LLVM-LABEL: @_Z26test_builtin_rotateright16tt +// LLVM: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// LLVM-NEXT: %{{.+}} = call i16 @llvm.fshr.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z26test_builtin_rotateright16tt +// OGCG: %[[INPUT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i16, ptr %{{.+}}, align 2 +// OGCG-NEXT: %{{.+}} = call i16 @llvm.fshr.i16(i16 %[[INPUT]], i16 %[[INPUT]], i16 %[[AMOUNT]]) + +unsigned test_builtin_rotateright32(unsigned x, unsigned y) { + return __builtin_rotateright32(x, y); +} + +// CIR-LABEL: @_Z26test_builtin_rotateright32jj +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u32i + +// LLVM-LABEL: @_Z26test_builtin_rotateright32jj +// LLVM: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// LLVM-NEXT: %{{.+}} = call i32 @llvm.fshr.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +// OGCG-LABEL: @_Z26test_builtin_rotateright32jj +// OGCG: %[[INPUT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i32, ptr %{{.+}}, align 4 +// OGCG-NEXT: %{{.+}} = call i32 @llvm.fshr.i32(i32 %[[INPUT]], i32 %[[INPUT]], i32 %[[AMOUNT]]) + +unsigned long long test_builtin_rotateright64(unsigned long long x, + unsigned long long y) { + return __builtin_rotateright64(x, y); +} + +// CIR-LABEL: @_Z26test_builtin_rotateright64yy +// CIR: %{{.+}} = cir.rotate right %{{.+}}, %{{.+}} : !u64i + +// LLVM-LABEL: @_Z26test_builtin_rotateright64yy +// LLVM: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// LLVM-NEXT: %{{.+}} = call i64 @llvm.fshr.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) + +// OGCG-LABEL: 
@_Z26test_builtin_rotateright64yy +// OGCG: %[[INPUT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %[[AMOUNT:.+]] = load i64, ptr %{{.+}}, align 8 +// OGCG-NEXT: %{{.+}} = call i64 @llvm.fshr.i64(i64 %[[INPUT]], i64 %[[INPUT]], i64 %[[AMOUNT]]) From 97922a7d401a4bbbc74013d92f98119e5bdfaebd Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 16 Jul 2025 16:43:49 +0000 Subject: [PATCH 075/813] [lldb][docs] Add CAMKE_BUILD_TYPE to standlone build instructions The first stage requires it, the second appears to default to debug mode, which works but it's better we advise release mode to match. --- lldb/docs/resources/build.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst index 480430fede928..4bbec891da0b3 100644 --- a/lldb/docs/resources/build.rst +++ b/lldb/docs/resources/build.rst @@ -211,6 +211,7 @@ Clang. Then we build the ``ALL`` target with ninja: $ cmake -B /path/to/llvm-build -G Ninja \ -DLLVM_ENABLE_PROJECTS=clang \ + -DCMAKE_BUILD_TYPE=Release \ [] /path/to/llvm-project/llvm $ ninja @@ -224,6 +225,7 @@ build directory for Clang, remember to pass its module path via ``Clang_DIR`` :: $ cmake -B /path/to/lldb-build -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ -DLLVM_DIR=/path/to/llvm-build/lib/cmake/llvm \ [] /path/to/llvm-project/lldb $ ninja lldb lldb-server From 9f364fe9c446d498f46efff327871dc62db8212f Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 16 Jul 2025 17:47:21 +0100 Subject: [PATCH 076/813] [lldb][docs] Add section on testing with QEMU user (#149057) This is not recommended to basically anyone but on occasion it's useful and could be used for testing with other simulator programs for example bare metal simulators. It is not something we do officially support or make any quality guarantees for. 
Adding this is also an excuse to document the limitations and make the time spent setting up system mode look more worthwhile and might be good to cite in future discussions about testing in simulation. --- lldb/docs/resources/qemu-testing.rst | 47 ++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/lldb/docs/resources/qemu-testing.rst b/lldb/docs/resources/qemu-testing.rst index 8571287a04262..90b8fd50cb5c4 100644 --- a/lldb/docs/resources/qemu-testing.rst +++ b/lldb/docs/resources/qemu-testing.rst @@ -167,3 +167,50 @@ The result of this is that: Your VM configuration should have ports ``54321`` and ``49140`` forwarded for this to work. + +QEMU user mode emulation +------------------------ + +Serious testing of LLDB should be done using system mode emulation. The following +is presented for information only and is not a supported testing configuration +supported by the LLDB project. + +However, it is possible to run the test suite against user mode QEMU if you just +want to test a specific aspect of ``lldb`` and are ok ignoring a lot of expected +failures. This method can also be adapted for simulators with a qemu-like command +line interface. + +(``lldb-server`` cannot be tested using user mode QEMU because that does not +emulate the debugging system calls that ``lldb-server`` tries to make) + +Change ``LLDB_TEST_USER_ARGS`` to choose the ``qemu-user`` platform and +configure it for your architecture. The example below is for AArch64 and assumes +that ``qemu-aarch64`` is installed and on your path. + +If you need to override how the ``qemu-user`` platform finds the QEMU binary, +look up the rest of the platform's settings in LLDB. + +:: + + -DLLDB_TEST_USER_ARGS="--platform-name;qemu-user;--setting;platform.plugin.qemu-user.architecture=aarch64;--arch;aarch64" + +Also set ``LLDB_TEST_COMPILER`` to something that can target the emulated +architecture. 
Then you should be able to run ``ninja check-lldb`` and it will +run the tests on QEMU user automatically. + +You will see a number of failures compared to a normal test run. Reasons for +this can be, but are not limited to: + +* QEMU's built-in debug stub acting differently and supporting different + features to different extents, when compared to ``lldb-server``. We try to + be compatible but LLDB is not regularly tested with QEMU user. + +* Tests that spawn new processes to attach to. QEMU user only emulates a single + process. + +* Watchpoints. Either these are not emulated or behave differently to real + hardware. Add ``--skip-category;watchpoint`` to ``-DLLDB_TEST_USER_ARGS`` to + skip those. + +* Lack of memory region information due to QEMU communicating this in the + GDB server format which LLDB does not use. From bd6c16c6cfe28105d992fa997dce6e18ea86a5a4 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Wed, 16 Jul 2025 09:51:25 -0700 Subject: [PATCH 077/813] [MachineOutliner] Avoid ranges that cross bundle boundary (#148977) We found some code that was hitting this assert because `getOutlinableRanges()` was trying to create a range that crossed a bundle boundary. https://github.com/llvm/llvm-project/blob/ae3bba4d15a10646ea91c6c0795633b82939857b/llvm/include/llvm/CodeGen/MachineInstrBundleIterator.h#L133-L135 Avoid creating those ranges and add a test that hit the assert. 
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 13 ++++-- .../machine-outliner-safe-range-in-middle.mir | 42 +++++++++++++++---- 2 files changed, 43 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index cdb224d0cd09f..996b0edd24200 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -9585,10 +9585,15 @@ AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, }; auto SaveRangeIfNonEmpty = [&RangeLen, &Ranges, &RangeBegin, &RangeEnd]() { // At least one unsafe register is not dead. We do not want to outline at - // this point. If it is long enough to outline from, save the range - // [RangeBegin, RangeEnd). - if (RangeLen > 1) - Ranges.push_back(std::make_pair(RangeBegin, RangeEnd)); + // this point. If it is long enough to outline from and does not cross a + // bundle boundary, save the range [RangeBegin, RangeEnd). + if (RangeLen <= 1) + return; + if (!RangeBegin.isEnd() && RangeBegin->isBundledWithPred()) + return; + if (!RangeEnd.isEnd() && RangeEnd->isBundledWithPred()) + return; + Ranges.emplace_back(RangeBegin, RangeEnd); }; // Find the first point where all unsafe registers are dead. 
// FIND: <-- end of first potential range diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-safe-range-in-middle.mir b/llvm/test/CodeGen/AArch64/machine-outliner-safe-range-in-middle.mir index 23811425101fd..b99ca25a5432d 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-safe-range-in-middle.mir +++ b/llvm/test/CodeGen/AArch64/machine-outliner-safe-range-in-middle.mir @@ -16,27 +16,53 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: BL @OUTLINED_FUNCTION_0, implicit-def $lr, implicit $sp, implicit-def $lr, implicit-def $x0, implicit-def $x1, implicit-def $x2, implicit-def $x3, implicit-def $x16, implicit $x0, implicit $sp ; CHECK-NEXT: $x9 = ADDXri $x16, 16, 0 - ; CHECK-NEXT: $x16 = ADDXri killed $x16, 16, 0 + ; CHECK-NEXT: $x16 = ADDXri killed $x16, 16, 1 ; CHECK-NEXT: BL @OUTLINED_FUNCTION_0, implicit-def $lr, implicit $sp, implicit-def $lr, implicit-def $x0, implicit-def $x1, implicit-def $x2, implicit-def $x3, implicit-def $x16, implicit $x0, implicit $sp ; CHECK-NEXT: $x9 = ADDXri $x9, 16, 0 - ; CHECK-NEXT: $x16 = ADDXri killed $x16, 16, 0 + ; CHECK-NEXT: $x16 = ADDXri killed $x16, 16, 2 ; CHECK-NEXT: RET undef $x9 $x0 = ADDXri $x0, 0, 0 $x1 = ADDXri $x0, 1, 0 $x2 = ADDXri $x0, 2, 0 $x3 = ADDXri $x0, 3, 0 - - ; End safe range $x16 = ADDXri $x0, 16, 0 $x9 = ADDXri $x16, 16, 0 - $x16 = ADDXri killed $x16, 16, 0 - + $x16 = ADDXri killed $x16, 16, 1 + ; End safe range $x0 = ADDXri $x0, 0, 0 $x1 = ADDXri $x0, 1, 0 $x2 = ADDXri $x0, 2, 0 $x3 = ADDXri $x0, 3, 0 - ; End safe range $x16 = ADDXri $x0, 16, 0 $x9 = ADDXri $x9, 16, 0 - $x16 = ADDXri killed $x16, 16, 0 + $x16 = ADDXri killed $x16, 16, 2 + ; End safe range RET undef $x9 +... 
+--- +name: unsafe_range_bundle +tracksRegLiveness: true +machineFunctionInfo: + hasRedZone: false +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: unsafe_range_bundle + ; CHECK: liveins: $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x0 = ADDXri $x0, 0, 0 + ; CHECK-NEXT: $x16 = ADDXri $x0, 16, 0 + ; CHECK-NEXT: BUNDLE { + ; CHECK-NEXT: $x16 = ADDXri killed $x16, 16, 3 + ; CHECK-NEXT: $x1 = ADDXri $x0, 0, 0 + ; CHECK-NEXT: } + ; CHECK-NEXT: RET undef $x9 + $x0 = ADDXri $x0, 0, 0 + $x16 = ADDXri $x0, 16, 0 + BUNDLE { ; Bundle crosses a safe range + $x16 = ADDXri killed $x16, 16, 3 + ; End safe range + $x1 = ADDXri $x0, 0, 0 + } + RET undef $x9 +... From 8349bbd0b98c84836d55593c7eb035c2b0f4e87a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 16 Jul 2025 10:08:11 -0700 Subject: [PATCH 078/813] [flang][cuda] Exit early when there is no device components (#149005) - Exit early when there is no device components - Make the retrieval of the record type more robust --- flang/lib/Lower/ConvertVariable.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 2bfa9618aa4b9..6c4516686f9d0 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -802,13 +802,20 @@ initializeDeviceComponentAllocator(Fortran::lower::AbstractConverter &converter, const Fortran::semantics::DerivedTypeSpec *derived{type ? type->AsDerived() : nullptr}; if (derived) { + if (!FindCUDADeviceAllocatableUltimateComponent(*derived)) + return; // No device components. 
+ fir::FirOpBuilder &builder = converter.getFirOpBuilder(); mlir::Location loc = converter.getCurrentLocation(); fir::ExtendedValue exv = converter.getSymbolExtendedValue(symbol.GetUltimate(), &symMap); - auto recTy = mlir::dyn_cast( - fir::unwrapRefType(fir::getBase(exv).getType())); + mlir::Type baseTy = fir::unwrapRefType(fir::getBase(exv).getType()); + if (auto boxTy = mlir::dyn_cast(baseTy)) + baseTy = boxTy.getEleTy(); + baseTy = fir::unwrapRefType(baseTy); + auto recTy = + mlir::dyn_cast(fir::unwrapSequenceType(baseTy)); assert(recTy && "expected fir::RecordType"); llvm::SmallVector coordinates; From 0f0305021c6e880f8787b9be6606c27e1a0641ed Mon Sep 17 00:00:00 2001 From: Udit Kumar Agarwal Date: Wed, 16 Jul 2025 10:12:58 -0700 Subject: [PATCH 079/813] [CI] Make email check workflow fail when author's email is private in Github UI (#148694) **Problem** Currently, the email check workflow uses `git` to see email used for the last commit but the email address used when merging is actually governed by GitHub settings not what's stored in `git`. Due to this, the email check workflow passes even when the author's email is private in Github. We saw several such cases in our fork of llvm. See https://github.com/intel/llvm/issues/17675 **Solution** Try to find user's email using GH's GraphQL APIs. User's email will be null if it's hidden in the profile. 
--------- Signed-off-by: Agarwal, Udit --- .github/workflows/email-check.yaml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml index 904ad718f97dd..3339b1eed667b 100644 --- a/.github/workflows/email-check.yaml +++ b/.github/workflows/email-check.yaml @@ -20,14 +20,30 @@ jobs: - name: Extract author email id: author + env: + GH_TOKEN: ${{ github.token }} run: | - git log -1 - echo "EMAIL=$(git show -s --format='%ae' HEAD~0)" >> $GITHUB_OUTPUT + # Use Github GraphQL APIs to get the email associated with the PR author because this takes into account the GitHub settings for email privacy. + query=' + query($login: String!) { + user(login: $login) { + email + } + }' + + PR_AUTHOR=${{ github.event.pull_request.user.login }} + + email=$(gh api graphql -f login="$PR_AUTHOR" -f query="$query" --jq '.data.user.email') + echo "EMAIL_AUTHOR_GH_UI=$email" >> "$GITHUB_OUTPUT" + # Create empty comment file echo "[]" > comments + # When EMAIL_AUTHOR_GH_UI is NULL, author's email is hidden in GitHub UI. + # In this case, we warn the user to turn off "Keep my email addresses private" + # setting in their account. - name: Validate author email - if: ${{ endsWith(steps.author.outputs.EMAIL, 'noreply.github.com') }} + if: ${{ steps.author.outputs.EMAIL_AUTHOR_GH_UI == '' }} env: COMMENT: >- ⚠️ We detected that you are using a GitHub private e-mail address to contribute to the repo.
From 8d21025c3aeb1c98caef08f8446ec138c62288d1 Mon Sep 17 00:00:00 2001 From: Brad Richardson Date: Wed, 16 Jul 2025 12:14:26 -0500 Subject: [PATCH 080/813] [flang] Parallel runtime library design doc (PRIF) (#76088) The Parallel Runtime Interface for Fortran (PRIF) specifies the interface design for supporting Fortran's multi-image parallel features in flang. --------- Co-authored-by: Katherine Rasmussen Co-authored-by: Damian Rouson Co-authored-by: Dan Bonachea --- flang/docs/ParallelMultiImageFortranRuntime.md | 18 ++++++++++++++++++ flang/docs/index.md | 1 + 2 files changed, 19 insertions(+) create mode 100644 flang/docs/ParallelMultiImageFortranRuntime.md diff --git a/flang/docs/ParallelMultiImageFortranRuntime.md b/flang/docs/ParallelMultiImageFortranRuntime.md new file mode 100644 index 0000000000000..8cf0055e5817b --- /dev/null +++ b/flang/docs/ParallelMultiImageFortranRuntime.md @@ -0,0 +1,18 @@ + + +# Multi-Image Parallel Fortran Runtime + + +The Parallel Runtime Interface for Fortran (PRIF) defines an +interface designed for LLVM Flang to target implementations of +Fortran's multi-image parallel features. + +The current revision of the PRIF specification is here: + + diff --git a/flang/docs/index.md b/flang/docs/index.md index 2568ad70c5d09..016577bcb1e98 100644 --- a/flang/docs/index.md +++ b/flang/docs/index.md @@ -78,6 +78,7 @@ on how to get in touch with us and to learn more about the current status. OpenMP-semantics OptionComparison Overview + ParallelMultiImageFortranRuntime ParameterizedDerivedTypes ParserCombinators Parsing From fdec9fd4f81fd336d6d5d50bbd48cd0e095f46b9 Mon Sep 17 00:00:00 2001 From: duhbbx Date: Thu, 17 Jul 2025 01:22:54 +0800 Subject: [PATCH 081/813] [Clang] Fix export declaration diagnostic message (#149059) Change the error message from "export declaration can only be used within a module purview" to "export declaration can only be used within a module interface" to be technically accurate. 
The previous message was misleading because export declarations are actually within a module purview when used in module implementation units, but they are only allowed in module interface units. This addresses the issue pointed out in GitHub issue #149008 where Bigcheese noted that the diagnostic wording was incorrect. Fixes #149008 --- clang/include/clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/test/CXX/drs/cwg8xx.cpp | 4 ++-- .../module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm | 4 ++-- clang/test/CXX/module/module.interface/p1.cpp | 6 +++--- clang/test/Modules/cxx20-10-2-ex1.cpp | 2 +- clang/test/Modules/cxx20-export-import.cpp | 2 +- clang/test/Modules/cxx20-import-diagnostics-a.cpp | 2 +- clang/test/Modules/export-in-non-modules.cpp | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index e94e91cbd56d9..b4a9527c7ba22 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12368,7 +12368,7 @@ def err_export_using_internal : Error< "using declaration referring to %1 with %select{internal|module|unknown}0 " "linkage cannot be exported">; def err_export_not_in_module_interface : Error< - "export declaration can only be used within a module purview">; + "export declaration can only be used within a module interface">; def err_export_inline_not_defined : Error< "inline function not defined%select{| before the private module fragment}0">; def err_export_partition_impl : Error< diff --git a/clang/test/CXX/drs/cwg8xx.cpp b/clang/test/CXX/drs/cwg8xx.cpp index ecb9113ccfe66..7395f04c8e399 100644 --- a/clang/test/CXX/drs/cwg8xx.cpp +++ b/clang/test/CXX/drs/cwg8xx.cpp @@ -9,10 +9,10 @@ namespace cwg820 { // cwg820: 2.7 export template struct B {}; // cxx98-17-warning@-1 {{exported templates are unsupported}} -// since-cxx20-error@-2 {{export declaration can only be used within a 
module purview}} +// since-cxx20-error@-2 {{export declaration can only be used within a module interface}} export template void f() {} // cxx98-17-warning@-1 {{exported templates are unsupported}} -// since-cxx20-error@-2 {{export declaration can only be used within a module purview}} +// since-cxx20-error@-2 {{export declaration can only be used within a module interface}} } // namespace cwg820 namespace cwg873 { // cwg873: 3.0 diff --git a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm index 2158d7fa84b86..ebc76ad16467d 100644 --- a/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm +++ b/clang/test/CXX/module/dcl.dcl/dcl.module/dcl.module.interface/p1.cppm @@ -9,7 +9,7 @@ //--- ExportDeclNotInModulePurview.cppm // expected-error@* {{missing 'export module' declaration in module interface unit}} -export int b; // expected-error {{export declaration can only be used within a module purview}} +export int b; // expected-error {{export declaration can only be used within a module interface}} //--- A.cppm // expected-no-diagnostics @@ -18,7 +18,7 @@ export int a; //--- AddExport.cppm module A; // #module-decl -export int b; // expected-error {{export declaration can only be used within a module purview}} +export int b; // expected-error {{export declaration can only be used within a module interface}} // expected-note@#module-decl {{add 'export' here}} //--- AddExport2.cppm diff --git a/clang/test/CXX/module/module.interface/p1.cpp b/clang/test/CXX/module/module.interface/p1.cpp index c3bfca930f5cc..1754d9ea14618 100644 --- a/clang/test/CXX/module/module.interface/p1.cpp +++ b/clang/test/CXX/module/module.interface/p1.cpp @@ -7,7 +7,7 @@ //--- errors.cpp module; -export int a; // expected-error {{export declaration can only be used within a module purview}} +export int a; // expected-error {{export declaration can only be used within a module interface}} 
export module M; export int b; // #1 namespace N { @@ -37,8 +37,8 @@ namespace N { //--- impl.cpp module M; // #M -export int b2; // expected-error {{export declaration can only be used within a module purview}} +export int b2; // expected-error {{export declaration can only be used within a module interface}} namespace N { - export int c2; // expected-error {{export declaration can only be used within a module purview}} + export int c2; // expected-error {{export declaration can only be used within a module interface}} } // expected-note@#M 2+{{add 'export'}} diff --git a/clang/test/Modules/cxx20-10-2-ex1.cpp b/clang/test/Modules/cxx20-10-2-ex1.cpp index 0cd6f77466f4b..749b15213098a 100644 --- a/clang/test/Modules/cxx20-10-2-ex1.cpp +++ b/clang/test/Modules/cxx20-10-2-ex1.cpp @@ -14,7 +14,7 @@ export int x; module; #include "std-10-2-ex1.h" -// expected-error@std-10-2-ex1.h:* {{export declaration can only be used within a module purview}} +// expected-error@std-10-2-ex1.h:* {{export declaration can only be used within a module interface}} export module M1; export namespace {} // expected-error {{anonymous namespaces cannot be exported}} diff --git a/clang/test/Modules/cxx20-export-import.cpp b/clang/test/Modules/cxx20-export-import.cpp index 0b505668e8589..c14883e575575 100644 --- a/clang/test/Modules/cxx20-export-import.cpp +++ b/clang/test/Modules/cxx20-export-import.cpp @@ -11,4 +11,4 @@ export module dummy; //--- test.cpp -export import dummy; // expected-error {{export declaration can only be used within a module purview}} +export import dummy; // expected-error {{export declaration can only be used within a module interface}} diff --git a/clang/test/Modules/cxx20-import-diagnostics-a.cpp b/clang/test/Modules/cxx20-import-diagnostics-a.cpp index 1b38259e0358c..72a31ea1d7d78 100644 --- a/clang/test/Modules/cxx20-import-diagnostics-a.cpp +++ b/clang/test/Modules/cxx20-import-diagnostics-a.cpp @@ -110,7 +110,7 @@ module; module AOK1; -export import C; // 
expected-error {{export declaration can only be used within a module purview}} +export import C; // expected-error {{export declaration can only be used within a module interface}} int theAnswer () { return 42; } diff --git a/clang/test/Modules/export-in-non-modules.cpp b/clang/test/Modules/export-in-non-modules.cpp index 69360eb46d774..7b2575c60f1fd 100644 --- a/clang/test/Modules/export-in-non-modules.cpp +++ b/clang/test/Modules/export-in-non-modules.cpp @@ -1,4 +1,4 @@ // RUN: %clang_cc1 -std=c++20 %s -fsyntax-only -verify -export struct Unit { // expected-error {{export declaration can only be used within a module purview}} +export struct Unit { // expected-error {{export declaration can only be used within a module interface}} bool operator<(const Unit &); }; From bd0f9dd86b16660debca39ce76abdd9da1c157a3 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Wed, 16 Jul 2025 19:25:48 +0200 Subject: [PATCH 082/813] [CIR] Upstream unary not for ComplexType (#148857) Upstream unary not for ComplexType https://github.com/llvm/llvm-project/issues/141365 --- .../CIR/Dialect/Builder/CIRBaseBuilder.h | 21 +++++ clang/lib/CIR/CodeGen/CIRGenBuilder.h | 16 ---- clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 6 ++ .../Dialect/Transforms/LoweringPrepare.cpp | 62 ++++++++++++- clang/test/CIR/CodeGen/complex-unary.cpp | 90 +++++++++++++++++++ 5 files changed, 175 insertions(+), 20 deletions(-) create mode 100644 clang/test/CIR/CodeGen/complex-unary.cpp diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index 277c278fd38b7..25baf278bba38 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -129,6 +129,22 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { cir::BoolAttr getTrueAttr() { return getCIRBoolAttr(true); } cir::BoolAttr getFalseAttr() { return getCIRBoolAttr(false); } + mlir::Value createComplexCreate(mlir::Location 
loc, mlir::Value real, + mlir::Value imag) { + auto resultComplexTy = cir::ComplexType::get(real.getType()); + return create(loc, resultComplexTy, real, imag); + } + + mlir::Value createComplexReal(mlir::Location loc, mlir::Value operand) { + auto operandTy = mlir::cast(operand.getType()); + return create(loc, operandTy.getElementType(), operand); + } + + mlir::Value createComplexImag(mlir::Location loc, mlir::Value operand) { + auto operandTy = mlir::cast(operand.getType()); + return create(loc, operandTy.getElementType(), operand); + } + mlir::Value createNot(mlir::Value value) { return create(value.getLoc(), value.getType(), cir::UnaryOpKind::Not, value); @@ -169,6 +185,11 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { return create(loc); } + mlir::Value createUnaryOp(mlir::Location loc, cir::UnaryOpKind kind, + mlir::Value operand) { + return create(loc, kind, operand); + } + mlir::TypedAttr getConstPtrAttr(mlir::Type type, int64_t value) { return cir::ConstPtrAttr::get(type, getI64IntegerAttr(value)); } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 5bd53ebc52ab5..f855bdad2d7c3 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -348,22 +348,6 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { return CIRBaseBuilderTy::createStore(loc, val, dst.getPointer(), align); } - mlir::Value createComplexCreate(mlir::Location loc, mlir::Value real, - mlir::Value imag) { - auto resultComplexTy = cir::ComplexType::get(real.getType()); - return create(loc, resultComplexTy, real, imag); - } - - mlir::Value createComplexReal(mlir::Location loc, mlir::Value operand) { - auto operandTy = mlir::cast(operand.getType()); - return create(loc, operandTy.getElementType(), operand); - } - - mlir::Value createComplexImag(mlir::Location loc, mlir::Value operand) { - auto operandTy = mlir::cast(operand.getType()); - return create(loc, operandTy.getElementType(), operand); - } - 
/// Create a cir.complex.real_ptr operation that derives a pointer to the real /// part of the complex value pointed to by the specified pointer value. mlir::Value createComplexRealPtr(mlir::Location loc, mlir::Value value) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 3273d9000771a..6663f5ea1e758 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -57,6 +57,7 @@ class ComplexExprEmitter : public StmtVisitor { mlir::Value VisitSubstNonTypeTemplateParmExpr(SubstNonTypeTemplateParmExpr *e); mlir::Value VisitUnaryDeref(const Expr *e); + mlir::Value VisitUnaryNot(const UnaryOperator *e); struct BinOpInfo { mlir::Location loc; @@ -338,6 +339,11 @@ mlir::Value ComplexExprEmitter::VisitUnaryDeref(const Expr *e) { return emitLoadOfLValue(e); } +mlir::Value ComplexExprEmitter::VisitUnaryNot(const UnaryOperator *e) { + mlir::Value op = Visit(e->getSubExpr()); + return builder.createNot(op); +} + mlir::Value ComplexExprEmitter::emitPromoted(const Expr *e, QualType promotionTy) { e = e->IgnoreParens(); diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index 5493b86a0a321..c708cf9d9fa61 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -8,7 +8,9 @@ #include "PassDetail.h" #include "clang/AST/ASTContext.h" +#include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h" #include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/Dialect/IR/CIROpsEnums.h" #include "clang/CIR/Dialect/Passes.h" #include @@ -21,17 +23,69 @@ struct LoweringPreparePass : public LoweringPrepareBase { LoweringPreparePass() = default; void runOnOperation() override; - void runOnOp(Operation *op); + void runOnOp(mlir::Operation *op); + void lowerUnaryOp(cir::UnaryOp op); }; } // namespace -void LoweringPreparePass::runOnOp(Operation *op) {} 
+void LoweringPreparePass::lowerUnaryOp(cir::UnaryOp op) { + mlir::Type ty = op.getType(); + if (!mlir::isa(ty)) + return; + + mlir::Location loc = op.getLoc(); + cir::UnaryOpKind opKind = op.getKind(); + + CIRBaseBuilderTy builder(getContext()); + builder.setInsertionPointAfter(op); + + mlir::Value operand = op.getInput(); + mlir::Value operandReal = builder.createComplexReal(loc, operand); + mlir::Value operandImag = builder.createComplexImag(loc, operand); + + mlir::Value resultReal; + mlir::Value resultImag; + + switch (opKind) { + case cir::UnaryOpKind::Inc: + case cir::UnaryOpKind::Dec: + llvm_unreachable("Complex unary Inc/Dec NYI"); + break; + + case cir::UnaryOpKind::Plus: + case cir::UnaryOpKind::Minus: + llvm_unreachable("Complex unary Plus/Minus NYI"); + break; + + case cir::UnaryOpKind::Not: + resultReal = operandReal; + resultImag = + builder.createUnaryOp(loc, cir::UnaryOpKind::Minus, operandImag); + break; + } + + mlir::Value result = builder.createComplexCreate(loc, resultReal, resultImag); + op.replaceAllUsesWith(result); + op.erase(); +} + +void LoweringPreparePass::runOnOp(mlir::Operation *op) { + if (auto unary = dyn_cast(op)) + lowerUnaryOp(unary); +} void LoweringPreparePass::runOnOperation() { - llvm::SmallVector opsToTransform; + mlir::Operation *op = getOperation(); + + llvm::SmallVector opsToTransform; + + op->walk([&](mlir::Operation *op) { + if (mlir::isa(op)) + opsToTransform.push_back(op); + }); - for (auto *o : opsToTransform) + for (mlir::Operation *o : opsToTransform) runOnOp(o); } diff --git a/clang/test/CIR/CodeGen/complex-unary.cpp b/clang/test/CIR/CodeGen/complex-unary.cpp new file mode 100644 index 0000000000000..33f3c2fa895d3 --- /dev/null +++ b/clang/test/CIR/CodeGen/complex-unary.cpp @@ -0,0 +1,90 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-canonicalize -o %t.cir %s 2>&1 | FileCheck --check-prefix=CIR-BEFORE %s +// RUN: %clang_cc1 -triple 
x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-after=cir-lowering-prepare -o %t.cir %s 2>&1 | FileCheck --check-prefixes=CIR-AFTER %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +void foo() { + int _Complex a; + int _Complex b = ~a; +} + +// CIR-BEFORE: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.unary(not, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[RESULT]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !s32i +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !s32i +// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !s32i, !s32i +// CIR-AFTER: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !s32i -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[RESULT]] : !cir.complex, !cir.ptr> + +// LLVM: %[[COMPLEX:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[RESULT:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { i32, i32 }, ptr %[[COMPLEX]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { i32, i32 } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { i32, i32 } %[[TMP]], 1 +// LLVM: %[[IMAG_MINUS:.*]] = sub i32 
0, %[[IMAG]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { i32, i32 } {{.*}}, i32 %[[REAL]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { i32, i32 } %[[RESULT_TMP]], i32 %[[IMAG_MINUS]], 1 +// LLVM: store { i32, i32 } %[[RESULT_VAL]], ptr %[[RESULT]], align 4 + +// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[RESULT:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load i32, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load i32, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_IMAG_MINUS:.*]] = sub i32 0, %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[RESULT]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[RESULT]], i32 0, i32 1 +// OGCG: store i32 %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store i32 %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo2() { + float _Complex a; + float _Complex b = ~a; +} + +// CIR-BEFORE: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.unary(not, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[RESULT]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : 
!cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float +// CIR-AFTER: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[RESULT]] : !cir.complex, !cir.ptr> + +// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[RESULT:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[COMPLEX]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[IMAG_MINUS:.*]] = fneg float %[[IMAG]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG_MINUS]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[RESULT]], align 4 + +// OGCG: %[[COMPLEX:.*]] = alloca { float, float }, align 4 +// OGCG: %[[RESULT:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_IMAG_MINUS:.*]] = fneg float %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 From 55b417a75fb4cbd13066510cba13d1c214095eab Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Wed, 16 Jul 2025 18:32:47 +0100 
Subject: [PATCH 083/813] [Offload] Cache symbols in program (#148209) When creating a new symbol, check that it already exists. If it does, return that pointer rather than building a new symbol structure. --- offload/liboffload/src/OffloadImpl.cpp | 57 +++++++++++-------- .../OffloadAPI/symbol/olGetSymbol.cpp | 18 ++++++ 2 files changed, 52 insertions(+), 23 deletions(-) diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index c4e7f9689a900..ffc9016bca0a3 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -84,17 +84,20 @@ struct ol_program_impl_t { DeviceImage(DeviceImage) {} plugin::DeviceImageTy *Image; std::unique_ptr ImageData; - std::vector> Symbols; + std::mutex SymbolListMutex; __tgt_device_image DeviceImage; + llvm::StringMap> KernelSymbols; + llvm::StringMap> GlobalSymbols; }; struct ol_symbol_impl_t { - ol_symbol_impl_t(GenericKernelTy *Kernel) - : PluginImpl(Kernel), Kind(OL_SYMBOL_KIND_KERNEL) {} - ol_symbol_impl_t(GlobalTy &&Global) - : PluginImpl(Global), Kind(OL_SYMBOL_KIND_GLOBAL_VARIABLE) {} + ol_symbol_impl_t(const char *Name, GenericKernelTy *Kernel) + : PluginImpl(Kernel), Kind(OL_SYMBOL_KIND_KERNEL), Name(Name) {} + ol_symbol_impl_t(const char *Name, GlobalTy &&Global) + : PluginImpl(Global), Kind(OL_SYMBOL_KIND_GLOBAL_VARIABLE), Name(Name) {} std::variant PluginImpl; ol_symbol_kind_t Kind; + llvm::StringRef Name; }; namespace llvm { @@ -714,32 +717,40 @@ Error olGetSymbol_impl(ol_program_handle_t Program, const char *Name, ol_symbol_kind_t Kind, ol_symbol_handle_t *Symbol) { auto &Device = Program->Image->getDevice(); + std::lock_guard Lock{Program->SymbolListMutex}; + switch (Kind) { case OL_SYMBOL_KIND_KERNEL: { - auto KernelImpl = Device.constructKernel(Name); - if (!KernelImpl) - return KernelImpl.takeError(); + auto &Kernel = Program->KernelSymbols[Name]; + if (!Kernel) { + auto KernelImpl = Device.constructKernel(Name); + if (!KernelImpl) + return 
KernelImpl.takeError(); - if (auto Err = KernelImpl->init(Device, *Program->Image)) - return Err; + if (auto Err = KernelImpl->init(Device, *Program->Image)) + return Err; + + Kernel = std::make_unique(KernelImpl->getName(), + &*KernelImpl); + } - *Symbol = - Program->Symbols - .emplace_back(std::make_unique(&*KernelImpl)) - .get(); + *Symbol = Kernel.get(); return Error::success(); } case OL_SYMBOL_KIND_GLOBAL_VARIABLE: { - GlobalTy GlobalObj{Name}; - if (auto Res = Device.Plugin.getGlobalHandler().getGlobalMetadataFromDevice( - Device, *Program->Image, GlobalObj)) - return Res; - - *Symbol = Program->Symbols - .emplace_back( - std::make_unique(std::move(GlobalObj))) - .get(); + auto &Global = Program->GlobalSymbols[Name]; + if (!Global) { + GlobalTy GlobalObj{Name}; + if (auto Res = + Device.Plugin.getGlobalHandler().getGlobalMetadataFromDevice( + Device, *Program->Image, GlobalObj)) + return Res; + + Global = std::make_unique(GlobalObj.getName().c_str(), + std::move(GlobalObj)); + } + *Symbol = Global.get(); return Error::success(); } default: diff --git a/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp b/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp index 5e87ab5b29621..1f496b9c6e1ae 100644 --- a/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp +++ b/offload/unittests/OffloadAPI/symbol/olGetSymbol.cpp @@ -41,6 +41,14 @@ TEST_P(olGetSymbolKernelTest, Success) { ASSERT_NE(Kernel, nullptr); } +TEST_P(olGetSymbolKernelTest, SuccessSamePtr) { + ol_symbol_handle_t KernelA = nullptr; + ol_symbol_handle_t KernelB = nullptr; + ASSERT_SUCCESS(olGetSymbol(Program, "foo", OL_SYMBOL_KIND_KERNEL, &KernelA)); + ASSERT_SUCCESS(olGetSymbol(Program, "foo", OL_SYMBOL_KIND_KERNEL, &KernelB)); + ASSERT_EQ(KernelA, KernelB); +} + TEST_P(olGetSymbolKernelTest, InvalidNullProgram) { ol_symbol_handle_t Kernel = nullptr; ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE, @@ -72,6 +80,16 @@ TEST_P(olGetSymbolGlobalTest, Success) { ASSERT_NE(Global, nullptr); }
+TEST_P(olGetSymbolGlobalTest, SuccessSamePtr) { + ol_symbol_handle_t GlobalA = nullptr; + ol_symbol_handle_t GlobalB = nullptr; + ASSERT_SUCCESS( + olGetSymbol(Program, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE, &GlobalA)); + ASSERT_SUCCESS( + olGetSymbol(Program, "global", OL_SYMBOL_KIND_GLOBAL_VARIABLE, &GlobalB)); + ASSERT_EQ(GlobalA, GlobalB); +} + TEST_P(olGetSymbolGlobalTest, InvalidNullProgram) { ol_symbol_handle_t Global = nullptr; ASSERT_ERROR( From 037d34815efc6b2c0b3f9f4d19945e49aac831d1 Mon Sep 17 00:00:00 2001 From: sribee8 Date: Wed, 16 Jul 2025 17:57:47 +0000 Subject: [PATCH 084/813] [libc] Updated fuzz tests for trig functions (#148891) Fuzz tests were set up incorrectly so updated trig functions to match the correct format. --------- Co-authored-by: Sriya Pratipati --- libc/fuzzing/math/acos_fuzz.cpp | 42 ++++++++++++++++-------- libc/fuzzing/math/asin_fuzz.cpp | 43 +++++++++++++++++-------- libc/fuzzing/math/cos_fuzz.cpp | 47 +++++++++++++++++---------- libc/fuzzing/math/log10_fuzz.cpp | 4 +-- libc/fuzzing/math/log1p_fuzz.cpp | 4 +-- libc/fuzzing/math/log2_fuzz.cpp | 4 +-- libc/fuzzing/math/log_fuzz.cpp | 4 +-- libc/fuzzing/math/sin_fuzz.cpp | 47 +++++++++++++++++---------- libc/fuzzing/math/sincos_fuzz.cpp | 53 +++++++++++++++++++++---------- libc/fuzzing/math/sqrt_fuzz.cpp | 4 +-- libc/fuzzing/math/tan_fuzz.cpp | 47 +++++++++++++++++---------- 11 files changed, 196 insertions(+), 103 deletions(-) diff --git a/libc/fuzzing/math/acos_fuzz.cpp b/libc/fuzzing/math/acos_fuzz.cpp index d2b5456026839..48fb4eacc3a79 100644 --- a/libc/fuzzing/math/acos_fuzz.cpp +++ b/libc/fuzzing/math/acos_fuzz.cpp @@ -12,26 +12,40 @@ #include "src/math/acos.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf and values outside accepted range - if (isnan(x) || isinf(x) || x > 1 || x < -1) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x 
== 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_acos(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); + // remove NaN and inf and values outside accepted range + if (isnan(x) || isinf(x) || x > 1 || x < -1) + continue; - double result = LIBC_NAMESPACE::acos(x); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_acos(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::acos(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/asin_fuzz.cpp b/libc/fuzzing/math/asin_fuzz.cpp index 94ae5c7bfdeee..e27d179606824 100644 --- a/libc/fuzzing/math/asin_fuzz.cpp +++ b/libc/fuzzing/math/asin_fuzz.cpp @@ -12,26 +12,41 @@ #include "src/math/asin.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf and values outside accepted range - if (isnan(x) || isinf(x) || x > 1 || x < -1) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, 
MPFR_RNDN); - int output = mpfr_asin(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::asin(x); + // remove NaN and inf and values outside accepted range + if (isnan(x) || isinf(x) || x > 1 || x < -1) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_asin(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::asin(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/cos_fuzz.cpp b/libc/fuzzing/math/cos_fuzz.cpp index 5b5ba0f7de717..6ed1e9ed8f309 100644 --- a/libc/fuzzing/math/cos_fuzz.cpp +++ b/libc/fuzzing/math/cos_fuzz.cpp @@ -12,28 +12,43 @@ #include "src/math/cos.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(const double x) { - // remove NaN and inf as preconditions - if (isnan(x)) - return 0; - if (isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_cos(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for 
(size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::cos(x); + // remove NaN and inf as preconditions + if (isnan(x)) + continue; + if (isinf(x)) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_cos(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::cos(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/log10_fuzz.cpp b/libc/fuzzing/math/log10_fuzz.cpp index 23134f4903a45..369408cc288b5 100644 --- a/libc/fuzzing/math/log10_fuzz.cpp +++ b/libc/fuzzing/math/log10_fuzz.cpp @@ -27,10 +27,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { // remove NaN and inf and values outside accepted range if (isnan(x) || isinf(x) || x < 0) - return 0; + continue; // signed zeros already tested in unit tests if (signbit(x) && x == 0.0) - return 0; + continue; mpfr_set_d(input, x, MPFR_RNDN); int output = mpfr_log10(input, input, MPFR_RNDN); diff --git a/libc/fuzzing/math/log1p_fuzz.cpp b/libc/fuzzing/math/log1p_fuzz.cpp index 5e138a65e3716..e02c61a352c1f 100644 --- a/libc/fuzzing/math/log1p_fuzz.cpp +++ b/libc/fuzzing/math/log1p_fuzz.cpp @@ -26,10 +26,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { data += sizeof(double); // remove NaN and inf and values outside accepted range if (isnan(x) || isinf(x) || x < -1) - return 0; + continue; // signed zeros already tested in 
unit tests if (signbit(x) && x == 0.0) - return 0; + continue; mpfr_set_d(input, x, MPFR_RNDN); int output = mpfr_log1p(input, input, MPFR_RNDN); diff --git a/libc/fuzzing/math/log2_fuzz.cpp b/libc/fuzzing/math/log2_fuzz.cpp index aa19649b95126..c3e53c639cba9 100644 --- a/libc/fuzzing/math/log2_fuzz.cpp +++ b/libc/fuzzing/math/log2_fuzz.cpp @@ -27,10 +27,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { // remove NaN and inf and values outside accepted range if (isnan(x) || isinf(x) || x < 0) - return 0; + continue; // signed zeros already tested in unit tests if (signbit(x) && x == 0.0) - return 0; + continue; mpfr_set_d(input, x, MPFR_RNDN); int output = mpfr_log2(input, input, MPFR_RNDN); diff --git a/libc/fuzzing/math/log_fuzz.cpp b/libc/fuzzing/math/log_fuzz.cpp index 03aa678d1f16c..9618accf3db26 100644 --- a/libc/fuzzing/math/log_fuzz.cpp +++ b/libc/fuzzing/math/log_fuzz.cpp @@ -27,10 +27,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { // remove NaN and inf and values outside accepted range if (isnan(x) || isinf(x) || x < 0) - return 0; + continue; // signed zeros already tested in unit tests if (signbit(x) && x == 0.0) - return 0; + continue; mpfr_set_d(input, x, MPFR_RNDN); int output = mpfr_log(input, input, MPFR_RNDN); mpfr_subnormalize(input, output, MPFR_RNDN); diff --git a/libc/fuzzing/math/sin_fuzz.cpp b/libc/fuzzing/math/sin_fuzz.cpp index a5f0fa95c1581..f6d59c7e496bc 100644 --- a/libc/fuzzing/math/sin_fuzz.cpp +++ b/libc/fuzzing/math/sin_fuzz.cpp @@ -12,28 +12,43 @@ #include "src/math/sin.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(const double x) { - // remove NaN and inf as preconditions - if (isnan(x)) - return 0; - if (isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t 
input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_sin(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::sin(x); + // remove NaN and inf as preconditions + if (isnan(x)) + continue; + if (isinf(x)) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_sin(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::sin(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; diff --git a/libc/fuzzing/math/sincos_fuzz.cpp b/libc/fuzzing/math/sincos_fuzz.cpp index fd3dfae23168c..3d3306721fc47 100644 --- a/libc/fuzzing/math/sincos_fuzz.cpp +++ b/libc/fuzzing/math/sincos_fuzz.cpp @@ -12,15 +12,12 @@ #include "src/math/sincos.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf as preconditions - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_t sin_x; mpfr_t cos_x; @@ -28,21 +25,43 @@ extern "C" int LLVMFuzzerTestOneInput(double x) { mpfr_init2(input, 53); mpfr_init2(sin_x, 53); mpfr_init2(cos_x, 53); + for (size_t i = 0; i < size 
/ sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - mpfr_set_d(input, x, MPFR_RNDN); + // remove NaN and inf as preconditions + if (isnan(x) || isinf(x)) + continue; - int output = mpfr_sin_cos(sin_x, cos_x, input, MPFR_RNDN); - mpfr_subnormalize(sin_x, output, MPFR_RNDN); - mpfr_subnormalize(cos_x, output, MPFR_RNDN); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - double to_compare_sin = mpfr_get_d(sin_x, MPFR_RNDN); - double to_compare_cos = mpfr_get_d(cos_x, MPFR_RNDN); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_sin_cos(sin_x, cos_x, input, MPFR_RNDN); + mpfr_subnormalize(sin_x, output, MPFR_RNDN); + mpfr_subnormalize(cos_x, output, MPFR_RNDN); - double sin_res, cos_res; - LIBC_NAMESPACE::sincos(x, &sin_res, &cos_res); + double to_compare_sin = mpfr_get_d(sin_x, MPFR_RNDN); + double to_compare_cos = mpfr_get_d(cos_x, MPFR_RNDN); - if (sin_res != to_compare_sin || cos_res != to_compare_cos) - __builtin_trap(); + double sin_res, cos_res; + LIBC_NAMESPACE::sincos(x, &sin_res, &cos_res); + + if (sin_res != to_compare_sin || cos_res != to_compare_cos) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing sin output: " << sin_res + << std::endl; + std::cout << std::hexfloat << "Expected sin: " << to_compare_sin + << std::endl; + std::cout << std::hexfloat << "Failing cos output: " << cos_res + << std::endl; + std::cout << std::hexfloat << "Expected cos: " << to_compare_cos + << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); mpfr_clear(sin_x); diff --git a/libc/fuzzing/math/sqrt_fuzz.cpp b/libc/fuzzing/math/sqrt_fuzz.cpp index e81cf1afd3728..969b4f58e342c 100644 --- a/libc/fuzzing/math/sqrt_fuzz.cpp +++ b/libc/fuzzing/math/sqrt_fuzz.cpp @@ -26,10 +26,10 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { data += sizeof(double); // remove NaN and inf and values outside 
accepted range if (isnan(x) || isinf(x) || x < 0) - return 0; + continue; // signed zeros already tested in unit tests if (signbit(x) && x == 0.0) - return 0; + continue; mpfr_set_d(input, x, MPFR_RNDN); int output = mpfr_sqrt(input, input, MPFR_RNDN); diff --git a/libc/fuzzing/math/tan_fuzz.cpp b/libc/fuzzing/math/tan_fuzz.cpp index 2a462fa34fce4..63d3b12866a0e 100644 --- a/libc/fuzzing/math/tan_fuzz.cpp +++ b/libc/fuzzing/math/tan_fuzz.cpp @@ -12,28 +12,43 @@ #include "src/math/tan.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(const double x) { - // remove NaN and inf as preconditions - if (isnan(x)) - return 0; - if (isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_tan(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::tan(x); + // remove NaN and inf as preconditions + if (isnan(x)) + continue; + if (isinf(x)) + continue; - if (result != to_compare) - __builtin_trap(); + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; + + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_tan(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + + double result = LIBC_NAMESPACE::tan(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; 
+ __builtin_trap(); + } + } mpfr_clear(input); return 0; From ba271cc07334c74df55741701e5b22032c0cddbb Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Wed, 16 Jul 2025 10:58:47 -0700 Subject: [PATCH 085/813] Revert "[AMDGPU][MC] Allow op_sel in v_alignbit_b32 etc in GFX9 and GFX10 (#142188) (#149138) This reverts commit ce7851f6b7d59e50f92cb4e8dbfd801576c8b641. The intrinsic llvm.amdgcn.alignbyte was not properly handled for gfx10. --- llvm/lib/Target/AMDGPU/SIInstructions.td | 47 ++----------------- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 29 +----------- .../AMDGPU/GlobalISel/inst-select-bswap.mir | 19 -------- .../AMDGPU/GlobalISel/inst-select-fshr.mir | 22 +-------- .../branch-folding-implicit-def-subreg.ll | 6 +-- llvm/test/MC/AMDGPU/gfx10_asm_vop3.s | 24 ---------- llvm/test/MC/AMDGPU/gfx7_err_pos.s | 13 ----- llvm/test/MC/AMDGPU/gfx8_err_pos.s | 10 ---- llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s | 24 ---------- .../MC/Disassembler/AMDGPU/gfx10_vop3.txt | 24 ---------- .../test/MC/Disassembler/AMDGPU/gfx9_vop3.txt | 24 ---------- 11 files changed, 12 insertions(+), 230 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index d48eb52d2faae..2a6fcadd4c49c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -2473,7 +2473,6 @@ def : AMDGPUPat < >; let True16Predicate = NotHasTrue16BitInsts in { -let SubtargetPredicate = isNotGFX9Plus in { def : ROTRPattern ; def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), @@ -2483,35 +2482,6 @@ def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), def : GCNPat<(i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1)))), (V_ALIGNBIT_B32_e64 (i32 (EXTRACT_SUBREG (i64 $src0), sub1)), (i32 (EXTRACT_SUBREG (i64 $src0), sub0)), $src1)>; -} // isNotGFX9Plus - -let SubtargetPredicate = isGFX9GFX10 in { -def : GCNPat < - (rotr i32:$src0, i32:$src1), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers 
*/ 0, $src0, - /* src1_modifiers */ 0, $src0, - /* src2_modifiers */ 0, - $src1, /* clamp */ 0, /* op_sel */ 0) ->; - -foreach pat = [(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), - (i32 (trunc (srl i64:$src0, (i32 ShiftAmt32Imm:$src1))))] in -def : GCNPat; - -def : GCNPat<(fshr i32:$src0, i32:$src1, i32:$src2), - (V_ALIGNBIT_B32_opsel_e64 /* src0_modifiers */ 0, $src0, - /* src1_modifiers */ 0, $src1, - /* src2_modifiers */ 0, - $src2, /* clamp */ 0, /* op_sel */ 0) ->; -} // isGFX9GFX10 } // end True16Predicate = NotHasTrue16BitInsts let True16Predicate = UseRealTrue16Insts in { @@ -3112,8 +3082,6 @@ def : GCNPat < (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1)) >; -// This pattern for bswap is used for pre-GFX8. For GFX8+, bswap is mapped -// to V_PERM_B32. let True16Predicate = NotHasTrue16BitInsts in def : GCNPat < (i32 (bswap i32:$a)), @@ -3589,20 +3557,15 @@ def : GCNPat < // Take the upper 16 bits from V[0] and the lower 16 bits from V[1] // Special case, can use V_ALIGNBIT (always uses encoded literal) -let True16Predicate = NotHasTrue16BitInsts in { -defvar BuildVectorToAlignBitPat = +let True16Predicate = NotHasTrue16BitInsts in +def : GCNPat < (vecTy (DivergentBinFrag (Ty !if(!eq(Ty, i16), (Ty (trunc (srl VGPR_32:$a, (i32 16)))), (Ty (bitconvert (i16 (trunc (srl VGPR_32:$a, (i32 16)))))))), - (Ty VGPR_32:$b))); - -let SubtargetPredicate = isNotGFX9Plus in -def : GCNPat; - -let SubtargetPredicate = isGFX9GFX10 in -def : GCNPat; -} //True16Predicate = NotHasTrue16BitInsts + (Ty VGPR_32:$b))), + (V_ALIGNBIT_B32_e64 VGPR_32:$b, VGPR_32:$a, (i32 16)) +>; let True16Predicate = UseFakeTrue16Insts in def : GCNPat < diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 75c531913ded1..2e7f25b67fb63 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -224,12 +224,6 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32", fshr, 
null_frag>; defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile, int_amdgcn_alignbyte>; - -// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32. -// Hardware uses opsel[1:0] to byte-select src2. Other opsel bits are ignored. -defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile>; -defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile>; - let True16Predicate = UseRealTrue16Insts in defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16>; let True16Predicate = UseFakeTrue16Insts in @@ -1960,9 +1954,6 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; -defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; - defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { @@ -2113,8 +2104,8 @@ defm V_BFI_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14a>; defm V_FMA_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x14b>; defm V_FMA_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x14c>; defm V_LERP_U8 : VOP3_Real_gfx6_gfx7_gfx10<0x14d>; -defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7<0x14e>; -defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7<0x14f>; +defm V_ALIGNBIT_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14e>; +defm V_ALIGNBYTE_B32 : VOP3_Real_gfx6_gfx7_gfx10<0x14f>; defm V_MULLIT_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x150>; defm V_MIN3_F32 : VOP3_Real_gfx6_gfx7_gfx10<0x151>; defm V_MIN3_I32 : VOP3_Real_gfx6_gfx7_gfx10<0x152>; @@ -2257,17 +2248,6 @@ multiclass VOP3_Real_BITOP3_gfx9 op, string AsmName, bit isSingle = 0> } } -// Instructions such as v_alignbyte_b32 allows op_sel in gfx9, but not in vi. -// The following is created to support that. 
-multiclass VOP3OpSel_Real_gfx9_with_name op, string opName, string AsmName> { - defvar psName = opName#"_e64"; - def _gfx9 : VOP3_Real(psName), SIEncodingFamily.VI>, // note: encoding family is VI - VOP3OpSel_gfx9 (psName).Pfl> { - VOP3_Pseudo ps = !cast(psName); - let AsmString = AsmName # ps.AsmOperands; - } -} - } // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; @@ -2287,10 +2267,8 @@ defm V_BFI_B32 : VOP3_Real_vi <0x1ca>; defm V_FMA_F32 : VOP3_Real_vi <0x1cb>; defm V_FMA_F64 : VOP3_Real_vi <0x1cc>; defm V_LERP_U8 : VOP3_Real_vi <0x1cd>; -let SubtargetPredicate = isGFX8Only in { defm V_ALIGNBIT_B32 : VOP3_Real_vi <0x1ce>; defm V_ALIGNBYTE_B32 : VOP3_Real_vi <0x1cf>; -} defm V_MIN3_F32 : VOP3_Real_vi <0x1d0>; defm V_MIN3_I32 : VOP3_Real_vi <0x1d1>; defm V_MIN3_U32 : VOP3_Real_vi <0x1d2>; @@ -2335,9 +2313,6 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16" defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">; defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">; -defm V_ALIGNBIT_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">; -defm V_ALIGNBYTE_B32_opsel : VOP3OpSel_Real_gfx9_with_name <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">; - defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">; defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">; defm V_MAD_I16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir index dde566d9643d8..5b8c2840b0156 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-bswap.mir @@ -1,8 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn 
-mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX7 %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s --- name: bswap_i32_vv @@ -21,7 +19,6 @@ body: | ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16711935 ; GFX7-NEXT: [[V_BFI_B32_e64_:%[0-9]+]]:vgpr_32 = V_BFI_B32_e64 [[S_MOV_B32_]], [[V_ALIGNBIT_B32_e64_1]], [[V_ALIGNBIT_B32_e64_]], implicit $exec ; GFX7-NEXT: S_ENDPGM 0, implicit [[V_BFI_B32_e64_]] - ; ; GFX8-LABEL: name: bswap_i32_vv ; GFX8: liveins: $vgpr0 ; GFX8-NEXT: {{ $}} @@ -29,22 +26,6 @@ body: | ; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 ; GFX8-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec ; GFX8-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] - ; - ; GFX9-LABEL: name: bswap_i32_vv - ; GFX9: liveins: $vgpr0 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 - ; GFX9-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec - ; GFX9-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] - ; - ; GFX10-LABEL: name: bswap_i32_vv - ; GFX10: liveins: $vgpr0 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 66051 - ; GFX10-NEXT: [[V_PERM_B32_e64_:%[0-9]+]]:vgpr_32 = V_PERM_B32_e64 0, [[COPY]], [[S_MOV_B32_]], implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_PERM_B32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_BSWAP %0 S_ENDPGM 0, implicit %1 diff --git 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir index fa95f33909b76..0a4cb3ccf2957 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fshr.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11 %s --- @@ -24,24 +24,6 @@ body: | ; GCN-NEXT: [[V_ALIGNBIT_B32_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_e64 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec ; GCN-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_e64_]] ; - ; GFX9-LABEL: name: fshr_s32 - ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX9-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX9-NEXT: S_ENDPGM 0, implicit 
[[V_ALIGNBIT_B32_opsel_e64_]] - ; - ; GFX10-LABEL: name: fshr_s32 - ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 - ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[V_ALIGNBIT_B32_opsel_e64_:%[0-9]+]]:vgpr_32 = V_ALIGNBIT_B32_opsel_e64 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec - ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ALIGNBIT_B32_opsel_e64_]] - ; ; GFX11-LABEL: name: fshr_s32 ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index 92c63fead15ac..ae90cfb631e8d 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -766,10 +766,10 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr22, implicit $exec ; GFX90A-NEXT: renamable $vgpr12_vgpr13 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.8, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr46, implicit $exec - ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_opsel_e64 0, killed $sgpr47, 0, killed $vgpr10, 0, 1, 0, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr17, 0, $vgpr16, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr11 = V_ALIGNBIT_B32_e64 killed $sgpr47, killed $vgpr10, 1, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = V_ALIGNBIT_B32_e64 $vgpr17, $vgpr16, 1, implicit $exec ; GFX90A-NEXT: renamable $vgpr17 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec - ; GFX90A-NEXT: renamable $vgpr15 = V_ALIGNBIT_B32_opsel_e64 0, $vgpr15, 0, $vgpr14, 0, 1, 0, 0, implicit $exec + ; GFX90A-NEXT: 
renamable $vgpr15 = V_ALIGNBIT_B32_e64 $vgpr15, $vgpr14, 1, implicit $exec ; GFX90A-NEXT: renamable $sgpr52_sgpr53 = S_XOR_B64 $exec, -1, implicit-def dead $scc ; GFX90A-NEXT: renamable $sgpr62_sgpr63 = S_OR_B64 renamable $sgpr36_sgpr37, $exec, implicit-def dead $scc ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $vgpr14, implicit $exec diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s index 3d6af6ba6dbf8..6bb0f4b1dff2d 100644 --- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s @@ -3628,18 +3628,6 @@ v_alignbit_b32 v5, v1, v2, exec_lo v_alignbit_b32 v5, v1, v2, exec_hi // GFX10: encoding: [0x05,0x00,0x4e,0xd5,0x01,0x05,0xfe,0x01] -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1] -// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1] -// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1] -// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04] - v_alignbyte_b32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04] @@ -3727,18 +3715,6 @@ v_alignbyte_b32 v5, v1, v2, exec_lo v_alignbyte_b32 v5, v1, v2, exec_hi // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0xfe,0x01] -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1] -// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1] -// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1] -// GFX10: v_alignbyte_b32 v5, v1, 
v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04] - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04] - v_mullit_f32 v5, v1, v2, v3 // GFX10: encoding: [0x05,0x00,0x50,0xd5,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/AMDGPU/gfx7_err_pos.s b/llvm/test/MC/AMDGPU/gfx7_err_pos.s index 7b6b241e04707..9dcbd4a4074af 100644 --- a/llvm/test/MC/AMDGPU/gfx7_err_pos.s +++ b/llvm/test/MC/AMDGPU/gfx7_err_pos.s @@ -44,16 +44,3 @@ s_load_dword s5, s[2:3], glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: cache policy is not supported for SMRD instructions // CHECK-NEXT:{{^}}s_load_dword s5, s[2:3], glc // CHECK-NEXT:{{^}} ^ - -//============================================================================== -// not a valid operand - -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. -// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// CHECK-NEXT:{{^}} ^ - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. -// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// CHECK-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx8_err_pos.s b/llvm/test/MC/AMDGPU/gfx8_err_pos.s index a475c739e690d..1e8457d54049a 100644 --- a/llvm/test/MC/AMDGPU/gfx8_err_pos.s +++ b/llvm/test/MC/AMDGPU/gfx8_err_pos.s @@ -49,13 +49,3 @@ v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERV // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. // CHECK-NEXT:{{^}}v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 src1_sel:WORD_0 // CHECK-NEXT:{{^}} ^ - -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
-// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// CHECK-NEXT:{{^}} ^ - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. -// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// CHECK-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s index a1cd9ce8ef18e..f3f4cae22538a 100644 --- a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s +++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s @@ -2829,18 +2829,6 @@ v_alignbit_b32 v5, v1, v2, src_execz v_alignbit_b32 v5, v1, v2, src_scc // CHECK: [0x05,0x00,0xce,0xd1,0x01,0x05,0xf6,0x03] -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] -// CHECK: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] - -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] -// CHECK: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] - -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] -// CHECK: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] - -v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] -// CHECK: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] - v_alignbyte_b32 v5, v1, v2, v3 // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04] @@ -3012,18 +3000,6 @@ v_alignbyte_b32 v5, v1, v2, src_execz v_alignbyte_b32 v5, v1, v2, src_scc // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xf6,0x03] -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1] -// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04] - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1] -// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04] - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1] -// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: 
[0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04] - -v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] -// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04] - v_min3_f32 v5, v1, v2, v3 // CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt index 08ed50d92ba83..721babdd64245 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop3.txt @@ -1146,18 +1146,6 @@ # GFX10: v_alignbit_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04] 0x05,0x00,0x4e,0xd5,0x6a,0x04,0x0e,0x04 -# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04] -0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04 - -# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04] -0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04 - -# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04] -0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04 - -# GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04] -0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04 - # GFX10: v_alignbyte_b32 v255, v1, v2, v3 ; encoding: [0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04] 0xff,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04 @@ -1245,18 +1233,6 @@ # GFX10: v_alignbyte_b32 v5, vcc_lo, v2, v3 ; encoding: [0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04] 0x05,0x00,0x4f,0xd5,0x6a,0x04,0x0e,0x04 -# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04] -0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04 - -# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04] -0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04 - -# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: 
[0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04] -0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04 - -# GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04] -0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04 - # GFX10: v_and_b32_e64 v255, v1, v2 ; encoding: [0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00] 0xff,0x00,0x1b,0xd5,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt index 802d6368507e2..618e081525414 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop3.txt @@ -11310,18 +11310,6 @@ # CHECK: v_alignbit_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01] 0x05,0x00,0xce,0xd1,0x01,0x05,0xfe,0x01 -# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04 - # CHECK: v_alignbyte_b32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04] 0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04 @@ -11418,18 +11406,6 @@ # CHECK: v_alignbyte_b32 v5, v1, v2, exec_hi ; encoding: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01] 0x05,0x00,0xcf,0xd1,0x01,0x05,0xfe,0x01 -# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04] 
-0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04 - -# CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04] -0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04 - # CHECK: v_min3_f32 v5, v1, v2, v3 ; encoding: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04] 0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04 From 560e7df6893495fabe91bc921f9cc0e28a25eb73 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Wed, 16 Jul 2025 10:58:54 -0700 Subject: [PATCH 086/813] AMDGPU: Handle the co-execution hazards for TRANS for gfx1250 (#149024) For the co-execution of the TRANS ops, the requirement is: 1 independent op or V_NOP (since TRANS takes 2 cycles) after the trans op before its sources can be overwritten or the output of the trans op can be used. --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 46 ++++++ llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h | 1 + llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll | 2 + .../AMDGPU/trans-coexecution-hazard.mir | 132 ++++++++++++++++++ 4 files changed, 181 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 0976fccf78d86..bbed828b4fed3 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1189,6 +1189,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { } fixVALUPartialForwardingHazard(MI); fixVALUTransUseHazard(MI); + fixVALUTransCoexecutionHazards(MI); fixWMMAHazards(MI); fixShift64HighRegBug(MI); fixVALUMaskWriteHazard(MI); @@ -1809,6 +1810,51 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { return true; } +bool GCNHazardRecognizer::fixVALUTransCoexecutionHazards(MachineInstr *MI) { + if (!AMDGPU::isGFX1250(ST) || // 
Coexecution disabled. + !SIInstrInfo::isVALU(*MI) || SIInstrInfo::isTRANS(*MI)) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + auto IsTransHazardFn = [MI, TII, TRI](const MachineInstr &I) { + if (!SIInstrInfo::isTRANS(I)) + return false; + + // RAW: Trans(I) writes, VALU(MI) reads. + Register TransDef = TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg(); + for (const MachineOperand &ValuUse : MI->explicit_uses()) { + if (ValuUse.isReg() && TRI->regsOverlap(TransDef, ValuUse.getReg())) + return true; + } + + auto *ValuDst = TII->getNamedOperand(*MI, AMDGPU::OpName::vdst); + if (!ValuDst || !ValuDst->isReg()) + return false; + + // WAR: Trans(I) reads, VALU(MI) writes. + Register ValuDef = ValuDst->getReg(); + for (const MachineOperand &TransUse : I.explicit_uses()) { + if (TransUse.isReg() && TRI->regsOverlap(ValuDef, TransUse.getReg())) + return true; + } + + return false; + }; + + auto IsExpiredFn = [](const MachineInstr &I, int) { + return SIInstrInfo::isVALU(I); + }; + + const int HasVALU = std::numeric_limits::max(); + if (::getWaitStatesSince(IsTransHazardFn, MI, IsExpiredFn) == HasVALU) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); + return true; +} + bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) return false; diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index bbc55851bf967..ef6ddd874f58a 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -104,6 +104,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer { bool fixLdsDirectVMEMHazard(MachineInstr *MI); bool fixVALUPartialForwardingHazard(MachineInstr *MI); bool fixVALUTransUseHazard(MachineInstr *MI); + bool fixVALUTransCoexecutionHazards(MachineInstr *MI); bool 
fixWMMAHazards(MachineInstr *MI); bool fixShift64HighRegBug(MachineInstr *MI); bool fixVALUMaskWriteHazard(MachineInstr *MI); diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll index 5936d6aa86b82..47b2b68f05abc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll @@ -66,6 +66,7 @@ define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) ; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v0.l +; GFX12-TRUE16-NEXT: v_nop ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) ; GFX12-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l @@ -90,6 +91,7 @@ define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) ; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1 ; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0 +; GFX12-FAKE16-NEXT: v_nop ; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2) ; GFX12-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir b/llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir new file mode 100644 index 0000000000000..fa27d689dd8dd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/trans-coexecution-hazard.mir @@ -0,0 +1,132 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX1250 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass post-RA-hazard-rec -o - %s | FileCheck -check-prefixes=GCN,GFX1200 %s + +--- +name: trans_writes_valu_reads_hazard +body: | + bb.0: + ; GFX1250-LABEL: name: trans_writes_valu_reads_hazard + ; GFX1250: $vgpr1 = V_SQRT_F32_e32 
$vgpr0, implicit $mode, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; + ; GFX1200-LABEL: name: trans_writes_valu_reads_hazard + ; GFX1200: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1200-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec +... + +--- +name: trans_writes_valu_valu_reads_hazard_covered +body: | + bb.0: + ; GCN-LABEL: name: trans_writes_valu_valu_reads_hazard_covered + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec +... 
+ +--- +name: trans_writes_salu_valu_reads_hazard +body: | + bb.0: + ; GFX1250-LABEL: name: trans_writes_salu_valu_reads_hazard + ; GFX1250: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1250-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + ; + ; GFX1200-LABEL: name: trans_writes_salu_valu_reads_hazard + ; GFX1200: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1200-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc + ; GFX1200-NEXT: $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc + $vgpr4 = V_ADD_F32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec +... + +--- +name: trans_no_hazard +body: | + bb.0: + ; GCN-LABEL: name: trans_no_hazard + ; GCN: $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = V_SQRT_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr2, implicit $mode, implicit $exec +... + +--- +name: trans_reads_valu_writes_hazard +body: | + bb.0: + ; GFX1250-LABEL: name: trans_reads_valu_writes_hazard + ; GFX1250: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + ; + ; GFX1200-LABEL: name: trans_reads_valu_writes_hazard + ; GFX1200: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1200-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec +... 
+ +--- +name: trans_reads_valu_valu_writes_hazard_covered +body: | + bb.0: + ; GCN-LABEL: name: trans_reads_valu_valu_writes_hazard_covered + ; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr2, $vgpr3, implicit $mode, implicit $exec + $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec +... + +--- +name: trans_reads__salu_valu_writes_hazard +body: | + bb.0: + ; GFX1250-LABEL: name: trans_reads__salu_valu_writes_hazard + ; GFX1250: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1250-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc + ; GFX1250-NEXT: V_NOP_e32 implicit $exec + ; GFX1250-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec + ; + ; GFX1200-LABEL: name: trans_reads__salu_valu_writes_hazard + ; GFX1200: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GFX1200-NEXT: $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc + ; GFX1200-NEXT: $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec + $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + $sgpr2 = S_ADD_U32 $sgpr0, $sgpr1, implicit-def $scc + $vgpr0 = V_ADD_F32_e32 $vgpr4, $vgpr2, implicit $mode, implicit $exec +... + +--- +name: trans_writes_trans_reads_no_hazard +body: | + bb.0: + ; GCN-LABEL: name: trans_writes_trans_reads_no_hazard + ; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec + $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr2 = V_SQRT_F32_e32 $vgpr1, implicit $mode, implicit $exec +... 
+ +--- +name: trans_reads_trans_writes_no_hazard +body: | + bb.0: + ; GCN-LABEL: name: trans_reads_trans_writes_no_hazard + ; GCN: $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr0 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec + $vgpr1 = V_COS_F32_e32 $vgpr0, implicit $mode, implicit $exec + $vgpr0 = V_SQRT_F32_e32 $vgpr2, implicit $mode, implicit $exec +... From 22994edb5fd71198c48670255c979fcc962930a1 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 16 Jul 2025 11:05:41 -0700 Subject: [PATCH 087/813] [OpenACC][Sema] Implement warning for non-effective 'private' (#149004) A 'private' variable reference needs to have a default constructor and a destructor, else we cannot properly emit them in codegen. This patch adds a warning-as-default-error to diagnose this. We'll have to do something similar for firstprivate/reduction, however it isn't clear whether we could skip the check for default-constructor for those two (they still need a destructor!). Depending on how we intend to create them (and we probably have to figure this out?), we could either require JUST a copy-constructor (then make the init section just the alloca, and the copy-ctor be the 'copy' section), OR they require a default-constructor + copy-assignment. 
--- .../clang/Basic/DiagnosticSemaKinds.td | 6 + clang/lib/Sema/SemaOpenACC.cpp | 70 +++++++++++- ...te_firstprivate_reduction_required_ops.cpp | 103 ++++++++++++++++++ 3 files changed, 174 insertions(+), 5 deletions(-) create mode 100644 clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index b4a9527c7ba22..b2ea65ae111be 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -13489,6 +13489,12 @@ def err_acc_invalid_default_type def err_acc_device_type_multiple_archs : Error<"OpenACC 'device_type' clause on a 'set' construct only permits " "one architecture">; +def warn_acc_var_referenced_lacks_op + : Warning<"variable of type %0 referenced in OpenACC '%1' clause does not " + "have a %enum_select{%DefCtor{default " + "constructor}|%Dtor{destructor}}2; reference has no effect">, + InGroup>, + DefaultError; // AMDGCN builtins diagnostics def err_amdgcn_load_lds_size_invalid_value : Error<"invalid size value">; diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 46aa7dd0dcc21..128a5db57bf73 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -624,6 +624,66 @@ void SemaOpenACC::CheckDeclReference(SourceLocation Loc, Expr *E, Decl *D) { // loop (or we aren't in a loop!) so skip the diagnostic. } +namespace { +// Check whether the type of the thing we are referencing is OK for things like +// private, firstprivate, and reduction, which require certain operators to be +// available. +ExprResult CheckVarType(SemaOpenACC &S, OpenACCClauseKind CK, Expr *VarExpr, + Expr *InnerExpr) { + // There is nothing to do here, only these three have these sorts of + // restrictions. 
+  if (CK != OpenACCClauseKind::Private &&
+      CK != OpenACCClauseKind::FirstPrivate &&
+      CK != OpenACCClauseKind::Reduction)
+    return VarExpr;
+
+  // We can't test this if it isn't here, or if the type isn't clear yet.
+  if (!InnerExpr || InnerExpr->isTypeDependent())
+    return VarExpr;
+
+  const auto *RD = InnerExpr->getType()->getAsCXXRecordDecl();
+
+  // If this isn't a C++ record decl, we can create/copy/destroy this thing at
+  // will without problem, so this is a success.
+  if (!RD)
+    return VarExpr;
+
+  // TODO: OpenACC:
+  // Private must have default ctor + dtor in InnerExpr
+  // FirstPrivate must have copyctor + dtor in InnerExpr
+  // Reduction must have copyctor + dtor + operation in InnerExpr
+
+  // TODO OpenACC: It isn't clear what the requirements for default
+  // constructor/copy constructor are for firstprivate and reduction, but
+  // private requires a default constructor.
+  if (CK == OpenACCClauseKind::Private) {
+    bool HasNonDeletedDefaultCtor =
+        llvm::find_if(RD->ctors(), [](const CXXConstructorDecl *CD) {
+          return CD->isDefaultConstructor() && !CD->isDeleted();
+        }) != RD->ctors().end();
+    if (!HasNonDeletedDefaultCtor && !RD->needsImplicitDefaultConstructor()) {
+      S.Diag(InnerExpr->getBeginLoc(),
+             clang::diag::warn_acc_var_referenced_lacks_op)
+          << InnerExpr->getType() << CK
+          << clang::diag::AccVarReferencedReason::DefCtor;
+      return ExprError();
+    }
+  }
+
+  // All 3 things need to make sure they have a dtor. 
+ bool DestructorDeleted = + RD->getDestructor() && RD->getDestructor()->isDeleted(); + if (DestructorDeleted && !RD->needsImplicitDestructor()) { + S.Diag(InnerExpr->getBeginLoc(), + clang::diag::warn_acc_var_referenced_lacks_op) + << InnerExpr->getType() << CK + << clang::diag::AccVarReferencedReason::Dtor; + return ExprError(); + } + return VarExpr; +} +} // namespace + ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, Expr *VarExpr) { // This has unique enough restrictions that we should split it to a separate @@ -660,7 +720,7 @@ ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, if (const auto *DRE = dyn_cast(CurVarExpr)) { if (isa( DRE->getFoundDecl()->getCanonicalDecl())) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); } // If CK is a Reduction, this special cases for OpenACC3.3 2.5.15: "A var in a @@ -679,9 +739,9 @@ ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, // declare, reduction, and use_device. const auto *This = dyn_cast(ME->getBase()); if (This && This->isImplicit()) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); } else { - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); } } } @@ -690,14 +750,14 @@ ExprResult SemaOpenACC::ActOnVar(OpenACCDirectiveKind DK, OpenACCClauseKind CK, // doesn't fall into 'variable or array name' if (CK != OpenACCClauseKind::UseDevice && DK != OpenACCDirectiveKind::Declare && isa(CurVarExpr)) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); // Nothing really we can do here, as these are dependent. So just return they // are valid. if (isa(CurVarExpr) || (CK != OpenACCClauseKind::Reduction && isa(CurVarExpr))) - return VarExpr; + return CheckVarType(*this, CK, VarExpr, CurVarExpr); // There isn't really anything we can do in the case of a recovery expr, so // skip the diagnostic rather than produce a confusing diagnostic. 
diff --git a/clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp b/clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp new file mode 100644 index 0000000000000..e0aee123fe754 --- /dev/null +++ b/clang/test/SemaOpenACC/private_firstprivate_reduction_required_ops.cpp @@ -0,0 +1,103 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +struct ImplicitCtorDtor{}; + +struct ImplDeletedCtor{ + ImplDeletedCtor(int i); +}; + +struct DefaultedCtor { + DefaultedCtor() = default; +}; + +struct ImpledCtor { + ImpledCtor() = default; +}; + + +struct DeletedCtor { + DeletedCtor() = delete; +}; + +struct ImpledDtor { + ~ImpledDtor(); +}; + +struct DefaultedDtor { + ~DefaultedDtor() = default; +}; + +struct DeletedDtor { + ~DeletedDtor() = delete; +}; + +struct ImplicitDelDtor { + DeletedDtor d; +}; + +void private_uses(ImplicitCtorDtor &CDT, ImplDeletedCtor &IDC, + DefaultedCtor &DefC, ImpledCtor &IC, DeletedCtor &DelC, + ImpledDtor &ID, DefaultedDtor &DefD, DeletedDtor &DelD, + ImplicitDelDtor &IDD) { + +#pragma acc parallel private(CDT) + ; + + // expected-error@+1{{variable of type 'ImplDeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}} +#pragma acc parallel private(IDC) + ; + +#pragma acc parallel private(DefC) + ; + +#pragma acc parallel private(IC) + ; + + // expected-error@+1{{variable of type 'DeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}} +#pragma acc parallel private(DelC) + ; + +#pragma acc parallel private(ID) + ; + +#pragma acc parallel private(DefD) + ; + + // expected-error@+1{{variable of type 'DeletedDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}} +#pragma acc parallel private(DelD) + ; + + // expected-error@+1{{variable of type 'ImplicitDelDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}} +#pragma acc 
parallel private(IDD) + ; + +} + +template +void private_templ(T& t) { +#pragma acc parallel private(t) // #PRIV + ; +} + +void inst(ImplicitCtorDtor &CDT, ImplDeletedCtor &IDC, + DefaultedCtor &DefC, ImpledCtor &IC, DeletedCtor &DelC, + ImpledDtor &ID, DefaultedDtor &DefD, DeletedDtor &DelD, + ImplicitDelDtor &IDD) { + private_templ(CDT); + // expected-error@#PRIV{{variable of type 'ImplDeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}} + // expected-note@+1{{in instantiation}} + private_templ(IDC); + private_templ(DefC); + private_templ(IC); + // expected-error@#PRIV{{variable of type 'DeletedCtor' referenced in OpenACC 'private' clause does not have a default constructor; reference has no effect}} + // expected-note@+1{{in instantiation}} + private_templ(DelC); + private_templ(ID); + private_templ(DefD); + // expected-error@#PRIV{{variable of type 'DeletedDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}} + // expected-note@+1{{in instantiation}} + private_templ(DelD); + // expected-error@#PRIV{{variable of type 'ImplicitDelDtor' referenced in OpenACC 'private' clause does not have a destructor; reference has no effect}} + // expected-note@+1{{in instantiation}} + private_templ(IDD); +} From 056f0a10b320fc2fd75f46aa67d68708303d89ad Mon Sep 17 00:00:00 2001 From: raoanag <127366241+raoanag@users.noreply.github.com> Date: Wed, 16 Jul 2025 11:28:55 -0700 Subject: [PATCH 088/813] [HLSL][DXIL] Implement `refract` intrinsic (#147342) - [x] Implement refract using HLSL source in hlsl_intrinsics.h - [x] Implement the refract SPIR-V target built-in in clang/include/clang/Basic/BuiltinsSPIRV.td - [x] Add sema checks for refract to CheckSPIRVBuiltinFunctionCall in clang/lib/Sema/SemaSPIRV.cpp - [x] Add codegen for spv refract to EmitSPIRVBuiltinExpr in CGBuiltin.cpp - [x] Add codegen tests to clang/test/CodeGenHLSL/builtins/refract.hlsl - [x] Add spv codegen test 
to clang/test/CodeGenSPIRV/Builtins/refract.c - [x] Add sema tests to clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl - [x] Add spv sema tests to clang/test/SemaSPIRV/BuiltIns/refract-errors.c - [x] Create the int_spv_refract intrinsic in IntrinsicsSPIRV.td - [x] In SPIRVInstructionSelector.cpp create the refract lowering and map it to int_spv_refract in SPIRVInstructionSelector::selectIntrinsic. - [x] Create SPIR-V backend test case in llvm/test/CodeGen/SPIRV/hlsl-intrinsics/refract.ll - [x] Check for what OpenCL support is needed. Resolves https://github.com/llvm/llvm-project/issues/99153 --- clang/include/clang/Basic/BuiltinsSPIRVVK.td | 1 + clang/lib/CodeGen/TargetBuiltins/SPIR.cpp | 12 + .../lib/Headers/hlsl/hlsl_intrinsic_helpers.h | 10 + clang/lib/Headers/hlsl/hlsl_intrinsics.h | 59 +++++ clang/lib/Sema/SemaSPIRV.cpp | 80 ++++++ clang/test/CodeGenHLSL/builtins/refract.hlsl | 244 ++++++++++++++++++ clang/test/CodeGenSPIRV/Builtins/refract.c | 74 ++++++ .../SemaHLSL/BuiltIns/refract-errors.hlsl | 66 +++++ .../test/SemaSPIRV/BuiltIns/refract-errors.c | 41 +++ llvm/include/llvm/IR/IntrinsicsSPIRV.td | 7 +- .../Target/SPIRV/SPIRVInstructionSelector.cpp | 2 + .../CodeGen/SPIRV/hlsl-intrinsics/refract.ll | 36 +++ .../CodeGen/SPIRV/opencl/refract-error.ll | 12 + 13 files changed, 643 insertions(+), 1 deletion(-) create mode 100644 clang/test/CodeGenHLSL/builtins/refract.hlsl create mode 100644 clang/test/CodeGenSPIRV/Builtins/refract.c create mode 100644 clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl create mode 100644 clang/test/SemaSPIRV/BuiltIns/refract-errors.c create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/refract.ll create mode 100644 llvm/test/CodeGen/SPIRV/opencl/refract-error.ll diff --git a/clang/include/clang/Basic/BuiltinsSPIRVVK.td b/clang/include/clang/Basic/BuiltinsSPIRVVK.td index 61cc0343c415e..5dc3c7588cd2a 100644 --- a/clang/include/clang/Basic/BuiltinsSPIRVVK.td +++ b/clang/include/clang/Basic/BuiltinsSPIRVVK.td @@ -11,3 +11,4 @@ 
include "clang/Basic/BuiltinsSPIRVBase.td" def reflect : SPIRVBuiltin<"void(...)", [NoThrow, Const]>; def faceforward : SPIRVBuiltin<"void(...)", [NoThrow, Const, CustomTypeChecking]>; +def refract : SPIRVBuiltin<"void(...)", [NoThrow, Const, CustomTypeChecking]>; diff --git a/clang/lib/CodeGen/TargetBuiltins/SPIR.cpp b/clang/lib/CodeGen/TargetBuiltins/SPIR.cpp index 16243951c7bec..243aad8bf7083 100644 --- a/clang/lib/CodeGen/TargetBuiltins/SPIR.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/SPIR.cpp @@ -58,6 +58,18 @@ Value *CodeGenFunction::EmitSPIRVBuiltinExpr(unsigned BuiltinID, /*ReturnType=*/I->getType(), Intrinsic::spv_reflect, ArrayRef{I, N}, nullptr, "spv.reflect"); } + case SPIRV::BI__builtin_spirv_refract: { + Value *I = EmitScalarExpr(E->getArg(0)); + Value *N = EmitScalarExpr(E->getArg(1)); + Value *eta = EmitScalarExpr(E->getArg(2)); + assert(E->getArg(0)->getType()->hasFloatingRepresentation() && + E->getArg(1)->getType()->hasFloatingRepresentation() && + E->getArg(2)->getType()->isFloatingType() && + "refract operands must have a float representation"); + return Builder.CreateIntrinsic( + /*ReturnType=*/I->getType(), Intrinsic::spv_refract, + ArrayRef{I, N, eta}, nullptr, "spv.refract"); + } case SPIRV::BI__builtin_spirv_smoothstep: { Value *Min = EmitScalarExpr(E->getArg(0)); Value *Max = EmitScalarExpr(E->getArg(1)); diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h index 4eb7b8f45c85a..e8ccccb489815 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsic_helpers.h @@ -71,6 +71,16 @@ constexpr vector reflect_vec_impl(vector I, vector N) { #endif } +template constexpr T refract_impl(T I, T N, U Eta) { +#if (__has_builtin(__builtin_spirv_refract)) + return __builtin_spirv_refract(I, N, Eta); +#endif + T Mul = dot(N, I); + T K = 1 - Eta * Eta * (1 - Mul * Mul); + T Result = (Eta * I - (Eta * Mul + sqrt(K)) * N); + return select(K < 0, 
static_cast(0), Result); +} + template constexpr T fmod_impl(T X, T Y) { #if !defined(__DIRECTX__) return __builtin_elementwise_fmod(X, Y); diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index ea880105fac3b..499a05328ee4f 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -475,6 +475,65 @@ reflect(__detail::HLSL_FIXED_VECTOR I, return __detail::reflect_vec_impl(I, N); } +//===----------------------------------------------------------------------===// +// refract builtin +//===----------------------------------------------------------------------===// + +/// \fn T refract(T I, T N, T eta) +/// \brief Returns a refraction using an entering ray, \a I, a surface +/// normal, \a N and refraction index \a eta +/// \param I The entering ray. +/// \param N The surface normal. +/// \param eta The refraction index. +/// +/// The return value is a floating-point vector that represents the refraction +/// using the refraction index, \a eta, for the direction of the entering ray, +/// \a I, off a surface with the normal \a N. +/// +/// This function calculates the refraction vector using the following formulas: +/// k = 1.0 - eta * eta * (1.0 - dot(N, I) * dot(N, I)) +/// if k < 0.0 the result is 0.0 +/// otherwise, the result is eta * I - (eta * dot(N, I) + sqrt(k)) * N +/// +/// I and N must already be normalized in order to achieve the desired result. +/// +/// I and N must be a scalar or vector whose component type is +/// floating-point. +/// +/// eta must be a 16-bit or 32-bit floating-point scalar. +/// +/// Result type, the type of I, and the type of N must all be the same type. 
+ +template +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline __detail::enable_if_t<__detail::is_arithmetic::Value && + __detail::is_same::value, + T> refract(T I, T N, T eta) { + return __detail::refract_impl(I, N, eta); +} + +template +const inline __detail::enable_if_t< + __detail::is_arithmetic::Value && __detail::is_same::value, T> +refract(T I, T N, T eta) { + return __detail::refract_impl(I, N, eta); +} + +template +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.2) +const inline __detail::HLSL_FIXED_VECTOR refract( + __detail::HLSL_FIXED_VECTOR I, + __detail::HLSL_FIXED_VECTOR N, half eta) { + return __detail::refract_impl(I, N, eta); +} + +template +const inline __detail::HLSL_FIXED_VECTOR +refract(__detail::HLSL_FIXED_VECTOR I, + __detail::HLSL_FIXED_VECTOR N, float eta) { + return __detail::refract_impl(I, N, eta); +} + //===----------------------------------------------------------------------===// // smoothstep builtin //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaSPIRV.cpp b/clang/lib/Sema/SemaSPIRV.cpp index 76d3cff908b37..c8ea0d09c4081 100644 --- a/clang/lib/Sema/SemaSPIRV.cpp +++ b/clang/lib/Sema/SemaSPIRV.cpp @@ -46,6 +46,49 @@ static bool CheckAllArgsHaveSameType(Sema *S, CallExpr *TheCall) { return false; } +static bool CheckAllArgTypesAreCorrect( + Sema *S, CallExpr *TheCall, + llvm::ArrayRef< + llvm::function_ref> + Checks) { + unsigned NumArgs = TheCall->getNumArgs(); + assert(Checks.size() == NumArgs && + "Wrong number of checks for Number of args."); + // Apply each check to the corresponding argument + for (unsigned I = 0; I < NumArgs; ++I) { + Expr *Arg = TheCall->getArg(I); + if (Checks[I](S, Arg->getBeginLoc(), I + 1, Arg->getType())) + return true; + } + return false; +} + +static bool CheckFloatOrHalfRepresentation(Sema *S, SourceLocation Loc, + int ArgOrdinal, + clang::QualType PassedType) { + clang::QualType BaseType = + PassedType->isVectorType() + ? 
PassedType->castAs()->getElementType() + : PassedType; + if (!BaseType->isHalfType() && !BaseType->isFloat16Type() && + !BaseType->isFloat32Type()) + return S->Diag(Loc, diag::err_builtin_invalid_arg_type) + << ArgOrdinal << /* scalar or vector of */ 5 << /* no int */ 0 + << /* half or float */ 2 << PassedType; + return false; +} + +static bool CheckFloatOrHalfScalarRepresentation(Sema *S, SourceLocation Loc, + int ArgOrdinal, + clang::QualType PassedType) { + if (!PassedType->isHalfType() && !PassedType->isFloat16Type() && + !PassedType->isFloat32Type()) + return S->Diag(Loc, diag::err_builtin_invalid_arg_type) + << ArgOrdinal << /* scalar */ 1 << /* no int */ 0 + << /* half or float */ 2 << PassedType; + return false; +} + static std::optional processConstant32BitIntArgument(Sema &SemaRef, CallExpr *Call, int Argument) { ExprResult Arg = @@ -235,6 +278,43 @@ bool SemaSPIRV::CheckSPIRVBuiltinFunctionCall(const TargetInfo &TI, TheCall->setType(RetTy); break; } + case SPIRV::BI__builtin_spirv_refract: { + if (SemaRef.checkArgCount(TheCall, 3)) + return true; + + llvm::function_ref + ChecksArr[] = {CheckFloatOrHalfRepresentation, + CheckFloatOrHalfRepresentation, + CheckFloatOrHalfScalarRepresentation}; + if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall, + llvm::ArrayRef(ChecksArr))) + return true; + // Check that first two arguments are vectors/scalars of the same type + QualType Arg0Type = TheCall->getArg(0)->getType(); + if (!SemaRef.getASTContext().hasSameUnqualifiedType( + Arg0Type, TheCall->getArg(1)->getType())) + return SemaRef.Diag(TheCall->getBeginLoc(), + diag::err_vec_builtin_incompatible_vector) + << TheCall->getDirectCallee() << /* first two */ 0 + << SourceRange(TheCall->getArg(0)->getBeginLoc(), + TheCall->getArg(1)->getEndLoc()); + + // Check that scalar type of 3rd arg is same as base type of first two args + clang::QualType BaseType = + Arg0Type->isVectorType() + ? 
Arg0Type->castAs()->getElementType() + : Arg0Type; + if (!SemaRef.getASTContext().hasSameUnqualifiedType( + BaseType, TheCall->getArg(2)->getType())) + return SemaRef.Diag(TheCall->getBeginLoc(), + diag::err_hlsl_builtin_scalar_vector_mismatch) + << /* all */ 0 << TheCall->getDirectCallee() << Arg0Type + << TheCall->getArg(2)->getType(); + + QualType RetTy = TheCall->getArg(0)->getType(); + TheCall->setType(RetTy); + break; + } case SPIRV::BI__builtin_spirv_smoothstep: { if (SemaRef.checkArgCount(TheCall, 3)) return true; diff --git a/clang/test/CodeGenHLSL/builtins/refract.hlsl b/clang/test/CodeGenHLSL/builtins/refract.hlsl new file mode 100644 index 0000000000000..eda256451ee2b --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/refract.hlsl @@ -0,0 +1,244 @@ +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -o - | FileCheck %s --check-prefix=SPVCHECK + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) half @_Z17test_refract_halfDhDhDh( +// CHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half 0xH3C00, [[MUL2_I]] +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[MUL1_I]], [[SUB_I]] +// CHECK: [[SUB4_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half 0xH3C00, [[MUL3_I]] +// CHECK: [[MUL5_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half 
%{{.*}}, %{{.*}} +// CHECK: [[MUL6_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[TMP0:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.sqrt.f16(half %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn half [[MUL6_I]], %{{.*}} +// CHECK: [[MUL7_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half [[ADD_I]], %{{.*}} +// CHECK: [[SUB8_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn half %{{.*}}, [[MUL7_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt half %{{.*}}, 0xH0000 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CMP_I]], half 0xH0000, half %{{.*}} +// CHECK: ret half [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) half @_Z17test_refract_halfDhDhDh( +// SPVCHECK-SAME: half noundef nofpclass(nan inf) [[I:%.*]], half noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef half @llvm.spv.refract.f16.f16(half %{{.*}}, half %{{.*}}, half %{{.*}}) +// SPVCHECK: ret half [[SPV_REFRACT_I]] +// +half test_refract_half(half I, half N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x half> @_Z18test_refract_half2Dv2_DhS_Dh( +// CHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> splat 
(half 0xH3C00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> splat (half 0xH3C00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.sqrt.v2f16(<2 x half> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <2 x half> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x half> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x half> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <2 x half> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <2 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <2 x half> zeroinitializer, <2 x half> %{{.*}} +// CHECK: ret <2 x half> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x half> @_Z18test_refract_half2Dv2_DhS_Dh( +// SPVCHECK-SAME: <2 x half> noundef nofpclass(nan inf) [[I:%.*]], <2 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <2 x half> @llvm.spv.refract.v2f16.f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}, half %{{.*}}) +// SPVCHECK: ret <2 x half> [[SPV_REFRACT_I]] +// +half2 test_refract_half2(half2 I, half2 N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x half> @_Z18test_refract_half3Dv3_DhS_Dh( +// CHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef 
nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> splat (half 0xH3C00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> splat (half 0xH3C00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.sqrt.v3f16(<3 x half> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <3 x half> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x half> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x half> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <3 x half> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <3 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <3 x half> zeroinitializer, <3 x half> %{{.*}} +// CHECK: ret <3 x half> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x half> @_Z18test_refract_half3Dv3_DhS_Dh( +// SPVCHECK-SAME: <3 x half> noundef nofpclass(nan inf) [[I:%.*]], <3 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: 
[[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <3 x half> @llvm.spv.refract.v3f16.f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}, half %{{.*}}) +// SPVCHECK: ret <3 x half> [[SPV_REFRACT_I]] +// +half3 test_refract_half3(half3 I, half3 N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x half> @_Z18test_refract_half4Dv4_DhS_Dh( +// CHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.dx.fdot.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn half %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> splat (half 0xH3C00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> splat (half 0xH3C00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.sqrt.v4f16(<4 x half> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <4 x half> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x half> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x half> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x half> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <4 x i1> 
[[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <4 x half> zeroinitializer, <4 x half> %{{.*}} +// CHECK: ret <4 x half> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x half> @_Z18test_refract_half4Dv4_DhS_Dh( +// SPVCHECK-SAME: <4 x half> noundef nofpclass(nan inf) [[I:%.*]], <4 x half> noundef nofpclass(nan inf) [[N:%.*]], half noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.refract.v4f16.f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}, half %{{.*}}) +// SPVCHECK: ret <4 x half> [[SPV_REFRACT_I]] +// +half4 test_refract_half4(half4 I, half4 N, half ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) float @_Z18test_refract_floatfff( +// CHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL1_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL2_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float 1.000000e+00, [[MUL2_I]] +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[MUL1_I]], [[SUB_I]] +// CHECK: [[SUB4_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float 1.000000e+00, [[MUL3_I]] +// CHECK: [[MUL5_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL6_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.sqrt.f32(float %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz 
arcp afn float [[MUL6_I]], %{{.*}} +// CHECK: [[MUL7_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float [[ADD_I]], %{{.*}} +// CHECK: [[SUB8_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn float %{{.*}}, [[MUL7_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt float %{{.*}}, 0.000000e+00 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CMP_I]], float 0.000000e+00, float %{{.*}} +// CHECK: ret float [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) float @_Z18test_refract_floatfff( +// SPVCHECK-SAME: float noundef nofpclass(nan inf) [[I:%.*]], float noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef float @llvm.spv.refract.f32.f32(float %{{.*}}, float %{{.*}}, float %{{.*}}) +// SPVCHECK: ret float [[SPV_REFRACT_I]] +// +float test_refract_float(float I, float N, float ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <2 x float> @_Z19test_refract_float2Dv2_fS_f( +// CHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> splat (float 1.000000e+00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> splat (float 
1.000000e+00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <2 x float> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <2 x float> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <2 x float> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <2 x float> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <2 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <2 x float> zeroinitializer, <2 x float> %{{.*}} +// CHECK: ret <2 x float> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <2 x float> @_Z19test_refract_float2Dv2_fS_f( +// SPVCHECK-SAME: <2 x float> noundef nofpclass(nan inf) [[I:%.*]], <2 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <2 x float> @llvm.spv.refract.v2f32.f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}, float %{{.*}}) +// SPVCHECK: ret <2 x float> [[SPV_REFRACT_I]] +// +float2 test_refract_float2(float2 I, float2 N, float ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <3 x float> @_Z19test_refract_float3Dv3_fS_f( +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan 
ninf nsz arcp afn float @llvm.dx.fdot.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> splat (float 1.000000e+00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <3 x float> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <3 x float> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <3 x float> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <3 x float> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <3 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 [[CAST]], <3 x float> zeroinitializer, <3 x float> %{{.*}} +// CHECK: ret <3 x float> [[HLSL_SELECT_I]] +// +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <3 x float> @_Z19test_refract_float3Dv3_fS_f( +// SPVCHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[I:%.*]], <3 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <3 x float> @llvm.spv.refract.v3f32.f32(<3 x 
float> %{{.*}}, <3 x float> %{{.*}}, float %{{.*}}) +// SPVCHECK: ret <3 x float> [[SPV_REFRACT_I]] +// +float3 test_refract_float3(float3 I, float3 N, float ETA) { + return refract(I, N, ETA); +} + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <4 x float> @_Z19test_refract_float4Dv4_fS_f +// CHECK-SAME: <4 x float> noundef nofpclass(nan inf) [[I:%.*]], <4 x float> noundef nofpclass(nan inf) [[N:%.*]], float noundef nofpclass(nan inf) [[ETA:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[HLSL_DOT_I:%.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.dx.fdot.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) +// CHECK: [[MUL_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn float %{{.*}}, %{{.*}} +// CHECK: [[MUL3_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, %{{.*}} +// CHECK: [[SUB_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), [[MUL3_I]] +// CHECK: [[MUL4_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, [[SUB_I]] +// CHECK: [[SUB5_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> splat (float 1.000000e+00), [[MUL4_I]] +// CHECK: [[MUL8_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, %{{.*}} +// CHECK: [[MUL11_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> %{{.*}}, %{{.*}} +// CHECK: [[TMP17:%.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.*}}) +// CHECK: [[ADD_I:%.*]] = fadd reassoc nnan ninf nsz arcp afn <4 x float> [[MUL11_I]], [[TMP17]] +// CHECK: [[MUL12_I:%.*]] = fmul reassoc nnan ninf nsz arcp afn <4 x float> [[ADD_I]], %{{.*}} +// CHECK: [[SUB13_I:%.*]] = fsub reassoc nnan ninf nsz arcp afn <4 x float> [[MUL8_I]], [[MUL12_I]] +// CHECK: [[CMP_I:%.*]] = fcmp reassoc nnan ninf nsz arcp afn olt <4 x float> %{{.*}}, zeroinitializer +// CHECK: [[CAST:%.*]] = extractelement <4 x i1> [[CMP_I]], i32 0 +// CHECK: [[HLSL_SELECT_I:%.*]] = select reassoc nnan ninf nsz arcp afn i1 
[[CAST]], <4 x float> zeroinitializer, <4 x float> %{{.*}} +// CHECK: ret <4 x float> [[HLSL_SELECT_I]] + +// SPVCHECK-LABEL: define hidden spir_func noundef nofpclass(nan inf) <4 x float> @_Z19test_refract_float4Dv4_fS_f( +// SPVCHECK-SAME: <4 x float> noundef nofpclass(nan inf) %{{.*}}, <4 x float> noundef nofpclass(nan inf) %{{.*}}, float noundef nofpclass(nan inf) %{{.*}}) #[[ATTR0:[0-9]+]] { +// SPVCHECK: [[ENTRY:.*:]] +// SPVCHECK: [[SPV_REFRACT_I:%.*]] = call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.refract.v4f32.f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, float %{{.*}}) +// SPVCHECK: ret <4 x float> [[SPV_REFRACT_I]] +// +float4 test_refract_float4(float4 I, float4 N, float ETA) { + return refract(I, N, ETA); +} diff --git a/clang/test/CodeGenSPIRV/Builtins/refract.c b/clang/test/CodeGenSPIRV/Builtins/refract.c new file mode 100644 index 0000000000000..f399462d68d4a --- /dev/null +++ b/clang/test/CodeGenSPIRV/Builtins/refract.c @@ -0,0 +1,74 @@ +// RUN: %clang_cc1 -O1 -triple spirv-pc-vulkan-compute %s -fnative-half-type -emit-llvm -o - | FileCheck %s + +typedef _Float16 half; +typedef half half2 __attribute__((ext_vector_type(2))); +typedef half half3 __attribute__((ext_vector_type(3))); +typedef half half4 __attribute__((ext_vector_type(4))); +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); + +// CHECK-LABEL: define spir_func half @test_refract_half( +// CHECK-SAME: half noundef [[I:%.*]], half noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call half @llvm.spv.refract.f16.f16(half [[I]], half [[N]], half [[ETA]]) +// CHECK-NEXT: ret half [[SPV_REFRACT]] +// +half test_refract_half(half I, half N, half eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <2 x half> 
@test_refract_half2( +// CHECK-SAME: <2 x half> noundef [[I:%.*]], <2 x half> noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call <2 x half> @llvm.spv.refract.v2f16.f16(<2 x half> [[I]], <2 x half> [[N]], half [[ETA]]) +// CHECK-NEXT: ret <2 x half> [[SPV_REFRACT]] +// +half2 test_refract_half2(half2 I, half2 N, half eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <3 x half> @test_refract_half3( +// CHECK-SAME: <3 x half> noundef [[I:%.*]], <3 x half> noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <3 x half> @llvm.spv.refract.v3f16.f16(<3 x half> [[I]], <3 x half> [[N]], half [[ETA]]) +// CHECK-NEXT: ret <3 x half> [[SPV_REFRACT]] +// +half3 test_refract_half3(half3 I, half3 N, half eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <4 x half> @test_refract_half4( +// CHECK-SAME: <4 x half> noundef [[I:%.*]], <4 x half> noundef [[N:%.*]], half noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <4 x half> @llvm.spv.refract.v4f16.f16(<4 x half> [[I]], <4 x half> [[N]], half [[ETA]]) +// CHECK-NEXT: ret <4 x half> [[SPV_REFRACT]] +// +half4 test_refract_half4(half4 I, half4 N, half eta) { return __builtin_spirv_refract(I, N, eta); } + + +// CHECK-LABEL: define spir_func float @test_refract_float( +// CHECK-SAME: float noundef [[I:%.*]], float noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call float @llvm.spv.refract.f32.f32(float [[I]], float [[N]], float [[ETA]]) +// CHECK-NEXT: ret float [[SPV_REFRACT]] +// +float test_refract_float(float I, float N, float eta) { return __builtin_spirv_refract(I, 
N, eta); } + +// CHECK-LABEL: define spir_func <2 x float> @test_refract_float2( +// CHECK-SAME: <2 x float> noundef [[I:%.*]], <2 x float> noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK: [[SPV_REFRACT:%.*]] = tail call <2 x float> @llvm.spv.refract.v2f32.f32(<2 x float> [[I]], <2 x float> [[N]], float [[ETA]]) +// CHECK-NEXT: ret <2 x float> [[SPV_REFRACT]] +// +float2 test_refract_float2(float2 I, float2 N, float eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <3 x float> @test_refract_float3( +// CHECK-SAME: <3 x float> noundef [[I:%.*]], <3 x float> noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <3 x float> @llvm.spv.refract.v3f32.f32(<3 x float> [[I]], <3 x float> [[N]], float [[ETA]]) +// CHECK-NEXT: ret <3 x float> [[SPV_REFRACT]] +// +float3 test_refract_float3(float3 I, float3 N, float eta) { return __builtin_spirv_refract(I, N, eta); } + +// CHECK-LABEL: define spir_func <4 x float> @test_refract_float4( +// CHECK-SAME: <4 x float> noundef [[I:%.*]], <4 x float> noundef [[N:%.*]], float noundef [[ETA:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SPV_REFRACT:%.*]] = tail call <4 x float> @llvm.spv.refract.v4f32.f32(<4 x float> [[I]], <4 x float> [[N]], float [[ETA]]) +// CHECK-NEXT: ret <4 x float> [[SPV_REFRACT]] +// +float4 test_refract_float4(float4 I, float4 N, float eta) { return __builtin_spirv_refract(I, N, eta); } diff --git a/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl new file mode 100644 index 0000000000000..6cb3e56c20f0e --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/refract-errors.hlsl @@ -0,0 +1,66 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only 
-disable-llvm-passes -verify + +float test_no_second_arg(float3 p0) { + return refract(p0); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 1 was provided}} +} + +float test_no_third_arg(float3 p0) { + return refract(p0, p0); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 2 were provided}} +} + +float test_too_many_arg(float2 p0) { + return refract(p0, p0, p0, p0); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were provided}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate function template not viable: requires 3 arguments, but 4 were 
provided}} +} + +float test_double_inputs(double p0, double p1, double p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} +} + +float test_int_inputs(int p0, int p1, int p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored}} +} + +float1 test_vec1_inputs(float1 p0, float1 p1, float1 p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with T = float1]: no type named 'Type' in 'hlsl::__detail::enable_if>'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with T = float1]: no type named 'Type' in 'hlsl::__detail::enable_if>'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 1]: no type named 'Type' in 'hlsl::__detail::enable_if'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 1]: no type named 'Type' in 'hlsl::__detail::enable_if'}} +} + +typedef float float5 __attribute__((ext_vector_type(5))); + +float5 test_vec5_inputs(float5 p0, float5 p1, float p2) { + return refract(p0, p1, p2); + // expected-error@-1 {{no matching function for call to 'refract'}} + // 
expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: deduced conflicting types for parameter 'T' ('float5' (vector of 5 'float' values) vs. 'float')}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: deduced conflicting types for parameter 'T' ('float5' (vector of 5 'float' values) vs. 'float')}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 5]: no type named 'Type' in 'hlsl::__detail::enable_if'}} + // expected-note@hlsl/hlsl_intrinsics.h:* {{candidate template ignored: substitution failure [with L = 5]: no type named 'Type' in 'hlsl::__detail::enable_if'}} +} diff --git a/clang/test/SemaSPIRV/BuiltIns/refract-errors.c b/clang/test/SemaSPIRV/BuiltIns/refract-errors.c new file mode 100644 index 0000000000000..07486c2a60cbf --- /dev/null +++ b/clang/test/SemaSPIRV/BuiltIns/refract-errors.c @@ -0,0 +1,41 @@ +// RUN: %clang_cc1 %s -triple spirv-pc-vulkan-compute -verify + +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef _Float16 half; +typedef half half2 __attribute__((ext_vector_type(2))); + +float2 test_no_third_arg(float2 p0) { + return __builtin_spirv_refract(p0, p0); + // expected-error@-1 {{too few arguments to function call, expected 3, have 2}} +} + +float2 test_too_many_arg(float2 p0, float p1) { + return __builtin_spirv_refract(p0, p0, p1, p1); + // expected-error@-1 {{too many arguments to function call, expected 3, have 4}} +} + +float test_double_scalar_inputs(double p0, double p1, double p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'double')}} +} + +float test_int_scalar_inputs(int p0, int p1, int p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{1st argument must be a scalar or vector of 16 or 32 bit floating-point types (was 'int')}} +} + 
+float test_float_and_half_inputs(float2 p0, half2 p1, float p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{first two arguments to '__builtin_spirv_refract' must have the same type}} +} + +float test_float_and_half_2_inputs(float2 p0, float2 p1, half p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{all arguments to '__builtin_spirv_refract' must be of scalar or vector type with matching scalar element type: 'float2' (vector of 2 'float' values) vs 'half' (aka '_Float16')}} +} + +float2 test_mismatch_vector_size_inputs(float2 p0, float3 p1, float p2) { + return __builtin_spirv_refract(p0, p1, p2); + // expected-error@-1 {{first two arguments to '__builtin_spirv_refract' must have the same type}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 35c9cd63581d6..b5f0cdf479c08 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -90,7 +90,12 @@ let TargetPrefix = "spv" in { def int_spv_length : DefaultAttrsIntrinsic<[LLVMVectorElementType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_reflect : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>; - def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; + def int_spv_refract + : DefaultAttrsIntrinsic<[LLVMMatchType<0>], + [llvm_anyfloat_ty, LLVMMatchType<0>, + llvm_anyfloat_ty], + [IntrNoMem]>; +def int_spv_rsqrt : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; def int_spv_saturate : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_spv_smoothstep : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; def int_spv_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], 
[LLVMMatchType<0>, llvm_anyfloat_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index fd0bea0b90472..6608b3f2cbefd 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3120,6 +3120,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, return selectExtInst(ResVReg, ResType, I, CL::fract, GL::Fract); case Intrinsic::spv_normalize: return selectExtInst(ResVReg, ResType, I, CL::normalize, GL::Normalize); + case Intrinsic::spv_refract: + return selectExtInst(ResVReg, ResType, I, GL::Refract); case Intrinsic::spv_reflect: return selectExtInst(ResVReg, ResType, I, GL::Reflect); case Intrinsic::spv_rsqrt: diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/refract.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/refract.ll new file mode 100644 index 0000000000000..b18e929568534 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/refract.ll @@ -0,0 +1,36 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for refract are lowered correctly. 
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef <4 x half> @refract_half(<4 x half> noundef %I, <4 x half> noundef %N, half noundef %ETA) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#float_16:]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Refract %[[#arg0]] %[[#arg1]] %[[#arg2]] + %spv.refract.i = tail call reassoc nnan ninf nsz arcp afn noundef <4 x half> @llvm.spv.refract.v4f16.f16(<4 x half> %I, <4 x half> %N, half %ETA) + ret <4 x half> %spv.refract.i +} + +define noundef <4 x float> @refract_float4(<4 x float> noundef %I, <4 x float> noundef %N, float noundef %ETA) { +entry: + %conv.i = fpext reassoc nnan ninf nsz arcp afn float %ETA to double + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg2:]] = OpFunctionParameter %[[#float_32:]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Refract %[[#arg0]] %[[#arg1]] %[[#arg2]] + %spv.refract.i = tail call reassoc nnan ninf nsz arcp afn noundef <4 x float> @llvm.spv.refract.v4f32.f32(<4 x float> %I, <4 x float> %N, float %ETA) + ret <4 x float> %spv.refract.i +} + +declare <4 x half> @llvm.spv.refract.v4f16.f16(<4 x half>, <4 x half>, half) +declare <4 x float> @llvm.spv.reflect.v4f32.f32(<4 x float>, <4 x float>, float) diff --git a/llvm/test/CodeGen/SPIRV/opencl/refract-error.ll b/llvm/test/CodeGen/SPIRV/opencl/refract-error.ll new file mode 100644 
index 0000000000000..28208fb2e72f8 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/opencl/refract-error.ll @@ -0,0 +1,12 @@ +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s +; RUN: not llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s + +; CHECK: LLVM ERROR: %{{.*}} = G_INTRINSIC intrinsic(@llvm.spv.refract), %{{.*}}, %{{.*}}, %{{.*}} is only supported with the GLSL extended instruction set. + +define noundef <4 x float> @refract_float4(<4 x float> noundef %I, <4 x float> noundef %N, float noundef %ETA) { +entry: + %spv.refract = call <4 x float> @llvm.spv.refract.f32(<4 x float> %I, <4 x float> %N, float %ETA) + ret <4 x float> %spv.refract +} + +declare <4 x float> @llvm.spv.refract.f32(<4 x float>, <4 x float>, float) From 82451d0b1341a9b6c01eaa5d27088ff9f3287853 Mon Sep 17 00:00:00 2001 From: sribee8 Date: Wed, 16 Jul 2025 18:36:17 +0000 Subject: [PATCH 089/813] [libc] Updated exp fuzz tests (#148912) Fuzz tests were previously in the wrong format, updated them to correct format. 
--------- Co-authored-by: Sriya Pratipati --- libc/fuzzing/math/exp10_fuzz.cpp | 41 +++++++++++++++++++++----------- libc/fuzzing/math/exp2_fuzz.cpp | 41 +++++++++++++++++++++----------- libc/fuzzing/math/exp_fuzz.cpp | 41 +++++++++++++++++++++----------- libc/fuzzing/math/expm1_fuzz.cpp | 41 +++++++++++++++++++++----------- 4 files changed, 108 insertions(+), 56 deletions(-) diff --git a/libc/fuzzing/math/exp10_fuzz.cpp b/libc/fuzzing/math/exp10_fuzz.cpp index 2baef03a264a4..d939948b723a5 100644 --- a/libc/fuzzing/math/exp10_fuzz.cpp +++ b/libc/fuzzing/math/exp10_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/exp10.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_exp10(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::exp10(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_exp10(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::exp10(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + 
std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/exp2_fuzz.cpp b/libc/fuzzing/math/exp2_fuzz.cpp index 8a2959047a6ca..a29d3c00da672 100644 --- a/libc/fuzzing/math/exp2_fuzz.cpp +++ b/libc/fuzzing/math/exp2_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/exp2.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_exp2(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::exp2(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_exp2(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::exp2(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/exp_fuzz.cpp b/libc/fuzzing/math/exp_fuzz.cpp index 97bc12dfa64c9..66823596dc6fa 100644 --- 
a/libc/fuzzing/math/exp_fuzz.cpp +++ b/libc/fuzzing/math/exp_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/exp.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_exp(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::exp(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_exp(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::exp(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } diff --git a/libc/fuzzing/math/expm1_fuzz.cpp b/libc/fuzzing/math/expm1_fuzz.cpp index db507bb02b1d7..0690e449c3d23 100644 --- a/libc/fuzzing/math/expm1_fuzz.cpp +++ b/libc/fuzzing/math/expm1_fuzz.cpp @@ -12,27 +12,40 @@ #include "src/math/expm1.h" #include "utils/MPFRWrapper/mpfr_inc.h" +#include +#include +#include #include -extern "C" int LLVMFuzzerTestOneInput(double x) { - // remove 
NaN and inf - if (isnan(x) || isinf(x)) - return 0; - // signed zeros already tested in unit tests - if (signbit(x) && x == 0.0) - return 0; +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { mpfr_t input; mpfr_init2(input, 53); - mpfr_set_d(input, x, MPFR_RNDN); - int output = mpfr_expm1(input, input, MPFR_RNDN); - mpfr_subnormalize(input, output, MPFR_RNDN); - double to_compare = mpfr_get_d(input, MPFR_RNDN); + for (size_t i = 0; i < size / sizeof(double); ++i) { + double x; + std::memcpy(&x, data, sizeof(double)); + data += sizeof(double); - double result = LIBC_NAMESPACE::expm1(x); + // remove NaN and inf + if (isnan(x) || isinf(x)) + continue; + // signed zeros already tested in unit tests + if (signbit(x) && x == 0.0) + continue; - if (result != to_compare) - __builtin_trap(); + mpfr_set_d(input, x, MPFR_RNDN); + int output = mpfr_expm1(input, input, MPFR_RNDN); + mpfr_subnormalize(input, output, MPFR_RNDN); + double to_compare = mpfr_get_d(input, MPFR_RNDN); + double result = LIBC_NAMESPACE::expm1(x); + + if (result != to_compare) { + std::cout << std::hexfloat << "Failing input: " << x << std::endl; + std::cout << std::hexfloat << "Failing output: " << result << std::endl; + std::cout << std::hexfloat << "Expected: " << to_compare << std::endl; + __builtin_trap(); + } + } mpfr_clear(input); return 0; } From c4d4e761ef27d6dd27323cf3efa506db5e9e3457 Mon Sep 17 00:00:00 2001 From: "Mikhail R. 
Gadelha" Date: Tue, 15 Jul 2025 13:31:02 -0300 Subject: [PATCH 090/813] [RISCV] Pre-commit RVV instructions to the x60 scheduling model and tests --- .../lib/Target/RISCV/RISCVSchedSpacemitX60.td | 532 +- .../RISCV/rvv/vxrm-insert-out-of-loop.ll | 32 +- .../tools/llvm-mca/RISCV/SpacemitX60/atomic.s | 188 +- .../RISCV/SpacemitX60/floating-point.s | 182 +- .../llvm-mca/RISCV/SpacemitX60/integer.s | 244 +- .../RISCV/SpacemitX60/rvv-arithmetic.s | 6820 +++++++++++++++++ .../llvm-mca/RISCV/SpacemitX60/rvv-bitwise.s | 4328 +++++++++++ .../RISCV/SpacemitX60/rvv-comparison.s | 2704 +++++++ .../RISCV/SpacemitX60/rvv-conversion.s | 1757 +++++ .../llvm-mca/RISCV/SpacemitX60/rvv-fma.s | 2185 ++++++ .../tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s | 5599 ++++++++++++++ .../llvm-mca/RISCV/SpacemitX60/rvv-mask.s | 1864 +++++ .../llvm-mca/RISCV/SpacemitX60/rvv-minmax.s | 1108 +++ .../llvm-mca/RISCV/SpacemitX60/rvv-mul-div.s | 2984 ++++++++ .../RISCV/SpacemitX60/rvv-permutation.s | 3504 +++++++++ .../RISCV/SpacemitX60/rvv-reduction.s | 1824 +++++ 16 files changed, 35540 insertions(+), 315 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-bitwise.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-comparison.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-conversion.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fma.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-fp.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-mask.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-minmax.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-mul-div.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-permutation.s create mode 100644 llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-reduction.s diff --git a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td 
b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td index 05388f2d13113..3e286a754e4ee 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td @@ -13,6 +13,17 @@ // //===----------------------------------------------------------------------===// +class SMX60IsWorstCaseMX MxList> { + string LLMUL = LargestLMUL.r; + bit c = !eq(mx, LLMUL); +} + +class SMX60IsWorstCaseMXSEW MxList, bit isF = 0> { + string LLMUL = LargestLMUL.r; + int SSEW = SmallestSEW.r; + bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW)); +} + def SpacemitX60Model : SchedMachineModel { let IssueWidth = 2; // dual-issue let MicroOpBufferSize = 0; // in-order @@ -44,6 +55,19 @@ let BufferSize = 0 in { // floating point instructions, this model assumes single issue as // increasing it reduces the gains we saw in performance def SMX60_FP : ProcResource<1>; + + // Vector pipeline + // Single issue for vector store/load instructions + def SMX60_VLS : ProcResource<1>; + + // The C908 user manual says: "Vector floating-point units support vector + // floating-point computation of different bits. In addition, vector integer + // units are added". Developer confirmed it's a separate VIEU + def SMX60_VIEU : ProcResource<1>; + + // The C908 user manual says: "The vector execution unit is developed by + // extending the floating-point unit", so let's assume single issue for now + def SMX60_VFP : ProcResource<1>; } //===----------------------------------------------------------------------===// @@ -232,9 +256,341 @@ let Latency = 4 in { def : WriteRes; } +// 6. Configuration-Setting Instructions +def : WriteRes; +def : WriteRes; +def : WriteRes; + +// 7. 
Vector Loads and Stores +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + // Unit-stride loads and stores + defm "" : LMULWriteResMX<"WriteVLDE", [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDFF", [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTE", [SMX60_VLS], mx, IsWorstCase>; + + // Mask loads and stores + defm "" : LMULWriteResMX<"WriteVLDM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + defm "" : LMULWriteResMX<"WriteVSTM", [SMX60_VLS], mx, IsWorstCase=!eq(mx, "M1")>; + + // Strided and indexed loads and stores + foreach eew = [8, 16, 32, 64] in { + defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SMX60_VLS], mx, IsWorstCase>; + } +} + +// Segmented loads and stores +foreach mx = SchedMxList in { + foreach nf=2-8 in { + foreach eew = [8, 16, 32, 64] in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + // Unit-stride segmented + defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Strided/indexed segmented + defm "" : LMULWriteResMX<"WriteVLSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSSSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + + // Indexed segmented + defm "" : LMULWriteResMX<"WriteVLOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVLUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : 
LMULWriteResMX<"WriteVSUXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSOXSEG" # nf # "e" #eew, [SMX60_VLS], mx, IsWorstCase>; + } + } +} + +// Whole register move/load/store +foreach LMul = [1, 2, 4, 8] in { + def : WriteRes("WriteVLD" # LMul # "R"), [SMX60_VLS]>; + def : WriteRes("WriteVST" # LMul # "R"), [SMX60_VLS]>; + + def : WriteRes("WriteVMov" # LMul # "V"), [SMX60_VIEU]>; +} + +// 11. Vector Integer Arithmetic Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : 
LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>; +} + +// Widening +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>; +} + +// Vector Integer Division and Remainder +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +// Narrowing Shift and Clips +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : 
LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 12. Vector Fixed-Point Arithmetic Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVSALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSALUI", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVAALUX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSMulX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSShiftI", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 13. 
Vector Floating-Point Instructions +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +// Widening +foreach mx = SchedMxListW in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : 
LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFW in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// Narrowing +foreach mx = SchedMxListW in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; +} + +foreach mx = SchedMxListFW in { + foreach sew = SchedSEWSet.val in { + + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// Vector Floating-Point Division and Square Root +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// 14. 
Vector Reduction Operations +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListWRed in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListF in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxListFWRed in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SMX60_VFP], mx, sew, IsWorstCase>; + } +} + +// 15. Vector Mask Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVMALUV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMPopV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMFFSV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVMSFSV", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVIotaV", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVIdxV", [SMX60_VIEU], mx, IsWorstCase>; +} + +// 16. 
Vector Permutation Instructions +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>; + + defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>; +} + +def : WriteRes; +def : WriteRes; + +def : WriteRes; +def : WriteRes; + +// Gather and Compress +foreach mx = SchedMxList in { + foreach sew = SchedSEWSet.val in { + defvar IsWorstCase = SMX60IsWorstCaseMXSEW.c; + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>; + defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>; + } +} + +foreach mx = SchedMxList in { + defvar IsWorstCase = SMX60IsWorstCaseMX.c; + + defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>; +} + // Others def : WriteRes; def : WriteRes; +def : WriteRes; //===----------------------------------------------------------------------===// // Bypass and advance @@ -341,10 +697,184 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +// 6. Configuration-Setting Instructions +def : ReadAdvance; +def : ReadAdvance; + +// 7. 
Vector Loads and Stores +def : ReadAdvance; +def : ReadAdvance; +defm "" : LMULReadAdvance<"ReadVSTEV", 0>; +defm "" : LMULReadAdvance<"ReadVSTM", 0>; +def : ReadAdvance; +def : ReadAdvance; +defm "" : LMULReadAdvance<"ReadVSTS8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTS64V", 0>; +defm "" : LMULReadAdvance<"ReadVLDUXV", 0>; +defm "" : LMULReadAdvance<"ReadVLDOXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX8", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX16", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX32", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX64", 0>; +defm "" : LMULReadAdvance<"ReadVSTUXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTUX64V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX8", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX16", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX32", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX64", 0>; +defm "" : LMULReadAdvance<"ReadVSTOXV", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX8V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX16V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX32V", 0>; +defm "" : LMULReadAdvance<"ReadVSTOX64V", 0>; +// LMUL Aware +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +// 12. 
Vector Integer Arithmetic Instructions +defm : LMULReadAdvance<"ReadVIALUV", 0>; +defm : LMULReadAdvance<"ReadVIALUX", 0>; +defm : LMULReadAdvanceW<"ReadVIWALUV", 0>; +defm : LMULReadAdvanceW<"ReadVIWALUX", 0>; +defm : LMULReadAdvance<"ReadVExtV", 0>; +defm : LMULReadAdvance<"ReadVICALUV", 0>; +defm : LMULReadAdvance<"ReadVICALUX", 0>; +defm : LMULReadAdvance<"ReadVShiftV", 0>; +defm : LMULReadAdvance<"ReadVShiftX", 0>; +defm : LMULReadAdvanceW<"ReadVNShiftV", 0>; +defm : LMULReadAdvanceW<"ReadVNShiftX", 0>; +defm : LMULReadAdvance<"ReadVICmpV", 0>; +defm : LMULReadAdvance<"ReadVICmpX", 0>; +defm : LMULReadAdvance<"ReadVIMinMaxV", 0>; +defm : LMULReadAdvance<"ReadVIMinMaxX", 0>; +defm : LMULReadAdvance<"ReadVIMulV", 0>; +defm : LMULReadAdvance<"ReadVIMulX", 0>; +defm : LMULSEWReadAdvance<"ReadVIDivV", 0>; +defm : LMULSEWReadAdvance<"ReadVIDivX", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulV", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulX", 0>; +defm : LMULReadAdvance<"ReadVIMulAddV", 0>; +defm : LMULReadAdvance<"ReadVIMulAddX", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulAddV", 0>; +defm : LMULReadAdvanceW<"ReadVIWMulAddX", 0>; +defm : LMULReadAdvance<"ReadVIMergeV", 0>; +defm : LMULReadAdvance<"ReadVIMergeX", 0>; +defm : LMULReadAdvance<"ReadVIMovV", 0>; +defm : LMULReadAdvance<"ReadVIMovX", 0>; + +// 13. Vector Fixed-Point Arithmetic Instructions +defm "" : LMULReadAdvance<"ReadVSALUV", 0>; +defm "" : LMULReadAdvance<"ReadVSALUX", 0>; +defm "" : LMULReadAdvance<"ReadVAALUV", 0>; +defm "" : LMULReadAdvance<"ReadVAALUX", 0>; +defm "" : LMULReadAdvance<"ReadVSMulV", 0>; +defm "" : LMULReadAdvance<"ReadVSMulX", 0>; +defm "" : LMULReadAdvance<"ReadVSShiftV", 0>; +defm "" : LMULReadAdvance<"ReadVSShiftX", 0>; +defm "" : LMULReadAdvanceW<"ReadVNClipV", 0>; +defm "" : LMULReadAdvanceW<"ReadVNClipX", 0>; + +// 14. 
Vector Floating-Point Instructions +defm "" : LMULSEWReadAdvanceF<"ReadVFALUV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFALUF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWALUF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFDivV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFDivF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMulAddF", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWMulAddF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSqrtV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFRecpV", 0>; +defm "" : LMULReadAdvance<"ReadVFCmpV", 0>; +defm "" : LMULReadAdvance<"ReadVFCmpF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFMinMaxF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjV", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFSgnjF", 0>; +defm "" : LMULReadAdvance<"ReadVFClassV", 0>; +defm "" : LMULReadAdvance<"ReadVFMergeV", 0>; +defm "" : LMULReadAdvance<"ReadVFMergeF", 0>; +defm "" : LMULReadAdvance<"ReadVFMovF", 0>; +defm "" : LMULSEWReadAdvanceF<"ReadVFCvtIToFV", 0>; +defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>; +defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>; +defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>; + +// 15. 
Vector Reduction Operations +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +// 16. Vector Mask Instructions +defm "" : LMULReadAdvance<"ReadVMALUV", 0>; +defm "" : LMULReadAdvance<"ReadVMPopV", 0>; +defm "" : LMULReadAdvance<"ReadVMFFSV", 0>; +defm "" : LMULReadAdvance<"ReadVMSFSV", 0>; +defm "" : LMULReadAdvance<"ReadVIotaV", 0>; + +// 17. Vector Permutation Instructions +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +defm "" : LMULReadAdvance<"ReadVISlideV", 0>; +defm "" : LMULReadAdvance<"ReadVISlideX", 0>; +defm "" : LMULReadAdvance<"ReadVFSlideV", 0>; +defm "" : LMULReadAdvance<"ReadVFSlideF", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherVV_index", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVRGatherEI16VV_index", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVX_data", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVX_index", 0>; +defm "" : LMULReadAdvance<"ReadVRGatherVI_data", 0>; +defm "" : LMULSEWReadAdvance<"ReadVCompressV", 0>; +// LMUL Aware +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +// Others +def : ReadAdvance; +def : ReadAdvance; +foreach mx = SchedMxList in { + def : ReadAdvance("ReadVPassthru_" # mx), 0>; + foreach sew = SchedSEWSet.val in + def : ReadAdvance("ReadVPassthru_" # mx # "_E" # sew), 0>; +} + //===----------------------------------------------------------------------===// // Unsupported extensions defm : UnsupportedSchedQ; -defm : UnsupportedSchedV; defm : UnsupportedSchedZabha; defm : UnsupportedSchedZbkb; defm : UnsupportedSchedZbkx; diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll 
b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll index 261a1f8fd2c6c..7990dfc0880a5 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm-insert-out-of-loop.ll @@ -304,27 +304,27 @@ define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_ ; RV64X60-NEXT: li t1, 0 ; RV64X60-NEXT: addi s1, a7, -1 ; RV64X60-NEXT: zext.w s1, s1 -; RV64X60-NEXT: mul t2, a1, s1 -; RV64X60-NEXT: mul t3, a3, s1 -; RV64X60-NEXT: mul t4, a5, s1 +; RV64X60-NEXT: mul t3, a1, s1 +; RV64X60-NEXT: mul t4, a3, s1 +; RV64X60-NEXT: mul t5, a5, s1 ; RV64X60-NEXT: add s0, a0, a6 -; RV64X60-NEXT: add s1, a2, a6 -; RV64X60-NEXT: add t5, a4, a6 -; RV64X60-NEXT: add s0, s0, t2 ; RV64X60-NEXT: csrr t2, vlenb -; RV64X60-NEXT: add t3, t3, s1 +; RV64X60-NEXT: add s1, a2, a6 +; RV64X60-NEXT: add t3, t3, s0 +; RV64X60-NEXT: add s0, a4, a6 +; RV64X60-NEXT: add t4, t4, s1 ; RV64X60-NEXT: li t6, 32 -; RV64X60-NEXT: add t4, t4, t5 -; RV64X60-NEXT: sltu t3, a0, t3 -; RV64X60-NEXT: sltu s1, a2, s0 -; RV64X60-NEXT: and t3, t3, s1 -; RV64X60-NEXT: or t5, a1, a3 -; RV64X60-NEXT: sltu s1, a0, t4 -; RV64X60-NEXT: sltu s0, a4, s0 -; RV64X60-NEXT: slti t4, t5, 0 +; RV64X60-NEXT: add t5, t5, s0 +; RV64X60-NEXT: sltu s0, a0, t4 +; RV64X60-NEXT: sltu s1, a2, t3 +; RV64X60-NEXT: and t4, s0, s1 +; RV64X60-NEXT: or s2, a1, a3 +; RV64X60-NEXT: sltu s0, a0, t5 +; RV64X60-NEXT: sltu s1, a4, t3 +; RV64X60-NEXT: slti t3, s2, 0 ; RV64X60-NEXT: and s0, s0, s1 ; RV64X60-NEXT: or s1, a1, a5 -; RV64X60-NEXT: or t4, t3, t4 +; RV64X60-NEXT: or t4, t4, t3 ; RV64X60-NEXT: slli t3, t2, 1 ; RV64X60-NEXT: slti s1, s1, 0 ; RV64X60-NEXT: or s0, s0, s1 diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s index bc9229471b20e..8838c862e6b75 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/atomic.s @@ -107,6 +107,9 @@ amomaxu.d.aqrl s5, s4, 
(s3) # CHECK-NEXT: [2] - SMX60_IEUA:1 # CHECK-NEXT: [3] - SMX60_IEUB:1 # CHECK-NEXT: [4] - SMX60_LS:2 +# CHECK-NEXT: [5] - SMX60_VFP:1 +# CHECK-NEXT: [6] - SMX60_VIEU:1 +# CHECK-NEXT: [7] - SMX60_VLS:1 # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps @@ -215,98 +218,101 @@ amomaxu.d.aqrl s5, s4, (s3) # CHECK-NEXT: [2] - SMX60_IEUB # CHECK-NEXT: [3.0] - SMX60_LS # CHECK-NEXT: [3.1] - SMX60_LS +# CHECK-NEXT: [4] - SMX60_VFP +# CHECK-NEXT: [5] - SMX60_VIEU +# CHECK-NEXT: [6] - SMX60_VLS # CHECK: Resource pressure per iteration: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] -# CHECK-NEXT: - - - 44.00 44.00 +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] +# CHECK-NEXT: - - - 44.00 44.00 - - - # CHECK: Resource pressure by instruction: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] Instructions: -# CHECK-NEXT: - - - 0.50 0.50 lr.w t0, (t1) -# CHECK-NEXT: - - - 0.50 0.50 lr.w.aq t1, (t2) -# CHECK-NEXT: - - - 0.50 0.50 lr.w.rl t2, (t3) -# CHECK-NEXT: - - - 0.50 0.50 lr.w.aqrl t3, (t4) -# CHECK-NEXT: - - - 0.50 0.50 sc.w t6, t5, (t4) -# CHECK-NEXT: - - - 0.50 0.50 sc.w.aq t5, t4, (t3) -# CHECK-NEXT: - - - 0.50 0.50 sc.w.rl t4, t3, (t2) -# CHECK-NEXT: - - - 0.50 0.50 sc.w.aqrl t3, t2, (t1) -# CHECK-NEXT: - - - 0.50 0.50 lr.d t0, (t1) -# CHECK-NEXT: - - - 0.50 0.50 lr.d.aq t1, (t2) -# CHECK-NEXT: - - - 0.50 0.50 lr.d.rl t2, (t3) -# CHECK-NEXT: - - - 0.50 0.50 lr.d.aqrl t3, (t4) -# CHECK-NEXT: - - - 0.50 0.50 sc.d t6, t5, (t4) -# CHECK-NEXT: - - - 0.50 0.50 sc.d.aq t5, t4, (t3) -# CHECK-NEXT: - - - 0.50 0.50 sc.d.rl t4, t3, (t2) -# CHECK-NEXT: - - - 0.50 0.50 sc.d.aqrl t3, t2, (t1) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.w a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.w a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.w a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.w a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.w a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.w a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.w s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.w 
s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.w s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.w.aq a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.w.aq a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.w.aq a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.w.aq a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.w.aq a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.w.aq a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.w.aq s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.w.aq s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.w.aq s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.w.rl a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.w.rl a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.w.rl a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.w.rl a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.w.rl a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.w.rl a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.w.rl s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.w.rl s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.w.rl s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.w.aqrl a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.w.aqrl a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.w.aqrl a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.w.aqrl a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.w.aqrl a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.w.aqrl a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.w.aqrl s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.w.aqrl s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.w.aqrl s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.d a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.d a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.d a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.d a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.d a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.d a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.d s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 
0.50 amominu.d s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.d s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.d.aq a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.d.aq a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.d.aq a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.d.aq a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.d.aq a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.d.aq a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.d.aq s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.d.aq s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.d.aq s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.d.rl a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.d.rl a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.d.rl a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.d.rl a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.d.rl a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.d.rl a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.d.rl s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.d.rl s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.d.rl s5, s4, (s3) -# CHECK-NEXT: - - - 0.50 0.50 amoswap.d.aqrl a4, ra, (s0) -# CHECK-NEXT: - - - 0.50 0.50 amoadd.d.aqrl a1, a2, (a3) -# CHECK-NEXT: - - - 0.50 0.50 amoxor.d.aqrl a2, a3, (a4) -# CHECK-NEXT: - - - 0.50 0.50 amoand.d.aqrl a3, a4, (a5) -# CHECK-NEXT: - - - 0.50 0.50 amoor.d.aqrl a4, a5, (a6) -# CHECK-NEXT: - - - 0.50 0.50 amomin.d.aqrl a5, a6, (a7) -# CHECK-NEXT: - - - 0.50 0.50 amomax.d.aqrl s7, s6, (s5) -# CHECK-NEXT: - - - 0.50 0.50 amominu.d.aqrl s6, s5, (s4) -# CHECK-NEXT: - - - 0.50 0.50 amomaxu.d.aqrl s5, s4, (s3) +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.w t0, (t1) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.w.aq t1, (t2) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.w.rl t2, (t3) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.w.aqrl t3, (t4) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.w t6, t5, (t4) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.w.aq 
t5, t4, (t3) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.w.rl t4, t3, (t2) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.w.aqrl t3, t2, (t1) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.d t0, (t1) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.d.aq t1, (t2) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.d.rl t2, (t3) +# CHECK-NEXT: - - - 0.50 0.50 - - - lr.d.aqrl t3, (t4) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.d t6, t5, (t4) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.d.aq t5, t4, (t3) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.d.rl t4, t3, (t2) +# CHECK-NEXT: - - - 0.50 0.50 - - - sc.d.aqrl t3, t2, (t1) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.w a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.w a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.w a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.w a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.w a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.w a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.w s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.w s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.w s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.w.aq a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.w.aq a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.w.aq a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.w.aq a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.w.aq a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.w.aq a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.w.aq s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.w.aq s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.w.aq s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.w.rl a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.w.rl a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.w.rl a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.w.rl a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.w.rl a4, a5, (a6) +# CHECK-NEXT: - - 
- 0.50 0.50 - - - amomin.w.rl a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.w.rl s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.w.rl s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.w.rl s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.w.aqrl a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.w.aqrl a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.w.aqrl a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.w.aqrl a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.w.aqrl a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.w.aqrl a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.w.aqrl s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.w.aqrl s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.w.aqrl s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.d a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.d a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.d a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.d a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.d a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.d a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.d s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.d s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.d s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.d.aq a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.d.aq a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.d.aq a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.d.aq a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.d.aq a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.d.aq a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.d.aq s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.d.aq s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.d.aq s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.d.rl a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - 
- amoadd.d.rl a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.d.rl a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.d.rl a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.d.rl a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.d.rl a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.d.rl s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.d.rl s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.d.rl s5, s4, (s3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoswap.d.aqrl a4, ra, (s0) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoadd.d.aqrl a1, a2, (a3) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoxor.d.aqrl a2, a3, (a4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoand.d.aqrl a3, a4, (a5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amoor.d.aqrl a4, a5, (a6) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomin.d.aqrl a5, a6, (a7) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomax.d.aqrl s7, s6, (s5) +# CHECK-NEXT: - - - 0.50 0.50 - - - amominu.d.aqrl s6, s5, (s4) +# CHECK-NEXT: - - - 0.50 0.50 - - - amomaxu.d.aqrl s5, s4, (s3) diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s index b86fcbccbeabb..78f4e7f50c745 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/floating-point.s @@ -135,6 +135,9 @@ fclass.d a3, ft10 # CHECK-NEXT: [2] - SMX60_IEUA:1 # CHECK-NEXT: [3] - SMX60_IEUB:1 # CHECK-NEXT: [4] - SMX60_LS:2 +# CHECK-NEXT: [5] - SMX60_VFP:1 +# CHECK-NEXT: [6] - SMX60_VIEU:1 +# CHECK-NEXT: [7] - SMX60_VLS:1 # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps @@ -240,95 +243,98 @@ fclass.d a3, ft10 # CHECK-NEXT: [2] - SMX60_IEUB # CHECK-NEXT: [3.0] - SMX60_LS # CHECK-NEXT: [3.1] - SMX60_LS +# CHECK-NEXT: [4] - SMX60_VFP +# CHECK-NEXT: [5] - SMX60_VIEU +# CHECK-NEXT: [6] - SMX60_VLS # CHECK: Resource pressure per iteration: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] -# CHECK-NEXT: 149.00 11.00 11.00 3.00 3.00 +# 
CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] +# CHECK-NEXT: 149.00 11.00 11.00 3.00 3.00 - - - # CHECK: Resource pressure by instruction: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] Instructions: -# CHECK-NEXT: - - - 0.50 0.50 flh ft0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 fsh ft0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 flw ft0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 fsw ft0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 fld ft0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 fsd ft0, 0(a0) -# CHECK-NEXT: 1.00 - - - - fadd.h fs10, fs11, ft8 -# CHECK-NEXT: 1.00 - - - - fsub.h ft9, ft10, ft11 -# CHECK-NEXT: 1.00 - - - - fmul.h ft0, ft1, ft2 -# CHECK-NEXT: 12.00 - - - - fdiv.h ft3, ft4, ft5 -# CHECK-NEXT: 12.00 - - - - fsqrt.h ft6, ft7 -# CHECK-NEXT: 1.00 - - - - fmin.h fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fmax.h fs2, fs3, fs4 -# CHECK-NEXT: 1.00 - - - - fmadd.h fa0, fa1, fa2, ft11 -# CHECK-NEXT: 1.00 - - - - fmsub.h fa4, fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fnmsub.h fs2, fs3, fs4, fs5 -# CHECK-NEXT: 1.00 - - - - fnmadd.h fs6, fs7, fs8, fs9 -# CHECK-NEXT: 1.00 - - - - fadd.s fs10, fs11, ft8 -# CHECK-NEXT: 1.00 - - - - fsub.s ft9, ft10, ft11 -# CHECK-NEXT: 1.00 - - - - fmul.s ft0, ft1, ft2 -# CHECK-NEXT: 15.00 - - - - fdiv.s ft3, ft4, ft5 -# CHECK-NEXT: 15.00 - - - - fsqrt.s ft6, ft7 -# CHECK-NEXT: 1.00 - - - - fmin.s fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fmax.s fs2, fs3, fs4 -# CHECK-NEXT: 1.00 - - - - fmadd.s fa0, fa1, fa2, ft11 -# CHECK-NEXT: 1.00 - - - - fmsub.s fa4, fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fnmsub.s fs2, fs3, fs4, fs5 -# CHECK-NEXT: 1.00 - - - - fnmadd.s fs6, fs7, fs8, fs9 -# CHECK-NEXT: 1.00 - - - - fadd.d fs10, fs11, ft8 -# CHECK-NEXT: 1.00 - - - - fsub.d ft9, ft10, ft11 -# CHECK-NEXT: 1.00 - - - - fmul.d ft0, ft1, ft2 -# CHECK-NEXT: 22.00 - - - - fdiv.d ft3, ft4, ft5 -# CHECK-NEXT: 22.00 - - - - fsqrt.d ft6, ft7 -# CHECK-NEXT: 1.00 - - - - fmin.d fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fmax.d fs2, fs3, fs4 -# CHECK-NEXT: 1.00 - - - - fmadd.d fa0, fa1, 
fa2, ft11 -# CHECK-NEXT: 1.00 - - - - fmsub.d fa4, fa5, fa6, fa7 -# CHECK-NEXT: 1.00 - - - - fnmsub.d fs2, fs3, fs4, fs5 -# CHECK-NEXT: 1.00 - - - - fnmadd.d fs6, fs7, fs8, fs9 -# CHECK-NEXT: - 0.50 0.50 - - fmv.x.h a2, fs7 -# CHECK-NEXT: - 0.50 0.50 - - fmv.h.x ft1, a6 -# CHECK-NEXT: 1.00 - - - - fcvt.s.h fa0, ft0 -# CHECK-NEXT: 1.00 - - - - fcvt.s.h fa0, ft0, rup -# CHECK-NEXT: 1.00 - - - - fcvt.h.s ft2, fa2 -# CHECK-NEXT: 1.00 - - - - fcvt.d.h fa0, ft0 -# CHECK-NEXT: 1.00 - - - - fcvt.d.h fa0, ft0, rup -# CHECK-NEXT: 1.00 - - - - fcvt.h.d ft2, fa2 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.w.s a0, fs5 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.wu.s a1, fs6 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.s.w ft11, a4 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.s.wu ft0, a5 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.l.s a0, ft0 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.lu.s a1, ft1 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.s.l ft2, a2 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.s.lu ft3, a3 -# CHECK-NEXT: - 0.50 0.50 - - fmv.x.w a2, fs7 -# CHECK-NEXT: - 0.50 0.50 - - fmv.w.x ft1, a6 -# CHECK-NEXT: 1.00 - - - - fsgnj.s fs1, fa0, fa1 -# CHECK-NEXT: 1.00 - - - - fsgnjn.s fa1, fa3, fa4 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.wu.d a4, ft11 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.w.d a4, ft11 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.d.w ft0, a5 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.d.wu ft1, a6 -# CHECK-NEXT: 1.00 - - - - fcvt.s.d fs5, fs6 -# CHECK-NEXT: 1.00 - - - - fcvt.d.s fs7, fs8 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.l.d a0, ft0 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.lu.d a1, ft1 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.d.l ft3, a3 -# CHECK-NEXT: - 0.50 0.50 - - fcvt.d.lu ft4, a4 -# CHECK-NEXT: - 0.50 0.50 - - fmv.x.d a2, ft2 -# CHECK-NEXT: - 0.50 0.50 - - fmv.d.x ft5, a5 -# CHECK-NEXT: 1.00 - - - - fsgnj.d fs1, fa0, fa1 -# CHECK-NEXT: 1.00 - - - - fsgnjn.d fa1, fa3, fa4 -# CHECK-NEXT: 1.00 - - - - feq.h a1, fs8, fs9 -# CHECK-NEXT: 1.00 - - - - flt.h a2, fs10, fs11 -# CHECK-NEXT: 1.00 - - - - fle.h a3, ft8, ft9 -# CHECK-NEXT: 1.00 - - - - feq.s a1, 
fs8, fs9 -# CHECK-NEXT: 1.00 - - - - flt.s a2, fs10, fs11 -# CHECK-NEXT: 1.00 - - - - fle.s a3, ft8, ft9 -# CHECK-NEXT: 1.00 - - - - feq.d a1, fs8, fs9 -# CHECK-NEXT: 1.00 - - - - flt.d a2, fs10, fs11 -# CHECK-NEXT: 1.00 - - - - fle.d a3, ft8, ft9 -# CHECK-NEXT: 1.00 - - - - fclass.s a3, ft10 -# CHECK-NEXT: 1.00 - - - - fclass.s a3, ft10 -# CHECK-NEXT: 1.00 - - - - fclass.d a3, ft10 +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: +# CHECK-NEXT: - - - 0.50 0.50 - - - flh ft0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - fsh ft0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - flw ft0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - fsw ft0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - fld ft0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - fsd ft0, 0(a0) +# CHECK-NEXT: 1.00 - - - - - - - fadd.h fs10, fs11, ft8 +# CHECK-NEXT: 1.00 - - - - - - - fsub.h ft9, ft10, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmul.h ft0, ft1, ft2 +# CHECK-NEXT: 12.00 - - - - - - - fdiv.h ft3, ft4, ft5 +# CHECK-NEXT: 12.00 - - - - - - - fsqrt.h ft6, ft7 +# CHECK-NEXT: 1.00 - - - - - - - fmin.h fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fmax.h fs2, fs3, fs4 +# CHECK-NEXT: 1.00 - - - - - - - fmadd.h fa0, fa1, fa2, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmsub.h fa4, fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fnmsub.h fs2, fs3, fs4, fs5 +# CHECK-NEXT: 1.00 - - - - - - - fnmadd.h fs6, fs7, fs8, fs9 +# CHECK-NEXT: 1.00 - - - - - - - fadd.s fs10, fs11, ft8 +# CHECK-NEXT: 1.00 - - - - - - - fsub.s ft9, ft10, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmul.s ft0, ft1, ft2 +# CHECK-NEXT: 15.00 - - - - - - - fdiv.s ft3, ft4, ft5 +# CHECK-NEXT: 15.00 - - - - - - - fsqrt.s ft6, ft7 +# CHECK-NEXT: 1.00 - - - - - - - fmin.s fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fmax.s fs2, fs3, fs4 +# CHECK-NEXT: 1.00 - - - - - - - fmadd.s fa0, fa1, fa2, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmsub.s fa4, fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fnmsub.s fs2, fs3, fs4, fs5 +# CHECK-NEXT: 1.00 - - - - - 
- - fnmadd.s fs6, fs7, fs8, fs9 +# CHECK-NEXT: 1.00 - - - - - - - fadd.d fs10, fs11, ft8 +# CHECK-NEXT: 1.00 - - - - - - - fsub.d ft9, ft10, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmul.d ft0, ft1, ft2 +# CHECK-NEXT: 22.00 - - - - - - - fdiv.d ft3, ft4, ft5 +# CHECK-NEXT: 22.00 - - - - - - - fsqrt.d ft6, ft7 +# CHECK-NEXT: 1.00 - - - - - - - fmin.d fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fmax.d fs2, fs3, fs4 +# CHECK-NEXT: 1.00 - - - - - - - fmadd.d fa0, fa1, fa2, ft11 +# CHECK-NEXT: 1.00 - - - - - - - fmsub.d fa4, fa5, fa6, fa7 +# CHECK-NEXT: 1.00 - - - - - - - fnmsub.d fs2, fs3, fs4, fs5 +# CHECK-NEXT: 1.00 - - - - - - - fnmadd.d fs6, fs7, fs8, fs9 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.x.h a2, fs7 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.h.x ft1, a6 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.s.h fa0, ft0 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.s.h fa0, ft0, rup +# CHECK-NEXT: 1.00 - - - - - - - fcvt.h.s ft2, fa2 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.d.h fa0, ft0 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.d.h fa0, ft0, rup +# CHECK-NEXT: 1.00 - - - - - - - fcvt.h.d ft2, fa2 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.w.s a0, fs5 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.wu.s a1, fs6 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.s.w ft11, a4 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.s.wu ft0, a5 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.l.s a0, ft0 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.lu.s a1, ft1 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.s.l ft2, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.s.lu ft3, a3 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.x.w a2, fs7 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.w.x ft1, a6 +# CHECK-NEXT: 1.00 - - - - - - - fsgnj.s fs1, fa0, fa1 +# CHECK-NEXT: 1.00 - - - - - - - fsgnjn.s fa1, fa3, fa4 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.wu.d a4, ft11 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.w.d a4, ft11 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.d.w ft0, a5 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.d.wu ft1, a6 +# CHECK-NEXT: 1.00 - - - - - 
- - fcvt.s.d fs5, fs6 +# CHECK-NEXT: 1.00 - - - - - - - fcvt.d.s fs7, fs8 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.l.d a0, ft0 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.lu.d a1, ft1 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.d.l ft3, a3 +# CHECK-NEXT: - 0.50 0.50 - - - - - fcvt.d.lu ft4, a4 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.x.d a2, ft2 +# CHECK-NEXT: - 0.50 0.50 - - - - - fmv.d.x ft5, a5 +# CHECK-NEXT: 1.00 - - - - - - - fsgnj.d fs1, fa0, fa1 +# CHECK-NEXT: 1.00 - - - - - - - fsgnjn.d fa1, fa3, fa4 +# CHECK-NEXT: 1.00 - - - - - - - feq.h a1, fs8, fs9 +# CHECK-NEXT: 1.00 - - - - - - - flt.h a2, fs10, fs11 +# CHECK-NEXT: 1.00 - - - - - - - fle.h a3, ft8, ft9 +# CHECK-NEXT: 1.00 - - - - - - - feq.s a1, fs8, fs9 +# CHECK-NEXT: 1.00 - - - - - - - flt.s a2, fs10, fs11 +# CHECK-NEXT: 1.00 - - - - - - - fle.s a3, ft8, ft9 +# CHECK-NEXT: 1.00 - - - - - - - feq.d a1, fs8, fs9 +# CHECK-NEXT: 1.00 - - - - - - - flt.d a2, fs10, fs11 +# CHECK-NEXT: 1.00 - - - - - - - fle.d a3, ft8, ft9 +# CHECK-NEXT: 1.00 - - - - - - - fclass.s a3, ft10 +# CHECK-NEXT: 1.00 - - - - - - - fclass.s a3, ft10 +# CHECK-NEXT: 1.00 - - - - - - - fclass.d a3, ft10 diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s index b72540f29f487..51a036aaae784 100644 --- a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/integer.s @@ -170,6 +170,9 @@ bseti a0, a1, 1 # CHECK-NEXT: [2] - SMX60_IEUA:1 # CHECK-NEXT: [3] - SMX60_IEUB:1 # CHECK-NEXT: [4] - SMX60_LS:2 +# CHECK-NEXT: [5] - SMX60_VFP:1 +# CHECK-NEXT: [6] - SMX60_VIEU:1 +# CHECK-NEXT: [7] - SMX60_VLS:1 # CHECK: Instruction Info: # CHECK-NEXT: [1]: #uOps @@ -306,126 +309,129 @@ bseti a0, a1, 1 # CHECK-NEXT: [2] - SMX60_IEUB # CHECK-NEXT: [3.0] - SMX60_LS # CHECK-NEXT: [3.1] - SMX60_LS +# CHECK-NEXT: [4] - SMX60_VFP +# CHECK-NEXT: [5] - SMX60_VIEU +# CHECK-NEXT: [6] - SMX60_VLS # CHECK: Resource pressure per iteration: -# CHECK-NEXT: 
[0] [1] [2] [3.0] [3.1] -# CHECK-NEXT: - 180.50 44.50 5.50 5.50 +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] +# CHECK-NEXT: - 180.50 44.50 5.50 5.50 - - - # CHECK: Resource pressure by instruction: -# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] Instructions: -# CHECK-NEXT: - 0.50 0.50 - - addi a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - addiw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - slti a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - seqz a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - andi a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - ori a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - xori a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - slli a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - srli a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - srai a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - slliw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - srliw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - sraiw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - lui a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - auipc a1, 1 -# CHECK-NEXT: - 0.50 0.50 - - add a0, a0, a1 -# CHECK-NEXT: - 0.50 0.50 - - addw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - slt a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sltu a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - and a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - or a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - xor a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sll a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - srl a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sra a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sllw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - srlw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sraw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sub a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - subw a0, a0, a0 -# CHECK-NEXT: - 1.00 - - - jal a0, .Ltmp0 -# CHECK-NEXT: - 1.00 - - - jalr a0 -# CHECK-NEXT: - 1.00 - - - beq a0, a0, .Ltmp1 -# CHECK-NEXT: - 1.00 - - - bne a0, a0, .Ltmp2 -# CHECK-NEXT: - 1.00 - - - blt a0, a0, .Ltmp3 -# CHECK-NEXT: - 1.00 - - - bltu a0, a0, .Ltmp4 -# CHECK-NEXT: - 1.00 - - - bge a0, a0, .Ltmp5 -# CHECK-NEXT: - 1.00 - - - bgeu a0, a0, .Ltmp6 -# 
CHECK-NEXT: - 0.50 0.50 - - add a0, a0, a0 -# CHECK-NEXT: - - - 0.50 0.50 lb t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 lbu t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 lh t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 lhu t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 lw t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 lwu t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 ld t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 sb t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 sh t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 sw t0, 0(a0) -# CHECK-NEXT: - - - 0.50 0.50 sd t0, 0(a0) -# CHECK-NEXT: - 0.50 0.50 - - mul a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - mulh a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - mulhu a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - mulhsu a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - mulw a0, a0, a0 -# CHECK-NEXT: - 20.00 - - - div a0, a1, a2 -# CHECK-NEXT: - 20.00 - - - divu a0, a1, a2 -# CHECK-NEXT: - 20.00 - - - rem a0, a1, a2 -# CHECK-NEXT: - 20.00 - - - remu a0, a1, a2 -# CHECK-NEXT: - 12.00 - - - divw a0, a1, a2 -# CHECK-NEXT: - 12.00 - - - divuw a0, a1, a2 -# CHECK-NEXT: - 12.00 - - - remw a0, a1, a2 -# CHECK-NEXT: - 12.00 - - - remuw a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - csrrw t0, 4095, t1 -# CHECK-NEXT: - 0.50 0.50 - - csrrs s3, fflags, s5 -# CHECK-NEXT: - 0.50 0.50 - - csrrc sp, 0, ra -# CHECK-NEXT: - 0.50 0.50 - - csrrwi a5, 0, 0 -# CHECK-NEXT: - 0.50 0.50 - - csrrsi t2, 4095, 31 -# CHECK-NEXT: - 0.50 0.50 - - csrrci t1, sscratch, 5 -# CHECK-NEXT: - 0.50 0.50 - - czero.eqz a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - czero.nez a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - czero.eqz a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - czero.nez a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - add.uw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - slli.uw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - sh1add.uw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sh2add.uw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sh3add.uw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sh1add a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sh2add a0, a0, a0 -# 
CHECK-NEXT: - 0.50 0.50 - - sh3add a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - andn a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - orn a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - xnor a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - clz a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - clzw a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - ctz a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - ctzw a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - cpop a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - cpopw a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - min a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - minu a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - max a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - maxu a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sext.b a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - sext.h a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - zext.h a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - rol a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - rolw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - ror a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - rorw a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - rori a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - roriw a0, a0, 1 -# CHECK-NEXT: - 0.50 0.50 - - orc.b a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - rev8 a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - clmul a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - clmulr a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - clmulh a0, a0, a0 -# CHECK-NEXT: - 0.50 0.50 - - bclr a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - bclri a0, a1, 1 -# CHECK-NEXT: - 0.50 0.50 - - bext a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - bexti a0, a1, 1 -# CHECK-NEXT: - 0.50 0.50 - - binv a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - binvi a0, a1, 1 -# CHECK-NEXT: - 0.50 0.50 - - bset a0, a1, a2 -# CHECK-NEXT: - 0.50 0.50 - - bseti a0, a1, 1 +# CHECK-NEXT: [0] [1] [2] [3.0] [3.1] [4] [5] [6] Instructions: +# CHECK-NEXT: - 0.50 0.50 - - - - - addi a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - addiw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - slti a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - seqz a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - andi a0, a0, 1 +# CHECK-NEXT: - 0.50 
0.50 - - - - - ori a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - xori a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - slli a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - srli a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - srai a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - slliw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - srliw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - sraiw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - lui a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - auipc a1, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - add a0, a0, a1 +# CHECK-NEXT: - 0.50 0.50 - - - - - addw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - slt a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sltu a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - and a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - or a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - xor a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sll a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - srl a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sra a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sllw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - srlw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sraw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sub a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - subw a0, a0, a0 +# CHECK-NEXT: - 1.00 - - - - - - jal a0, .Ltmp0 +# CHECK-NEXT: - 1.00 - - - - - - jalr a0 +# CHECK-NEXT: - 1.00 - - - - - - beq a0, a0, .Ltmp1 +# CHECK-NEXT: - 1.00 - - - - - - bne a0, a0, .Ltmp2 +# CHECK-NEXT: - 1.00 - - - - - - blt a0, a0, .Ltmp3 +# CHECK-NEXT: - 1.00 - - - - - - bltu a0, a0, .Ltmp4 +# CHECK-NEXT: - 1.00 - - - - - - bge a0, a0, .Ltmp5 +# CHECK-NEXT: - 1.00 - - - - - - bgeu a0, a0, .Ltmp6 +# CHECK-NEXT: - 0.50 0.50 - - - - - add a0, a0, a0 +# CHECK-NEXT: - - - 0.50 0.50 - - - lb t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - lbu t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - lh t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - lhu t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - lw t0, 0(a0) +# 
CHECK-NEXT: - - - 0.50 0.50 - - - lwu t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - ld t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - sb t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - sh t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - sw t0, 0(a0) +# CHECK-NEXT: - - - 0.50 0.50 - - - sd t0, 0(a0) +# CHECK-NEXT: - 0.50 0.50 - - - - - mul a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - mulh a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - mulhu a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - mulhsu a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - mulw a0, a0, a0 +# CHECK-NEXT: - 20.00 - - - - - - div a0, a1, a2 +# CHECK-NEXT: - 20.00 - - - - - - divu a0, a1, a2 +# CHECK-NEXT: - 20.00 - - - - - - rem a0, a1, a2 +# CHECK-NEXT: - 20.00 - - - - - - remu a0, a1, a2 +# CHECK-NEXT: - 12.00 - - - - - - divw a0, a1, a2 +# CHECK-NEXT: - 12.00 - - - - - - divuw a0, a1, a2 +# CHECK-NEXT: - 12.00 - - - - - - remw a0, a1, a2 +# CHECK-NEXT: - 12.00 - - - - - - remuw a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrw t0, 4095, t1 +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrs s3, fflags, s5 +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrc sp, 0, ra +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrwi a5, 0, 0 +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrsi t2, 4095, 31 +# CHECK-NEXT: - 0.50 0.50 - - - - - csrrci t1, sscratch, 5 +# CHECK-NEXT: - 0.50 0.50 - - - - - czero.eqz a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - czero.nez a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - czero.eqz a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - czero.nez a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - add.uw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - slli.uw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh1add.uw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh2add.uw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh3add.uw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh1add a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh2add a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sh3add a0, a0, a0 +# 
CHECK-NEXT: - 0.50 0.50 - - - - - andn a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - orn a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - xnor a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - clz a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - clzw a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - ctz a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - ctzw a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - cpop a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - cpopw a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - min a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - minu a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - max a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - maxu a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sext.b a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - sext.h a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - zext.h a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - rol a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - rolw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - ror a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - rorw a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - rori a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - roriw a0, a0, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - orc.b a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - rev8 a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - clmul a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - clmulr a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - clmulh a0, a0, a0 +# CHECK-NEXT: - 0.50 0.50 - - - - - bclr a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - bclri a0, a1, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - bext a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - bexti a0, a1, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - binv a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - binvi a0, a1, 1 +# CHECK-NEXT: - 0.50 0.50 - - - - - bset a0, a1, a2 +# CHECK-NEXT: - 0.50 0.50 - - - - - bseti a0, a1, 1 diff --git a/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s new file mode 100644 index 
0000000000000..c7755dcc37658 --- /dev/null +++ b/llvm/test/tools/llvm-mca/RISCV/SpacemitX60/rvv-arithmetic.s @@ -0,0 +1,6820 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=riscv64 -mcpu=spacemit-x60 -iterations=1 -instruction-tables=full < %s | FileCheck %s + +# Basic arithmetic operations + +vsetvli x28, x0, e8, mf2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, mf4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, mf8, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m1, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m8, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, mf2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, mf4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m1, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m8, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e32, mf2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m1, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m8, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m1, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m2, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m4, tu, mu +vadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m8, tu, mu +vadd.vi v8, v8, 12 + +vsetvli x28, x0, e8, mf2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, 
tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vadd.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vadd.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vsub.vv v8, v8, v8 
+vsetvli x28, x0, e8, mf4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vsub.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vsub.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu 
+vsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vsub.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m1, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m8, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m1, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m8, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m1, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m8, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m1, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m2, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m4, tu, mu +vadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m8, tu, mu +vadc.vvm v8, v8, v8, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m1, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m4, tu, mu +vadc.vxm v8, 
v8, x30, v0 +vsetvli x28, x0, e8, m8, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m1, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m8, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m1, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m8, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m1, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m2, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m4, tu, mu +vadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m8, tu, mu +vadc.vxm v8, v8, x30, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m1, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m8, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m1, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m8, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m1, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, 
e32, m8, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m1, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m2, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m4, tu, mu +vadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m8, tu, mu +vadc.vim v8, v8, 12, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m1, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m8, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m1, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m8, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m1, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m8, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m1, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m2, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m4, tu, mu +vsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m8, tu, mu +vsbc.vvm v8, v8, v8, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m1, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m8, tu, mu +vsbc.vxm v8, v8, x30, v0 
+vsetvli x28, x0, e16, mf2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m1, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m8, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m1, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m8, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m1, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m2, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m4, tu, mu +vsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m8, tu, mu +vsbc.vxm v8, v8, x30, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwaddu.vv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwaddu.vv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwaddu.vx v8, 
v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwaddu.vx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwaddu.vx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwadd.vv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwadd.vv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu 
+vwadd.vx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwadd.vx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwadd.vx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwsubu.vv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwsubu.vv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwsubu.vx v8, v16, x30 
+vsetvli x28, x0, e32, m2, tu, mu +vwsubu.vx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwsubu.vx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwsub.vv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwsub.vv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwsub.vx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwsub.vx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vaaddu.vv v8, v8, v8 
+vsetvli x28, x0, e8, m1, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vaaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vaaddu.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, 
m4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vaaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vaaddu.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vaadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vaadd.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, 
mf4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vaadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vaadd.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vasubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vasubu.vv v8, v8, v8 + 
+vsetvli x28, x0, e8, mf2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vasubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vasubu.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, 
mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vasub.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vasub.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vasub.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vasub.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, m4, 
tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e64, m1, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vi v8, v8, 12 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vi v8, v8, 12 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli 
x28, x0, e64, m1, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vim v8, v8, 12, v0 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vim v8, v8, 12, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, 
e16, m1, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m1, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vvm v8, v8, v8, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vx v8, v8, x30 + 
+vsetvli x28, x0, e8, mf2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m1, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m8, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m1, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m8, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m1, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m8, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m1, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m2, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m4, tu, mu +vmadc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m8, tu, mu +vmadc.vxm v8, v8, x30, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vmsbc.vv v8, v8, 
v8 +vsetvli x28, x0, e16, m8, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vmsbc.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vmsbc.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m1, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e8, m8, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m1, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e16, m8, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m1, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e32, m8, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m1, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m2, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m4, tu, mu +vmsbc.vvm v8, v8, v8, v0 +vsetvli x28, x0, e64, m8, tu, mu +vmsbc.vvm v8, v8, v8, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli 
x28, x0, e8, mf8, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vmsbc.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vmsbc.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, mf8, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m1, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e8, m8, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, mf4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m1, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e16, m8, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, mf2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m1, tu, mu 
+vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e32, m8, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m1, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m2, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m4, tu, mu +vmsbc.vxm v8, v8, x30, v0 +vsetvli x28, x0, e64, m8, tu, mu +vmsbc.vxm v8, v8, x30, v0 + +vsetvli x28, x0, e8, mf2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, mf4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, mf8, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, m1, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, m2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, m4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e8, m8, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, mf2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, mf4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, m1, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, m2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, m4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e16, m8, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e32, mf2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e32, m1, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e32, m2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e32, m4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e32, m8, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e64, m1, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e64, m2, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e64, m4, tu, mu +vrsub.vi v8, v8, 12 +vsetvli x28, x0, e64, m8, tu, mu +vrsub.vi v8, v8, 12 + +vsetvli x28, x0, e8, mf2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vrsub.vx v8, v8, x30 
+vsetvli x28, x0, e8, m8, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vrsub.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vrsub.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, mf4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, mf8, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, m1, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, m2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, m4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e8, m8, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, mf2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, mf4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, m1, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, m2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, m4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e16, m8, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e32, mf2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e32, m1, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e32, m2, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e32, m4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e32, m8, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e64, m1, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e64, m2, tu, mu 
+vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e64, m4, tu, mu +vsaddu.vi v8, v8, 12 +vsetvli x28, x0, e64, m8, tu, mu +vsaddu.vi v8, v8, 12 + +vsetvli x28, x0, e8, mf2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vsaddu.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vsaddu.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vsaddu.vx v8, v8, x30 
+vsetvli x28, x0, e16, m4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vsaddu.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vsaddu.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, mf4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, mf8, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m1, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e8, m8, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, mf2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, mf4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m1, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e16, m8, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e32, mf2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m1, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e32, m8, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m1, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m2, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m4, tu, mu +vsadd.vi v8, v8, 12 +vsetvli x28, x0, e64, m8, tu, mu +vsadd.vi v8, v8, 12 + +vsetvli x28, x0, e8, mf2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vsadd.vv v8, 
v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vsadd.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vsadd.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vsadd.vx v8, v8, 
x30 +vsetvli x28, x0, e32, m8, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vsadd.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vsadd.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vssubu.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vssubu.vv v8, v8, v8 + +vsetvli x28, x0, e8, mf2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, 
tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vssubu.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vssubu.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, mf8, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, m1, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, m2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, m4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e8, m8, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, mf4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, m1, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, m2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, m4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e16, m8, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e32, mf2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e32, m1, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e32, m2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e32, m4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e32, m8, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e64, m1, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e64, m2, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e64, m4, tu, mu +vssub.vv v8, v8, v8 +vsetvli x28, x0, e64, m8, tu, mu +vssub.vv v8, v8, v8 + +vsetvli x28, 
x0, e8, mf2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, m1, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, m2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, m4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e8, m8, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, m1, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, m2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, m4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e16, m8, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e32, m1, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e32, m2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e32, m4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e32, m8, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e64, m1, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e64, m2, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e64, m4, tu, mu +vssub.vx v8, v8, x30 +vsetvli x28, x0, e64, m8, tu, mu +vssub.vx v8, v8, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e32, m2, 
tu, mu +vwaddu.wv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwaddu.wv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwaddu.wx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwaddu.wx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwadd.wv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwadd.wv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwadd.wx v8, v16, x30 +vsetvli 
x28, x0, e8, m1, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwadd.wx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwadd.wx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwsubu.wv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwsubu.wv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwsubu.wx 
v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwsubu.wx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwsubu.wx v8, v16, x30 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e8, mf4, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e8, mf8, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e8, m1, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e8, m2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e8, m4, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e16, mf2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e16, mf4, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e16, m1, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e16, m2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e16, m4, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e32, mf2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e32, m1, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e32, m2, tu, mu +vwsub.wv v8, v16, v24 +vsetvli x28, x0, e32, m4, tu, mu +vwsub.wv v8, v16, v24 + +vsetvli x28, x0, e8, mf2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e8, mf4, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e8, mf8, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e8, m1, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e8, m2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e8, m4, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e16, mf2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e16, mf4, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e16, m1, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e16, m2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e16, m4, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e32, mf2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e32, m1, tu, mu 
+vwsub.wx v8, v16, x30 +vsetvli x28, x0, e32, m2, tu, mu +vwsub.wx v8, v16, x30 +vsetvli x28, x0, e32, m4, tu, mu +vwsub.wx v8, v16, x30 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SMX60_FP:1 +# CHECK-NEXT: [1] - SMX60_IEU:2 SMX60_IEUA, SMX60_IEUB +# CHECK-NEXT: [2] - SMX60_IEUA:1 +# CHECK-NEXT: [3] - SMX60_IEUB:1 +# CHECK-NEXT: [4] - SMX60_LS:2 +# CHECK-NEXT: [5] - SMX60_VFP:1 +# CHECK-NEXT: [6] - SMX60_VIEU:1 +# CHECK-NEXT: [7] - SMX60_VLS:1 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) +# CHECK-NEXT: [7]: Bypass Latency +# CHECK-NEXT: [8]: Resources ( | [] | [, | [] | [, | [] | [, | [] | [, | [] | [, | [] | [, | [] | [, | [] | [, | [] | [, | [] | [, | [] | [, Date: Wed, 16 Jul 2025 12:11:15 -0700 Subject: [PATCH 091/813] [flang] Handle SEQUENCE derived types for array repacking. (#148777) It is possible that a non-polymorphic dummy argument has a dynamic type that does not match its static type in a valid Fortran program, e.g. when the actual and the dummy arguments have different compatible derived SEQUENCE types: module mod type t sequence integer x end type contains subroutine test(x) type t sequence integer x end type type(t) :: x(:) end subroutine end module 'test' may be called with an actual argument of type 'mod::t', which is the dynamic type of 'x' on entry to 'test'. If we create the repacking temporary based on the static type of 'x' ('test::t'), then the runtime will report the types mismatch as an error. Thus, we have to create the temporary using the dynamic type of 'x'. The fact that the dummy's type has SEQUENCE or BIND attribute is not easily computable at this stage, so we use the dynamic type for all derived type cases. As long as this is done only when the repacking actually happens, the overhead should not be noticeable. 
--- .../Optimizer/CodeGen/LowerRepackArrays.cpp | 56 +++++--- flang/test/Transforms/lower-repack-arrays.fir | 128 +++++++++--------- 2 files changed, 98 insertions(+), 86 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp index e34771c67b0c3..d2cf85bedd54c 100644 --- a/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp +++ b/flang/lib/Optimizer/CodeGen/LowerRepackArrays.cpp @@ -63,13 +63,14 @@ class PackArrayConversion : public mlir::OpRewritePattern { static constexpr llvm::StringRef bufferName = ".repacked"; // Return value of fir::BaseBoxType that represents a temporary - // array created for the original box with given extents and - // type parameters. The new box has the default lower bounds. - // If useStack is true, then the temporary will be allocated + // array created for the original box with given lbounds/extents and + // type parameters. The new box has the same shape as the original + // array. If useStack is true, then the temporary will be allocated // in stack memory (when possible). static mlir::Value allocateTempBuffer(fir::FirOpBuilder &builder, mlir::Location loc, bool useStack, mlir::Value origBox, + llvm::ArrayRef lbounds, llvm::ArrayRef extents, llvm::ArrayRef typeParams); @@ -99,7 +100,9 @@ class UnpackArrayConversion // the presence of the stack attribute does not automatically // mean that the allocation is actually done in stack memory. // For example, we always do the heap allocation for polymorphic -// types using Fortran runtime. +// types using Fortran runtime. Currently, we allocate all +// repack temporaries of derived types as polymorphic, +// so that we can preserve the dynamic type of the original. // Adding the polymorpic mold to fir.alloca and then using // Fortran runtime to compute the allocation size could probably // resolve this limitation. 
@@ -170,7 +173,8 @@ PackArrayConversion::matchAndRewrite(fir::PackArrayOp op, mlir::Value PackArrayConversion::allocateTempBuffer( fir::FirOpBuilder &builder, mlir::Location loc, bool useStack, - mlir::Value origBox, llvm::ArrayRef extents, + mlir::Value origBox, llvm::ArrayRef lbounds, + llvm::ArrayRef extents, llvm::ArrayRef typeParams) { auto tempType = mlir::cast( fir::extractSequenceType(origBox.getType())); @@ -191,16 +195,35 @@ mlir::Value PackArrayConversion::allocateTempBuffer( assert(!isHeapAllocation && "temp must have been allocated on the stack"); mlir::Type ptrType = base.getType(); - if (llvm::isa(ptrType)) - return base; + if (auto tempBoxType = mlir::dyn_cast(ptrType)) { + // We need to reset the CFI_attribute_allocatable before + // returning the temporary box to avoid any mishandling + // of the temporary box in Fortran runtime. + base = builder.create(loc, fir::boxMemRefType(tempBoxType), + base); + ptrType = base.getType(); + } - mlir::Type tempBoxType = fir::BoxType::get(mlir::isa(ptrType) - ? ptrType - : fir::unwrapRefType(ptrType)); + // Create the temporary using dynamic type of the original, + // if it is polymorphic, or it has a derived type with SEQUENCE + // or BIND attribute (such dummy arguments may have their dynamic + // type not exactly matching their static type). + // Note that for the latter case, the allocation can still be done + // without the mold, because the dynamic and static types + // must be storage compatible. + bool useDynamicType = fir::isBoxedRecordType(origBox.getType()) || + fir::isPolymorphicType(origBox.getType()); + mlir::Type tempBoxType = + fir::wrapInClassOrBoxType(fir::unwrapRefType(ptrType), + /*isPolymorphic=*/useDynamicType); + // Use the shape with proper lower bounds for the final box. 
+ shape = builder.genShape(loc, lbounds, extents); mlir::Value newBox = builder.createBox(loc, tempBoxType, base, shape, /*slice=*/nullptr, - typeParams, /*tdesc=*/nullptr); - return newBox; + typeParams, useDynamicType ? origBox : nullptr); + // The new box might be !fir.class, while the original might be + // !fir.box - we have to add a conversion. + return builder.createConvert(loc, origBox.getType(), newBox); } mlir::FailureOr @@ -280,16 +303,11 @@ PackArrayConversion::genRepackedBox(fir::FirOpBuilder &builder, << op.getOperation() << '\n'; } - mlir::Value tempBox = - allocateTempBuffer(builder, loc, op.getStack(), box, extents, typeParams); + mlir::Value tempBox = allocateTempBuffer(builder, loc, op.getStack(), box, + lbounds, extents, typeParams); if (!op.getNoCopy()) fir::runtime::genShallowCopy(builder, loc, tempBox, box, /*resultIsAllocated=*/true); - - // Set lower bounds after the original box. - mlir::Value shift = builder.genShift(loc, lbounds); - tempBox = builder.create(loc, boxType, tempBox, shift, - /*slice=*/nullptr); builder.create(loc, tempBox); return ifOp.getResult(0); diff --git a/flang/test/Transforms/lower-repack-arrays.fir b/flang/test/Transforms/lower-repack-arrays.fir index 458869cce45fd..9232a74f224d3 100644 --- a/flang/test/Transforms/lower-repack-arrays.fir +++ b/flang/test/Transforms/lower-repack-arrays.fir @@ -28,15 +28,14 @@ func.func @_QPtest1(%arg0: !fir.box> {fir.bindc_name = "x"}) // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] = fir.allocmem !fir.array, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> -// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, 
%[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_26]] : !fir.box> +// CHECK: fir.result %[[VAL_20]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -93,15 +92,14 @@ func.func @_QPtest1_whole(%arg0: !fir.box> {fir.bindc_name = // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] = fir.alloca !fir.array, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.ref>, !fir.shape<2>) -> !fir.ref> -// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) : (!fir.ref>, !fir.shapeshift<2>) -> !fir.box> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>) -> !fir.box // CHECK: 
%[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_26]] : !fir.box> +// CHECK: fir.result %[[VAL_20]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -156,15 +154,14 @@ func.func @_QPtest1_in(%arg0: !fir.box> {fir.bindc_name = "x // CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_14]]#1, %[[VAL_15]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_17:.*]] = fir.allocmem !fir.array, %[[VAL_14]]#1, %[[VAL_15]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_18:.*]] = fir.declare %[[VAL_17]](%[[VAL_16]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> -// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_16]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_24:.*]] = fir.shape_shift %[[VAL_14]]#0, %[[VAL_14]]#1, %[[VAL_15]]#0, %[[VAL_15]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_24]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> // CHECK: %[[VAL_20:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_19]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_0]] : (!fir.box>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_20]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_21]], %[[VAL_22]], %[[VAL_23]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: 
%[[VAL_24:.*]] = fir.shift %[[VAL_14]]#0, %[[VAL_15]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_25:.*]] = fir.rebox %[[VAL_19]](%[[VAL_24]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_25]] : !fir.box> +// CHECK: fir.result %[[VAL_19]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -215,10 +212,9 @@ func.func @_QPtest1_out(%arg0: !fir.box> {fir.bindc_name = " // CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_14]]#1, %[[VAL_15]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_17:.*]] = fir.allocmem !fir.array, %[[VAL_14]]#1, %[[VAL_15]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_18:.*]] = fir.declare %[[VAL_17]](%[[VAL_16]]) {uniq_name = ".repacked"} : (!fir.heap>, !fir.shape<2>) -> !fir.heap> -// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_16]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> -// CHECK: %[[VAL_20:.*]] = fir.shift %[[VAL_14]]#0, %[[VAL_15]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_21:.*]] = fir.rebox %[[VAL_19]](%[[VAL_20]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box> -// CHECK: fir.result %[[VAL_21]] : !fir.box> +// CHECK: %[[VAL_20:.*]] = fir.shape_shift %[[VAL_14]]#0, %[[VAL_14]]#1, %[[VAL_15]]#0, %[[VAL_15]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_19:.*]] = fir.embox %[[VAL_18]](%[[VAL_20]]) : (!fir.heap>, !fir.shapeshift<2>) -> !fir.box> +// CHECK: fir.result %[[VAL_19]] : !fir.box> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box> // CHECK: } @@ -286,15 +282,14 @@ func.func @_QPtest2(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.box // CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_21]]#1, %[[VAL_22]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_24:.*]] = fir.allocmem !fir.array>(%[[VAL_12]] : i32), %[[VAL_21]]#1, %[[VAL_22]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_25:.*]] = fir.declare %[[VAL_24]](%[[VAL_23]]) typeparams %[[VAL_12]] {uniq_name = 
".repacked"} : (!fir.heap>>, !fir.shape<2>, i32) -> !fir.heap>> -// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) typeparams %[[VAL_12]] : (!fir.heap>>, !fir.shape<2>, i32) -> !fir.box>>> +// CHECK: %[[VAL_31:.*]] = fir.shape_shift %[[VAL_21]]#0, %[[VAL_21]]#1, %[[VAL_22]]#0, %[[VAL_22]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_31]]) typeparams %[[VAL_12]] : (!fir.heap>>, !fir.shapeshift<2>, i32) -> !fir.box>> // CHECK: %[[VAL_27:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_3]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_31:.*]] = fir.shift %[[VAL_21]]#0, %[[VAL_22]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_32:.*]] = fir.rebox %[[VAL_26]](%[[VAL_31]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_32]] : !fir.box>> +// CHECK: fir.result %[[VAL_26]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_1]] : !fir.box>> // CHECK: } @@ -362,15 +357,14 @@ func.func @_QPtest2_stack(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !f // CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_21]]#1, %[[VAL_22]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_24:.*]] = fir.alloca !fir.array>(%[[VAL_12]] : i32), %[[VAL_21]]#1, %[[VAL_22]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_25:.*]] = fir.declare %[[VAL_24]](%[[VAL_23]]) typeparams %[[VAL_12]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, i32) -> !fir.ref>> -// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) typeparams %[[VAL_12]] : (!fir.ref>>, !fir.shape<2>, i32) -> 
!fir.box>> +// CHECK: %[[VAL_31:.*]] = fir.shape_shift %[[VAL_21]]#0, %[[VAL_21]]#1, %[[VAL_22]]#0, %[[VAL_22]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_31]]) typeparams %[[VAL_12]] : (!fir.ref>>, !fir.shapeshift<2>, i32) -> !fir.box>> // CHECK: %[[VAL_27:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_26]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_1]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_27]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_28]], %[[VAL_29]], %[[VAL_30]], %[[VAL_3]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_31:.*]] = fir.shift %[[VAL_21]]#0, %[[VAL_22]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_32:.*]] = fir.rebox %[[VAL_26]](%[[VAL_31]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_32]] : !fir.box>> +// CHECK: fir.result %[[VAL_26]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_1]] : !fir.box>> // CHECK: } @@ -427,15 +421,14 @@ func.func @_QPtest3(%arg0: !fir.box>> {fir.bindc_n // CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.allocmem !fir.array>(%[[VAL_17]] : index), %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_17]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, index) -> !fir.heap>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) typeparams %[[VAL_17]] : (!fir.heap>>, !fir.shape<2>, index) -> !fir.box>>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) typeparams %[[VAL_17]] : (!fir.heap>>, 
!fir.shapeshift<2>, index) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -493,15 +486,14 @@ func.func @_QPtest3_stack(%arg0: !fir.box>> {fir.b // CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.alloca !fir.array>(%[[VAL_17]] : index), %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_17]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, index) -> !fir.ref>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) typeparams %[[VAL_17]] : (!fir.ref>>, !fir.shape<2>, index) -> !fir.box>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) typeparams %[[VAL_17]] : (!fir.ref>>, !fir.shapeshift<2>, index) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = 
fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -559,15 +551,14 @@ func.func @_QPtest4(%arg0: !fir.box>> {fir.bindc_ // CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_16]]#1, %[[VAL_17]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.allocmem !fir.array>, %[[VAL_16]]#1, %[[VAL_17]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_6]] {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>, index) -> !fir.heap>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.box>>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_16]]#0, %[[VAL_16]]#1, %[[VAL_17]]#0, %[[VAL_17]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) : (!fir.heap>>, !fir.shapeshift<2>) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () 
-// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_16]]#0, %[[VAL_17]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -626,15 +617,14 @@ func.func @_QPtest4_stack(%arg0: !fir.box>> {fir. // CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_16]]#1, %[[VAL_17]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_19:.*]] = fir.alloca !fir.array>, %[[VAL_16]]#1, %[[VAL_17]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_20:.*]] = fir.declare %[[VAL_19]](%[[VAL_18]]) typeparams %[[VAL_6]] {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>, index) -> !fir.ref>> -// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_18]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_26:.*]] = fir.shape_shift %[[VAL_16]]#0, %[[VAL_16]]#1, %[[VAL_17]]#0, %[[VAL_17]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_26]]) : (!fir.ref>>, !fir.shapeshift<2>) -> !fir.box>> // CHECK: %[[VAL_22:.*]] = fir.address_of(@{{_QQcl.*}} // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_22]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_23]], %[[VAL_24]], %[[VAL_25]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_26:.*]] = fir.shift %[[VAL_16]]#0, %[[VAL_17]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_27:.*]] = fir.rebox %[[VAL_21]](%[[VAL_26]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_27]] : !fir.box>> +// CHECK: fir.result %[[VAL_21]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // 
CHECK: } @@ -690,15 +680,15 @@ func.func @_QPtest5(%arg0: !fir.box>> {fir.bind // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: %[[VAL_18:.*]] = fir.allocmem !fir.array>, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked", uniq_name = ""} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.heap>>, !fir.shape<2>) -> !fir.heap>> -// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.heap>>, !fir.shape<2>) -> !fir.box>>> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) source_box %[[VAL_0]] : (!fir.heap>>, !fir.shapeshift<2>, !fir.box>>) -> !fir.class>> +// CHECK: %[[BOX:.*]] = fir.convert %[[VAL_20]] : (!fir.class>>) -> !fir.box>> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>>) -> !fir.box +// CHECK: %[[VAL_22:.*]] = fir.convert %[[BOX]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_26]] : !fir.box>> +// CHECK: fir.result %[[BOX]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -755,15 +745,15 @@ func.func @_QPtest5_stack(%arg0: !fir.box>> {fi // CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_15]]#1, %[[VAL_16]]#1 : (index, index) -> !fir.shape<2> // CHECK: 
%[[VAL_18:.*]] = fir.alloca !fir.array>, %[[VAL_15]]#1, %[[VAL_16]]#1 {bindc_name = ".repacked"} // CHECK: %[[VAL_19:.*]] = fir.declare %[[VAL_18]](%[[VAL_17]]) {uniq_name = ".repacked"} : (!fir.ref>>, !fir.shape<2>) -> !fir.ref>> -// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_17]]) : (!fir.ref>>, !fir.shape<2>) -> !fir.box>> +// CHECK: %[[VAL_25:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_19]](%[[VAL_25]]) source_box %[[VAL_0]] : (!fir.ref>>, !fir.shapeshift<2>, !fir.box>>) -> !fir.class>> +// CHECK: %[[BOX:.*]] = fir.convert %[[VAL_20]] : (!fir.class>>) -> !fir.box>> // CHECK: %[[VAL_21:.*]] = fir.address_of(@{{_QQcl.*}} -// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_20]] : (!fir.box>>) -> !fir.box +// CHECK: %[[VAL_22:.*]] = fir.convert %[[BOX]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_0]] : (!fir.box>>) -> !fir.box // CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_21]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[VAL_2]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_25:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_26:.*]] = fir.rebox %[[VAL_20]](%[[VAL_25]]) : (!fir.box>>, !fir.shift<2>) -> !fir.box>> -// CHECK: fir.result %[[VAL_26]] : !fir.box>> +// CHECK: fir.result %[[BOX]] : !fir.box>> // CHECK: } else { // CHECK: fir.result %[[VAL_0]] : !fir.box>> // CHECK: } @@ -830,13 +820,14 @@ func.func @_QPtest6(%arg0: !fir.class>> {fir.bi // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name 
= ".repacked"} : (!fir.class>>>) -> !fir.class>>> +// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>>) -> !fir.heap>> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>>, !fir.shapeshift<2>, !fir.class>>) -> !fir.class>> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>>, !fir.shift<2>) -> !fir.class>> // CHECK: fir.result %[[VAL_34]] : !fir.class>> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class>> @@ -906,13 +897,14 @@ func.func @_QPtest6_stack(%arg0: !fir.class>> { // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>>) -> !fir.class>>> +// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>>) -> !fir.heap>> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: 
%[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>>, !fir.shapeshift<2>, !fir.class>>) -> !fir.class>> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>>, !fir.shift<2>) -> !fir.class>> // CHECK: fir.result %[[VAL_34]] : !fir.class>> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class>> @@ -981,13 +973,14 @@ func.func @_QPtest7(%arg0: !fir.class> {fir.bindc_name = "x // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>) -> !fir.class>> +// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>) -> !fir.heap> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>, !fir.shapeshift<2>, !fir.class>) -> !fir.class> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>) -> !fir.box +// CHECK: 
%[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], %[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>, !fir.shift<2>) -> !fir.class> // CHECK: fir.result %[[VAL_34]] : !fir.class> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class> @@ -1057,13 +1050,14 @@ func.func @_QPtest7_stack(%arg0: !fir.class> {fir.bindc_nam // CHECK: %[[VAL_26:.*]] = fir.call @_FortranAAllocatableAllocate(%[[VAL_23]], %[[VAL_24]], %[[VAL_4]], %[[VAL_22]], %[[VAL_25]], %[[VAL_1]]) : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 // CHECK: %[[VAL_27:.*]] = fir.load %[[VAL_5]] : !fir.ref>>> // CHECK: %[[VAL_28:.*]] = fir.declare %[[VAL_27]] {uniq_name = ".repacked"} : (!fir.class>>) -> !fir.class>> +// CHECK: %[[ADDR:.*]] = fir.box_addr %[[VAL_28]] : (!fir.class>>) -> !fir.heap> +// CHECK: %[[VAL_33:.*]] = fir.shape_shift %[[VAL_15]]#0, %[[VAL_15]]#1, %[[VAL_16]]#0, %[[VAL_16]]#1 : (index, index, index, index) -> !fir.shapeshift<2> +// CHECK: %[[VAL_34:.*]] = fir.embox %[[ADDR]](%[[VAL_33]]) source_box %[[ARG0]] : (!fir.heap>, !fir.shapeshift<2>, !fir.class>) -> !fir.class> // CHECK: %[[VAL_29:.*]] = fir.address_of(@{{_QQcl.*}}) : !fir.ref> -// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_28]] : (!fir.class>>) -> !fir.box +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_34]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_31:.*]] = fir.convert %[[ARG0]] : (!fir.class>) -> !fir.box // CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_29]] : (!fir.ref>) -> !fir.ref // CHECK: fir.call @_FortranAShallowCopyDirect(%[[VAL_30]], %[[VAL_31]], %[[VAL_32]], 
%[[VAL_1]]) : (!fir.box, !fir.box, !fir.ref, i32) -> () -// CHECK: %[[VAL_33:.*]] = fir.shift %[[VAL_15]]#0, %[[VAL_16]]#0 : (index, index) -> !fir.shift<2> -// CHECK: %[[VAL_34:.*]] = fir.rebox %[[VAL_28]](%[[VAL_33]]) : (!fir.class>>, !fir.shift<2>) -> !fir.class> // CHECK: fir.result %[[VAL_34]] : !fir.class> // CHECK: } else { // CHECK: fir.result %[[ARG0]] : !fir.class> From fb3972dd06cbc1b0a5305f81b0c2d74e44dbea41 Mon Sep 17 00:00:00 2001 From: Daniel Bertalan Date: Wed, 16 Jul 2025 15:06:09 +0200 Subject: [PATCH 092/813] [lld-macho] Move Linker Optimization Hints pass to a separate file Moving it away from the arm64 `TargetInfo` class will let us enable it more easily for arm64_32 and the soon-to-be-added arm64e target as well. This is the NFC part of #148964 --- lld/MachO/Arch/ARM64.cpp | 509 -------------------- lld/MachO/CMakeLists.txt | 1 + lld/MachO/LinkerOptimizationHints.cpp | 526 +++++++++++++++++++++ lld/MachO/LinkerOptimizationHints.h | 17 + lld/MachO/Target.h | 2 - lld/MachO/Writer.cpp | 3 +- llvm/utils/gn/secondary/lld/MachO/BUILD.gn | 1 + 7 files changed, 547 insertions(+), 512 deletions(-) create mode 100644 lld/MachO/LinkerOptimizationHints.cpp create mode 100644 lld/MachO/LinkerOptimizationHints.h diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp index 2fe96b26bfb55..04da702b48764 100644 --- a/lld/MachO/Arch/ARM64.cpp +++ b/lld/MachO/Arch/ARM64.cpp @@ -14,15 +14,10 @@ #include "lld/Common/ErrorHandler.h" #include "mach-o/compact_unwind_encoding.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/BinaryFormat/MachO.h" -#include "llvm/Support/Endian.h" -#include "llvm/Support/LEB128.h" -#include "llvm/Support/MathExtras.h" using namespace llvm; using namespace llvm::MachO; -using namespace llvm::support::endian; using namespace lld; using namespace lld::macho; @@ -39,7 +34,6 @@ struct ARM64 : ARM64Common { uint64_t &stubOffset, uint64_t selrefVA, Symbol *objcMsgSend) const override; void populateThunk(InputSection *thunk, Symbol 
*funcSym) override; - void applyOptimizationHints(uint8_t *, const ObjFile &) const override; void initICFSafeThunkBody(InputSection *thunk, Symbol *targetSym) const override; @@ -236,509 +230,6 @@ ARM64::ARM64() : ARM64Common(LP64()) { relocAttrs = {relocAttrsArray.data(), relocAttrsArray.size()}; } -namespace { -struct Adrp { - uint32_t destRegister; - int64_t addend; -}; - -struct Add { - uint8_t destRegister; - uint8_t srcRegister; - uint32_t addend; -}; - -enum ExtendType { ZeroExtend = 1, Sign64 = 2, Sign32 = 3 }; - -struct Ldr { - uint8_t destRegister; - uint8_t baseRegister; - uint8_t p2Size; - bool isFloat; - ExtendType extendType; - int64_t offset; -}; -} // namespace - -static bool parseAdrp(uint32_t insn, Adrp &adrp) { - if ((insn & 0x9f000000) != 0x90000000) - return false; - adrp.destRegister = insn & 0x1f; - uint64_t immHi = (insn >> 5) & 0x7ffff; - uint64_t immLo = (insn >> 29) & 0x3; - adrp.addend = SignExtend64<21>(immLo | (immHi << 2)) * 4096; - return true; -} - -static bool parseAdd(uint32_t insn, Add &add) { - if ((insn & 0xffc00000) != 0x91000000) - return false; - add.destRegister = insn & 0x1f; - add.srcRegister = (insn >> 5) & 0x1f; - add.addend = (insn >> 10) & 0xfff; - return true; -} - -static bool parseLdr(uint32_t insn, Ldr &ldr) { - ldr.destRegister = insn & 0x1f; - ldr.baseRegister = (insn >> 5) & 0x1f; - uint8_t size = insn >> 30; - uint8_t opc = (insn >> 22) & 3; - - if ((insn & 0x3fc00000) == 0x39400000) { - // LDR (immediate), LDRB (immediate), LDRH (immediate) - ldr.p2Size = size; - ldr.extendType = ZeroExtend; - ldr.isFloat = false; - } else if ((insn & 0x3f800000) == 0x39800000) { - // LDRSB (immediate), LDRSH (immediate), LDRSW (immediate) - ldr.p2Size = size; - ldr.extendType = static_cast(opc); - ldr.isFloat = false; - } else if ((insn & 0x3f400000) == 0x3d400000) { - // LDR (immediate, SIMD&FP) - ldr.extendType = ZeroExtend; - ldr.isFloat = true; - if (opc == 1) - ldr.p2Size = size; - else if (size == 0 && opc == 3) - 
ldr.p2Size = 4; - else - return false; - } else { - return false; - } - ldr.offset = ((insn >> 10) & 0xfff) << ldr.p2Size; - return true; -} - -static bool isValidAdrOffset(int32_t delta) { return isInt<21>(delta); } - -static void writeAdr(void *loc, uint32_t dest, int32_t delta) { - assert(isValidAdrOffset(delta)); - uint32_t opcode = 0x10000000; - uint32_t immHi = (delta & 0x001ffffc) << 3; - uint32_t immLo = (delta & 0x00000003) << 29; - write32le(loc, opcode | immHi | immLo | dest); -} - -static void writeNop(void *loc) { write32le(loc, 0xd503201f); } - -static bool isLiteralLdrEligible(const Ldr &ldr) { - return ldr.p2Size > 1 && isShiftedInt<19, 2>(ldr.offset); -} - -static void writeLiteralLdr(void *loc, const Ldr &ldr) { - assert(isLiteralLdrEligible(ldr)); - uint32_t imm19 = (ldr.offset / 4 & maskTrailingOnes(19)) << 5; - uint32_t opcode; - switch (ldr.p2Size) { - case 2: - if (ldr.isFloat) - opcode = 0x1c000000; - else - opcode = ldr.extendType == Sign64 ? 0x98000000 : 0x18000000; - break; - case 3: - opcode = ldr.isFloat ? 0x5c000000 : 0x58000000; - break; - case 4: - opcode = 0x9c000000; - break; - default: - llvm_unreachable("Invalid literal ldr size"); - } - write32le(loc, opcode | imm19 | ldr.destRegister); -} - -static bool isImmediateLdrEligible(const Ldr &ldr) { - // Note: We deviate from ld64's behavior, which converts to immediate loads - // only if ldr.offset < 4096, even though the offset is divided by the load's - // size in the 12-bit immediate operand. Only the unsigned offset variant is - // supported. 
- - uint32_t size = 1 << ldr.p2Size; - return ldr.offset >= 0 && (ldr.offset % size) == 0 && - isUInt<12>(ldr.offset >> ldr.p2Size); -} - -static void writeImmediateLdr(void *loc, const Ldr &ldr) { - assert(isImmediateLdrEligible(ldr)); - uint32_t opcode = 0x39000000; - if (ldr.isFloat) { - opcode |= 0x04000000; - assert(ldr.extendType == ZeroExtend); - } - opcode |= ldr.destRegister; - opcode |= ldr.baseRegister << 5; - uint8_t size, opc; - if (ldr.p2Size == 4) { - size = 0; - opc = 3; - } else { - opc = ldr.extendType; - size = ldr.p2Size; - } - uint32_t immBits = ldr.offset >> ldr.p2Size; - write32le(loc, opcode | (immBits << 10) | (opc << 22) | (size << 30)); -} - -// Transforms a pair of adrp+add instructions into an adr instruction if the -// target is within the +/- 1 MiB range allowed by the adr's 21 bit signed -// immediate offset. -// -// adrp xN, _foo@PAGE -// add xM, xN, _foo@PAGEOFF -// -> -// adr xM, _foo -// nop -static bool applyAdrpAdd(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2) { - uint32_t ins1 = read32le(buf + offset1); - uint32_t ins2 = read32le(buf + offset2); - Adrp adrp; - Add add; - if (!parseAdrp(ins1, adrp) || !parseAdd(ins2, add)) - return false; - if (adrp.destRegister != add.srcRegister) - return false; - - uint64_t addr1 = isec->getVA() + offset1; - uint64_t referent = pageBits(addr1) + adrp.addend + add.addend; - int64_t delta = referent - addr1; - if (!isValidAdrOffset(delta)) - return false; - - writeAdr(buf + offset1, add.destRegister, delta); - writeNop(buf + offset2); - return true; -} - -// Transforms two adrp instructions into a single adrp if their referent -// addresses are located on the same 4096 byte page. 
-// -// adrp xN, _foo@PAGE -// adrp xN, _bar@PAGE -// -> -// adrp xN, _foo@PAGE -// nop -static void applyAdrpAdrp(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2) { - uint32_t ins1 = read32le(buf + offset1); - uint32_t ins2 = read32le(buf + offset2); - Adrp adrp1, adrp2; - if (!parseAdrp(ins1, adrp1) || !parseAdrp(ins2, adrp2)) - return; - if (adrp1.destRegister != adrp2.destRegister) - return; - - uint64_t page1 = pageBits(offset1 + isec->getVA()) + adrp1.addend; - uint64_t page2 = pageBits(offset2 + isec->getVA()) + adrp2.addend; - if (page1 != page2) - return; - - writeNop(buf + offset2); -} - -// Transforms a pair of adrp+ldr (immediate) instructions into an ldr (literal) -// load from a PC-relative address if it is 4-byte aligned and within +/- 1 MiB, -// as ldr can encode a signed 19-bit offset that gets multiplied by 4. -// -// adrp xN, _foo@PAGE -// ldr xM, [xN, _foo@PAGEOFF] -// -> -// nop -// ldr xM, _foo -static void applyAdrpLdr(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2) { - uint32_t ins1 = read32le(buf + offset1); - uint32_t ins2 = read32le(buf + offset2); - Adrp adrp; - Ldr ldr; - if (!parseAdrp(ins1, adrp) || !parseLdr(ins2, ldr)) - return; - if (adrp.destRegister != ldr.baseRegister) - return; - - uint64_t addr1 = isec->getVA() + offset1; - uint64_t addr2 = isec->getVA() + offset2; - uint64_t referent = pageBits(addr1) + adrp.addend + ldr.offset; - ldr.offset = referent - addr2; - if (!isLiteralLdrEligible(ldr)) - return; - - writeNop(buf + offset1); - writeLiteralLdr(buf + offset2, ldr); -} - -// GOT loads are emitted by the compiler as a pair of adrp and ldr instructions, -// but they may be changed to adrp+add by relaxGotLoad(). This hint performs -// the AdrpLdr or AdrpAdd transformation depending on whether it was relaxed. 
-static void applyAdrpLdrGot(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2) { - uint32_t ins2 = read32le(buf + offset2); - Add add; - Ldr ldr; - if (parseAdd(ins2, add)) - applyAdrpAdd(buf, isec, offset1, offset2); - else if (parseLdr(ins2, ldr)) - applyAdrpLdr(buf, isec, offset1, offset2); -} - -// Optimizes an adrp+add+ldr sequence used for loading from a local symbol's -// address by loading directly if it's close enough, or to an adrp(p)+ldr -// sequence if it's not. -// -// adrp x0, _foo@PAGE -// add x1, x0, _foo@PAGEOFF -// ldr x2, [x1, #off] -static void applyAdrpAddLdr(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2, - uint64_t offset3) { - uint32_t ins1 = read32le(buf + offset1); - uint32_t ins2 = read32le(buf + offset2); - uint32_t ins3 = read32le(buf + offset3); - Adrp adrp; - Add add; - Ldr ldr; - if (!parseAdrp(ins1, adrp) || !parseAdd(ins2, add) || !parseLdr(ins3, ldr)) - return; - if (adrp.destRegister != add.srcRegister) - return; - if (add.destRegister != ldr.baseRegister) - return; - - // Load from the target address directly. - // nop - // nop - // ldr x2, [_foo + #off] - uint64_t addr1 = isec->getVA() + offset1; - uint64_t addr3 = isec->getVA() + offset3; - uint64_t referent = pageBits(addr1) + adrp.addend + add.addend; - Ldr literalLdr = ldr; - literalLdr.offset += referent - addr3; - if (isLiteralLdrEligible(literalLdr)) { - writeNop(buf + offset1); - writeNop(buf + offset2); - writeLiteralLdr(buf + offset3, literalLdr); - return; - } - - if (applyAdrpAdd(buf, isec, offset1, offset2)) - return; - - // Move the target's page offset into the ldr's immediate offset. 
- // adrp x0, _foo@PAGE - // nop - // ldr x2, [x0, _foo@PAGEOFF + #off] - Ldr immediateLdr = ldr; - immediateLdr.baseRegister = adrp.destRegister; - immediateLdr.offset += add.addend; - if (isImmediateLdrEligible(immediateLdr)) { - writeNop(buf + offset2); - writeImmediateLdr(buf + offset3, immediateLdr); - return; - } -} - -// Relaxes a GOT-indirect load. -// If the referenced symbol is external and its GOT entry is within +/- 1 MiB, -// the GOT entry can be loaded with a single literal ldr instruction. -// If the referenced symbol is local and thus has been relaxed to adrp+add+ldr, -// we perform the AdrpAddLdr transformation. -static void applyAdrpLdrGotLdr(uint8_t *buf, const ConcatInputSection *isec, - uint64_t offset1, uint64_t offset2, - uint64_t offset3) { - uint32_t ins2 = read32le(buf + offset2); - Add add; - Ldr ldr2; - - if (parseAdd(ins2, add)) { - applyAdrpAddLdr(buf, isec, offset1, offset2, offset3); - } else if (parseLdr(ins2, ldr2)) { - // adrp x1, _foo@GOTPAGE - // ldr x2, [x1, _foo@GOTPAGEOFF] - // ldr x3, [x2, #off] - uint32_t ins3 = read32le(buf + offset3); - Ldr ldr3; - if (!parseLdr(ins3, ldr3)) - return; - if (ldr3.baseRegister != ldr2.destRegister) - return; - // Loads from the GOT must be pointer sized. - if (ldr2.p2Size != 3 || ldr2.isFloat) - return; - applyAdrpLdr(buf, isec, offset1, offset2); - } -} - -template -static void forEachHint(ArrayRef data, Callback callback) { - std::array args; - - auto readNext = [&]() -> uint64_t { - unsigned int n = 0; - uint64_t value = decodeULEB128(data.data(), &n, data.end()); - data = data.drop_front(n); - return value; - }; - - while (!data.empty()) { - uint64_t type = readNext(); - if (type == 0) - break; - - uint64_t argCount = readNext(); - for (unsigned i = 0; i < argCount; ++i) { - uint64_t arg = readNext(); - if (i < 3) - args[i] = arg; - } - // All known LOH types as of 2022-09 have 3 or fewer arguments; skip others. 
- if (argCount > 3) - continue; - callback(type, ArrayRef(args.data(), argCount)); - } -} - -// On RISC architectures like arm64, materializing a memory address generally -// takes multiple instructions. If the referenced symbol is located close enough -// in memory, fewer instructions are needed. -// -// Linker optimization hints record where addresses are computed. After -// addresses have been assigned, if possible, we change them to a shorter -// sequence of instructions. The size of the binary is not modified; the -// eliminated instructions are replaced with NOPs. This still leads to faster -// code as the CPU can skip over NOPs quickly. -// -// LOHs are specified by the LC_LINKER_OPTIMIZATION_HINTS load command, which -// points to a sequence of ULEB128-encoded numbers. Each entry specifies a -// transformation kind, and 2 or 3 addresses where the instructions are located. -void ARM64::applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj) const { - ArrayRef data = obj.getOptimizationHints(); - if (data.empty()) - return; - - const ConcatInputSection *section = nullptr; - uint64_t sectionAddr = 0; - uint8_t *buf = nullptr; - - auto findSection = [&](uint64_t addr) { - if (section && addr >= sectionAddr && - addr < sectionAddr + section->getSize()) - return true; - - if (obj.sections.empty()) - return false; - auto secIt = std::prev(llvm::upper_bound( - obj.sections, addr, - [](uint64_t off, const Section *sec) { return off < sec->addr; })); - const Section *sec = *secIt; - - if (sec->subsections.empty()) - return false; - auto subsecIt = std::prev(llvm::upper_bound( - sec->subsections, addr - sec->addr, - [](uint64_t off, Subsection subsec) { return off < subsec.offset; })); - const Subsection &subsec = *subsecIt; - const ConcatInputSection *isec = - dyn_cast_or_null(subsec.isec); - if (!isec || isec->shouldOmitFromOutput()) - return false; - - section = isec; - sectionAddr = subsec.offset + sec->addr; - buf = outBuf + section->outSecOff + 
section->parent->fileOff; - return true; - }; - - auto isValidOffset = [&](uint64_t offset) { - if (offset < sectionAddr || offset >= sectionAddr + section->getSize()) { - error(toString(&obj) + - ": linker optimization hint spans multiple sections"); - return false; - } - return true; - }; - - bool hasAdrpAdrp = false; - forEachHint(data, [&](uint64_t kind, ArrayRef args) { - if (kind == LOH_ARM64_ADRP_ADRP) { - hasAdrpAdrp = true; - return; - } - - if (!findSection(args[0])) - return; - switch (kind) { - case LOH_ARM64_ADRP_ADD: - if (isValidOffset(args[1])) - applyAdrpAdd(buf, section, args[0] - sectionAddr, - args[1] - sectionAddr); - break; - case LOH_ARM64_ADRP_LDR: - if (isValidOffset(args[1])) - applyAdrpLdr(buf, section, args[0] - sectionAddr, - args[1] - sectionAddr); - break; - case LOH_ARM64_ADRP_LDR_GOT: - if (isValidOffset(args[1])) - applyAdrpLdrGot(buf, section, args[0] - sectionAddr, - args[1] - sectionAddr); - break; - case LOH_ARM64_ADRP_ADD_LDR: - if (isValidOffset(args[1]) && isValidOffset(args[2])) - applyAdrpAddLdr(buf, section, args[0] - sectionAddr, - args[1] - sectionAddr, args[2] - sectionAddr); - break; - case LOH_ARM64_ADRP_LDR_GOT_LDR: - if (isValidOffset(args[1]) && isValidOffset(args[2])) - applyAdrpLdrGotLdr(buf, section, args[0] - sectionAddr, - args[1] - sectionAddr, args[2] - sectionAddr); - break; - case LOH_ARM64_ADRP_ADD_STR: - case LOH_ARM64_ADRP_LDR_GOT_STR: - // TODO: Implement these - break; - } - }); - - if (!hasAdrpAdrp) - return; - - // AdrpAdrp optimization hints are performed in a second pass because they - // might interfere with other transformations. 
For instance, consider the - // following input: - // - // adrp x0, _foo@PAGE - // add x1, x0, _foo@PAGEOFF - // adrp x0, _bar@PAGE - // add x2, x0, _bar@PAGEOFF - // - // If we perform the AdrpAdrp relaxation first, we get: - // - // adrp x0, _foo@PAGE - // add x1, x0, _foo@PAGEOFF - // nop - // add x2, x0, _bar@PAGEOFF - // - // If we then apply AdrpAdd to the first two instructions, the add will have a - // garbage value in x0: - // - // adr x1, _foo - // nop - // nop - // add x2, x0, _bar@PAGEOFF - forEachHint(data, [&](uint64_t kind, ArrayRef args) { - if (kind != LOH_ARM64_ADRP_ADRP) - return; - if (!findSection(args[0])) - return; - if (isValidOffset(args[1])) - applyAdrpAdrp(buf, section, args[0] - sectionAddr, args[1] - sectionAddr); - }); -} - TargetInfo *macho::createARM64TargetInfo() { static ARM64 t; return &t; diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt index ecf6ce609e59f..3cd94ced75cc0 100644 --- a/lld/MachO/CMakeLists.txt +++ b/lld/MachO/CMakeLists.txt @@ -18,6 +18,7 @@ add_lld_library(lldMachO ICF.cpp InputFiles.cpp InputSection.cpp + LinkerOptimizationHints.cpp LTO.cpp MapFile.cpp MarkLive.cpp diff --git a/lld/MachO/LinkerOptimizationHints.cpp b/lld/MachO/LinkerOptimizationHints.cpp new file mode 100644 index 0000000000000..60c999b19ecfc --- /dev/null +++ b/lld/MachO/LinkerOptimizationHints.cpp @@ -0,0 +1,526 @@ +//===- LinkerOptimizationHints.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "LinkerOptimizationHints.h" + +#include "Arch/ARM64Common.h" +#include "lld/Common/ErrorHandler.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/LEB128.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; +using namespace llvm::MachO; +using namespace llvm::support::endian; +using namespace lld; +using namespace lld::macho; + +namespace { +struct Adrp { + uint32_t destRegister; + int64_t addend; +}; + +struct Add { + uint8_t destRegister; + uint8_t srcRegister; + uint32_t addend; +}; + +enum ExtendType { ZeroExtend = 1, Sign64 = 2, Sign32 = 3 }; + +struct Ldr { + uint8_t destRegister; + uint8_t baseRegister; + uint8_t p2Size; + bool isFloat; + ExtendType extendType; + int64_t offset; +}; +} // namespace + +static bool parseAdrp(uint32_t insn, Adrp &adrp) { + if ((insn & 0x9f000000) != 0x90000000) + return false; + adrp.destRegister = insn & 0x1f; + uint64_t immHi = (insn >> 5) & 0x7ffff; + uint64_t immLo = (insn >> 29) & 0x3; + adrp.addend = SignExtend64<21>(immLo | (immHi << 2)) * 4096; + return true; +} + +static bool parseAdd(uint32_t insn, Add &add) { + if ((insn & 0xffc00000) != 0x91000000) + return false; + add.destRegister = insn & 0x1f; + add.srcRegister = (insn >> 5) & 0x1f; + add.addend = (insn >> 10) & 0xfff; + return true; +} + +static bool parseLdr(uint32_t insn, Ldr &ldr) { + ldr.destRegister = insn & 0x1f; + ldr.baseRegister = (insn >> 5) & 0x1f; + uint8_t size = insn >> 30; + uint8_t opc = (insn >> 22) & 3; + + if ((insn & 0x3fc00000) == 0x39400000) { + // LDR (immediate), LDRB (immediate), LDRH (immediate) + ldr.p2Size = size; + ldr.extendType = ZeroExtend; + ldr.isFloat = false; + } else if ((insn & 0x3f800000) == 0x39800000) { + // LDRSB (immediate), LDRSH (immediate), LDRSW (immediate) + 
ldr.p2Size = size; + ldr.extendType = static_cast(opc); + ldr.isFloat = false; + } else if ((insn & 0x3f400000) == 0x3d400000) { + // LDR (immediate, SIMD&FP) + ldr.extendType = ZeroExtend; + ldr.isFloat = true; + if (opc == 1) + ldr.p2Size = size; + else if (size == 0 && opc == 3) + ldr.p2Size = 4; + else + return false; + } else { + return false; + } + ldr.offset = ((insn >> 10) & 0xfff) << ldr.p2Size; + return true; +} + +static bool isValidAdrOffset(int32_t delta) { return isInt<21>(delta); } + +static void writeAdr(void *loc, uint32_t dest, int32_t delta) { + assert(isValidAdrOffset(delta)); + uint32_t opcode = 0x10000000; + uint32_t immHi = (delta & 0x001ffffc) << 3; + uint32_t immLo = (delta & 0x00000003) << 29; + write32le(loc, opcode | immHi | immLo | dest); +} + +static void writeNop(void *loc) { write32le(loc, 0xd503201f); } + +static bool isLiteralLdrEligible(const Ldr &ldr) { + return ldr.p2Size > 1 && isShiftedInt<19, 2>(ldr.offset); +} + +static void writeLiteralLdr(void *loc, const Ldr &ldr) { + assert(isLiteralLdrEligible(ldr)); + uint32_t imm19 = (ldr.offset / 4 & maskTrailingOnes(19)) << 5; + uint32_t opcode; + switch (ldr.p2Size) { + case 2: + if (ldr.isFloat) + opcode = 0x1c000000; + else + opcode = ldr.extendType == Sign64 ? 0x98000000 : 0x18000000; + break; + case 3: + opcode = ldr.isFloat ? 0x5c000000 : 0x58000000; + break; + case 4: + opcode = 0x9c000000; + break; + default: + llvm_unreachable("Invalid literal ldr size"); + } + write32le(loc, opcode | imm19 | ldr.destRegister); +} + +static bool isImmediateLdrEligible(const Ldr &ldr) { + // Note: We deviate from ld64's behavior, which converts to immediate loads + // only if ldr.offset < 4096, even though the offset is divided by the load's + // size in the 12-bit immediate operand. Only the unsigned offset variant is + // supported. 
+ + uint32_t size = 1 << ldr.p2Size; + return ldr.offset >= 0 && (ldr.offset % size) == 0 && + isUInt<12>(ldr.offset >> ldr.p2Size); +} + +static void writeImmediateLdr(void *loc, const Ldr &ldr) { + assert(isImmediateLdrEligible(ldr)); + uint32_t opcode = 0x39000000; + if (ldr.isFloat) { + opcode |= 0x04000000; + assert(ldr.extendType == ZeroExtend); + } + opcode |= ldr.destRegister; + opcode |= ldr.baseRegister << 5; + uint8_t size, opc; + if (ldr.p2Size == 4) { + size = 0; + opc = 3; + } else { + opc = ldr.extendType; + size = ldr.p2Size; + } + uint32_t immBits = ldr.offset >> ldr.p2Size; + write32le(loc, opcode | (immBits << 10) | (opc << 22) | (size << 30)); +} + +// Transforms a pair of adrp+add instructions into an adr instruction if the +// target is within the +/- 1 MiB range allowed by the adr's 21 bit signed +// immediate offset. +// +// adrp xN, _foo@PAGE +// add xM, xN, _foo@PAGEOFF +// -> +// adr xM, _foo +// nop +static bool applyAdrpAdd(uint8_t *buf, const ConcatInputSection *isec, + uint64_t offset1, uint64_t offset2) { + uint32_t ins1 = read32le(buf + offset1); + uint32_t ins2 = read32le(buf + offset2); + Adrp adrp; + Add add; + if (!parseAdrp(ins1, adrp) || !parseAdd(ins2, add)) + return false; + if (adrp.destRegister != add.srcRegister) + return false; + + uint64_t addr1 = isec->getVA() + offset1; + uint64_t referent = lld::macho::pageBits(addr1) + adrp.addend + add.addend; + int64_t delta = referent - addr1; + if (!isValidAdrOffset(delta)) + return false; + + writeAdr(buf + offset1, add.destRegister, delta); + writeNop(buf + offset2); + return true; +} + +// Transforms two adrp instructions into a single adrp if their referent +// addresses are located on the same 4096 byte page. 
+// +// adrp xN, _foo@PAGE +// adrp xN, _bar@PAGE +// -> +// adrp xN, _foo@PAGE +// nop +static void applyAdrpAdrp(uint8_t *buf, const ConcatInputSection *isec, + uint64_t offset1, uint64_t offset2) { + uint32_t ins1 = read32le(buf + offset1); + uint32_t ins2 = read32le(buf + offset2); + Adrp adrp1, adrp2; + if (!parseAdrp(ins1, adrp1) || !parseAdrp(ins2, adrp2)) + return; + if (adrp1.destRegister != adrp2.destRegister) + return; + + uint64_t page1 = pageBits(offset1 + isec->getVA()) + adrp1.addend; + uint64_t page2 = pageBits(offset2 + isec->getVA()) + adrp2.addend; + if (page1 != page2) + return; + + writeNop(buf + offset2); +} + +// Transforms a pair of adrp+ldr (immediate) instructions into an ldr (literal) +// load from a PC-relative address if it is 4-byte aligned and within +/- 1 MiB, +// as ldr can encode a signed 19-bit offset that gets multiplied by 4. +// +// adrp xN, _foo@PAGE +// ldr xM, [xN, _foo@PAGEOFF] +// -> +// nop +// ldr xM, _foo +static void applyAdrpLdr(uint8_t *buf, const ConcatInputSection *isec, + uint64_t offset1, uint64_t offset2) { + uint32_t ins1 = read32le(buf + offset1); + uint32_t ins2 = read32le(buf + offset2); + Adrp adrp; + Ldr ldr; + if (!parseAdrp(ins1, adrp) || !parseLdr(ins2, ldr)) + return; + if (adrp.destRegister != ldr.baseRegister) + return; + + uint64_t addr1 = isec->getVA() + offset1; + uint64_t addr2 = isec->getVA() + offset2; + uint64_t referent = pageBits(addr1) + adrp.addend + ldr.offset; + ldr.offset = referent - addr2; + if (!isLiteralLdrEligible(ldr)) + return; + + writeNop(buf + offset1); + writeLiteralLdr(buf + offset2, ldr); +} + +// GOT loads are emitted by the compiler as a pair of adrp and ldr instructions, +// but they may be changed to adrp+add by relaxGotLoad(). This hint performs +// the AdrpLdr or AdrpAdd transformation depending on whether it was relaxed. 
+static void applyAdrpLdrGot(uint8_t *buf, const ConcatInputSection *isec, + uint64_t offset1, uint64_t offset2) { + uint32_t ins2 = read32le(buf + offset2); + Add add; + Ldr ldr; + if (parseAdd(ins2, add)) + applyAdrpAdd(buf, isec, offset1, offset2); + else if (parseLdr(ins2, ldr)) + applyAdrpLdr(buf, isec, offset1, offset2); +} + +// Optimizes an adrp+add+ldr sequence used for loading from a local symbol's +// address by loading directly if it's close enough, or to an adrp(p)+ldr +// sequence if it's not. +// +// adrp x0, _foo@PAGE +// add x1, x0, _foo@PAGEOFF +// ldr x2, [x1, #off] +static void applyAdrpAddLdr(uint8_t *buf, const ConcatInputSection *isec, + uint64_t offset1, uint64_t offset2, + uint64_t offset3) { + uint32_t ins1 = read32le(buf + offset1); + uint32_t ins2 = read32le(buf + offset2); + uint32_t ins3 = read32le(buf + offset3); + Adrp adrp; + Add add; + Ldr ldr; + if (!parseAdrp(ins1, adrp) || !parseAdd(ins2, add) || !parseLdr(ins3, ldr)) + return; + if (adrp.destRegister != add.srcRegister) + return; + if (add.destRegister != ldr.baseRegister) + return; + + // Load from the target address directly. + // nop + // nop + // ldr x2, [_foo + #off] + uint64_t addr1 = isec->getVA() + offset1; + uint64_t addr3 = isec->getVA() + offset3; + uint64_t referent = pageBits(addr1) + adrp.addend + add.addend; + Ldr literalLdr = ldr; + literalLdr.offset += referent - addr3; + if (isLiteralLdrEligible(literalLdr)) { + writeNop(buf + offset1); + writeNop(buf + offset2); + writeLiteralLdr(buf + offset3, literalLdr); + return; + } + + if (applyAdrpAdd(buf, isec, offset1, offset2)) + return; + + // Move the target's page offset into the ldr's immediate offset. 
+ // adrp x0, _foo@PAGE + // nop + // ldr x2, [x0, _foo@PAGEOFF + #off] + Ldr immediateLdr = ldr; + immediateLdr.baseRegister = adrp.destRegister; + immediateLdr.offset += add.addend; + if (isImmediateLdrEligible(immediateLdr)) { + writeNop(buf + offset2); + writeImmediateLdr(buf + offset3, immediateLdr); + return; + } +} + +// Relaxes a GOT-indirect load. +// If the referenced symbol is external and its GOT entry is within +/- 1 MiB, +// the GOT entry can be loaded with a single literal ldr instruction. +// If the referenced symbol is local and thus has been relaxed to adrp+add+ldr, +// we perform the AdrpAddLdr transformation. +static void applyAdrpLdrGotLdr(uint8_t *buf, const ConcatInputSection *isec, + uint64_t offset1, uint64_t offset2, + uint64_t offset3) { + uint32_t ins2 = read32le(buf + offset2); + Add add; + Ldr ldr2; + + if (parseAdd(ins2, add)) { + applyAdrpAddLdr(buf, isec, offset1, offset2, offset3); + } else if (parseLdr(ins2, ldr2)) { + // adrp x1, _foo@GOTPAGE + // ldr x2, [x1, _foo@GOTPAGEOFF] + // ldr x3, [x2, #off] + uint32_t ins3 = read32le(buf + offset3); + Ldr ldr3; + if (!parseLdr(ins3, ldr3)) + return; + if (ldr3.baseRegister != ldr2.destRegister) + return; + // Loads from the GOT must be pointer sized. + if (ldr2.p2Size != 3 || ldr2.isFloat) + return; + applyAdrpLdr(buf, isec, offset1, offset2); + } +} + +template +static void forEachHint(ArrayRef data, Callback callback) { + std::array args; + + auto readNext = [&]() -> uint64_t { + unsigned int n = 0; + uint64_t value = decodeULEB128(data.data(), &n, data.end()); + data = data.drop_front(n); + return value; + }; + + while (!data.empty()) { + uint64_t type = readNext(); + if (type == 0) + break; + + uint64_t argCount = readNext(); + for (unsigned i = 0; i < argCount; ++i) { + uint64_t arg = readNext(); + if (i < 3) + args[i] = arg; + } + // All known LOH types as of 2022-09 have 3 or fewer arguments; skip others. 
+ if (argCount > 3) + continue; + callback(type, ArrayRef(args.data(), argCount)); + } +} + +// On RISC architectures like arm64, materializing a memory address generally +// takes multiple instructions. If the referenced symbol is located close enough +// in memory, fewer instructions are needed. +// +// Linker optimization hints record where addresses are computed. After +// addresses have been assigned, if possible, we change them to a shorter +// sequence of instructions. The size of the binary is not modified; the +// eliminated instructions are replaced with NOPs. This still leads to faster +// code as the CPU can skip over NOPs quickly. +// +// LOHs are specified by the LC_LINKER_OPTIMIZATION_HINTS load command, which +// points to a sequence of ULEB128-encoded numbers. Each entry specifies a +// transformation kind, and 2 or 3 addresses where the instructions are located. +void macho::applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj) { + ArrayRef data = obj.getOptimizationHints(); + if (data.empty()) + return; + + const ConcatInputSection *section = nullptr; + uint64_t sectionAddr = 0; + uint8_t *buf = nullptr; + + auto findSection = [&](uint64_t addr) { + if (section && addr >= sectionAddr && + addr < sectionAddr + section->getSize()) + return true; + + if (obj.sections.empty()) + return false; + auto secIt = std::prev(llvm::upper_bound( + obj.sections, addr, + [](uint64_t off, const Section *sec) { return off < sec->addr; })); + const Section *sec = *secIt; + + if (sec->subsections.empty()) + return false; + auto subsecIt = std::prev(llvm::upper_bound( + sec->subsections, addr - sec->addr, + [](uint64_t off, Subsection subsec) { return off < subsec.offset; })); + const Subsection &subsec = *subsecIt; + const ConcatInputSection *isec = + dyn_cast_or_null(subsec.isec); + if (!isec || isec->shouldOmitFromOutput()) + return false; + + section = isec; + sectionAddr = subsec.offset + sec->addr; + buf = outBuf + section->outSecOff + 
section->parent->fileOff; + return true; + }; + + auto isValidOffset = [&](uint64_t offset) { + if (offset < sectionAddr || offset >= sectionAddr + section->getSize()) { + error(toString(&obj) + + ": linker optimization hint spans multiple sections"); + return false; + } + return true; + }; + + bool hasAdrpAdrp = false; + forEachHint(data, [&](uint64_t kind, ArrayRef args) { + if (kind == LOH_ARM64_ADRP_ADRP) { + hasAdrpAdrp = true; + return; + } + + if (!findSection(args[0])) + return; + switch (kind) { + case LOH_ARM64_ADRP_ADD: + if (isValidOffset(args[1])) + applyAdrpAdd(buf, section, args[0] - sectionAddr, + args[1] - sectionAddr); + break; + case LOH_ARM64_ADRP_LDR: + if (isValidOffset(args[1])) + applyAdrpLdr(buf, section, args[0] - sectionAddr, + args[1] - sectionAddr); + break; + case LOH_ARM64_ADRP_LDR_GOT: + if (isValidOffset(args[1])) + applyAdrpLdrGot(buf, section, args[0] - sectionAddr, + args[1] - sectionAddr); + break; + case LOH_ARM64_ADRP_ADD_LDR: + if (isValidOffset(args[1]) && isValidOffset(args[2])) + applyAdrpAddLdr(buf, section, args[0] - sectionAddr, + args[1] - sectionAddr, args[2] - sectionAddr); + break; + case LOH_ARM64_ADRP_LDR_GOT_LDR: + if (isValidOffset(args[1]) && isValidOffset(args[2])) + applyAdrpLdrGotLdr(buf, section, args[0] - sectionAddr, + args[1] - sectionAddr, args[2] - sectionAddr); + break; + case LOH_ARM64_ADRP_ADD_STR: + case LOH_ARM64_ADRP_LDR_GOT_STR: + // TODO: Implement these + break; + } + }); + + if (!hasAdrpAdrp) + return; + + // AdrpAdrp optimization hints are performed in a second pass because they + // might interfere with other transformations. 
For instance, consider the + // following input: + // + // adrp x0, _foo@PAGE + // add x1, x0, _foo@PAGEOFF + // adrp x0, _bar@PAGE + // add x2, x0, _bar@PAGEOFF + // + // If we perform the AdrpAdrp relaxation first, we get: + // + // adrp x0, _foo@PAGE + // add x1, x0, _foo@PAGEOFF + // nop + // add x2, x0, _bar@PAGEOFF + // + // If we then apply AdrpAdd to the first two instructions, the add will have a + // garbage value in x0: + // + // adr x1, _foo + // nop + // nop + // add x2, x0, _bar@PAGEOFF + forEachHint(data, [&](uint64_t kind, ArrayRef args) { + if (kind != LOH_ARM64_ADRP_ADRP) + return; + if (!findSection(args[0])) + return; + if (isValidOffset(args[1])) + applyAdrpAdrp(buf, section, args[0] - sectionAddr, args[1] - sectionAddr); + }); +} diff --git a/lld/MachO/LinkerOptimizationHints.h b/lld/MachO/LinkerOptimizationHints.h new file mode 100644 index 0000000000000..eada9b048c255 --- /dev/null +++ b/lld/MachO/LinkerOptimizationHints.h @@ -0,0 +1,17 @@ +//===- LinkerOptimizationHints.h ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_MACHO_LINKER_OPTIMIZATION_HINTS_H +#define LLD_MACHO_LINKER_OPTIMIZATION_HINTS_H + +#include "InputFiles.h" + +namespace lld::macho { +void applyOptimizationHints(uint8_t *outBuf, const ObjFile &obj); +} +#endif diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h index 39f5f94078611..27e5178593c87 100644 --- a/lld/MachO/Target.h +++ b/lld/MachO/Target.h @@ -124,8 +124,6 @@ class TargetInfo { llvm_unreachable("Unsupported architecture for dtrace symbols"); } - virtual void applyOptimizationHints(uint8_t *, const ObjFile &) const {}; - uint32_t magic; llvm::MachO::CPUType cpuType; uint32_t cpuSubtype; diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index 613e6dea3b897..a18f350ee69da 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -11,6 +11,7 @@ #include "Config.h" #include "InputFiles.h" #include "InputSection.h" +#include "LinkerOptimizationHints.h" #include "MapFile.h" #include "OutputSection.h" #include "OutputSegment.h" @@ -1216,7 +1217,7 @@ void Writer::applyOptimizationHints() { TimeTraceScope timeScope("Apply linker optimization hints"); parallelForEach(inputFiles, [buf](const InputFile *file) { if (const auto *objFile = dyn_cast(file)) - target->applyOptimizationHints(buf, *objFile); + macho::applyOptimizationHints(buf, *objFile); }); } diff --git a/llvm/utils/gn/secondary/lld/MachO/BUILD.gn b/llvm/utils/gn/secondary/lld/MachO/BUILD.gn index db608e3cc7449..b118d16441960 100644 --- a/llvm/utils/gn/secondary/lld/MachO/BUILD.gn +++ b/llvm/utils/gn/secondary/lld/MachO/BUILD.gn @@ -37,6 +37,7 @@ static_library("MachO") { "ICF.cpp", "InputFiles.cpp", "InputSection.cpp", + "LinkerOptimizationHints.cpp", "LTO.cpp", "MapFile.cpp", "MarkLive.cpp", From c372a2cd0a1e4502f35bf8ebfc0a5d682223249e Mon Sep 17 00:00:00 2001 From: Daniel Paoliello Date: Wed, 16 Jul 2025 12:21:40 -0700 Subject: 
[PATCH 093/813] Use default instead of a specific case to fix the MSVC warning in sysinfo.cc (#149159) #147357 attempted to fix an MSVC in sysinfo.cc by adding a `case` block for a missing enum value. However, this resulted in [CI failures](https://github.com/llvm/llvm-project/pull/147357#issuecomment-3079709852): ``` 4.170 [148/4/9] Building CXX object third-party/benchmark/src/CMakeFiles/benchmark.dir/sysinfo.cc.obj FAILED: third-party/benchmark/src/CMakeFiles/benchmark.dir/sysinfo.cc.obj C:\Users\tcwg\scoop\shims\ccache.exe C:\Users\tcwg\scoop\apps\llvm-arm64\current\bin\clang-cl.exe /nologo -TP -DBENCHMARK_STATIC_DEFINE -DEXPERIMENTAL_KEY_INSTRUCTIONS -DHAVE_STD_REGEX -DHAVE_STEADY_CLOCK -DUNICODE -D_CRT_NONSTDC_NO_DEPRECATE -D_CRT_NONSTDC_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE -D_CRT_SECURE_NO_WARNINGS -D_GLIBCXX_ASSERTIONS -D_HAS_EXCEPTIONS=0 -D_SCL_SECURE_NO_DEPRECATE -D_SCL_SECURE_NO_WARNINGS -D_UNICODE -D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_LIMIT_MACROS -IC:/Users/tcwg/llvm-worker/flang-arm64-windows-msvc/build/third-party/benchmark/src -IC:/Users/tcwg/llvm-worker/flang-arm64-windows-msvc/llvm-project/third-party/benchmark/src -IC:/Users/tcwg/llvm-worker/flang-arm64-windows-msvc/build/include -IC:/Users/tcwg/llvm-worker/flang-arm64-windows-msvc/llvm-project/llvm/include -IC:/Users/tcwg/llvm-worker/flang-arm64-windows-msvc/llvm-project/third-party/benchmark/include /DWIN32 /D_WINDOWS /Zc:inline /Zc:__cplusplus /Oi /Brepro /bigobj /permissive- -Werror=unguarded-availability-new -Wextra -Wno-unused-parameter -Wwrite-strings -Wcast-qual -Wmissing-field-initializers -Wimplicit-fallthrough -Wcovered-switch-default -Wno-noexcept-type -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wsuggest-override -Wstring-conversion -Wmisleading-indentation -Wctad-maybe-unsupported /Gw /W4 -EHs- -EHa- /O2 /Ob2 -std:c++14 -MD -UNDEBUG /showIncludes /Fothird-party/benchmark/src/CMakeFiles/benchmark.dir/sysinfo.cc.obj 
/Fdthird-party\benchmark\src\CMakeFiles\benchmark.dir\benchmark.pdb -c -- C:/Users/tcwg/llvm-worker/flang-arm64-windows-msvc/llvm-project/third-party/benchmark/src/sysinfo.cc C:/Users/tcwg/llvm-worker/flang-arm64-windows-msvc/llvm-project/third-party/benchmark/src/sysinfo.cc(374,12): error: use of undeclared identifier 'CacheUnknown' 374 | case CacheUnknown: | ^ 1 error generated. ``` The root cause is that the enum being switched on is defined in the Windows SDK, so depending on which version of the SDK you are using `CacheUnknown` may or may not be defined. The correct fix here is to use a `default` block in the switch statement instead. --- third-party/benchmark/src/sysinfo.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third-party/benchmark/src/sysinfo.cc b/third-party/benchmark/src/sysinfo.cc index 837be8f9cf891..d1ae6cc82b943 100644 --- a/third-party/benchmark/src/sysinfo.cc +++ b/third-party/benchmark/src/sysinfo.cc @@ -371,7 +371,7 @@ std::vector GetCacheSizesWindows() { case CacheTrace: C.type = "Trace"; break; - case CacheUnknown: + default: C.type = "Unknown"; break; } From b3c72a97c5ac352b89c12f3cf7c3f223219f91ed Mon Sep 17 00:00:00 2001 From: Andre Kuhlenschmidt Date: Wed, 16 Jul 2025 12:26:38 -0700 Subject: [PATCH 094/813] [flang][driver] -Werror promotes warnings to error and interopts with -Wfatal-errors (#148748) This PR changes how `-Werror` promotes warnings to errors so that it interoperates with `-Wfatal-error`. It maintains the property that warnings and other messages promoted to errors are displayed as there original message. 
--- flang/include/flang/Parser/message.h | 4 +-- flang/lib/Frontend/FrontendAction.cpp | 9 +++--- flang/lib/Parser/message.cpp | 17 +++++++++-- flang/lib/Semantics/semantics.cpp | 5 ++-- flang/test/Driver/fatal-errors-warnings.f90 | 31 +++++++++++++++++++++ 5 files changed, 53 insertions(+), 13 deletions(-) create mode 100644 flang/test/Driver/fatal-errors-warnings.f90 diff --git a/flang/include/flang/Parser/message.h b/flang/include/flang/Parser/message.h index db1a0a65157e3..9192d23529913 100644 --- a/flang/include/flang/Parser/message.h +++ b/flang/include/flang/Parser/message.h @@ -355,9 +355,9 @@ class Messages { void Emit(llvm::raw_ostream &, const AllCookedSources &, bool echoSourceLines = true, const common::LanguageFeatureControl *hintFlags = nullptr, - std::size_t maxErrorsToEmit = 0) const; + std::size_t maxErrorsToEmit = 0, bool warningsAreErrors = false) const; void AttachTo(Message &, std::optional = std::nullopt); - bool AnyFatalError() const; + bool AnyFatalError(bool warningsAreErrors = false) const; private: std::list messages_; diff --git a/flang/lib/Frontend/FrontendAction.cpp b/flang/lib/Frontend/FrontendAction.cpp index 2429e07e5b8c4..58901c6000380 100644 --- a/flang/lib/Frontend/FrontendAction.cpp +++ b/flang/lib/Frontend/FrontendAction.cpp @@ -230,15 +230,14 @@ bool FrontendAction::reportFatalErrors(const char (&message)[N]) { const common::LanguageFeatureControl &features{ instance->getInvocation().getFortranOpts().features}; const size_t maxErrors{instance->getInvocation().getMaxErrors()}; - if (!instance->getParsing().messages().empty() && - (instance->getInvocation().getWarnAsErr() || - instance->getParsing().messages().AnyFatalError())) { + const bool warningsAreErrors{instance->getInvocation().getWarnAsErr()}; + if (instance->getParsing().messages().AnyFatalError(warningsAreErrors)) { const unsigned diagID = instance->getDiagnostics().getCustomDiagID( clang::DiagnosticsEngine::Error, message); instance->getDiagnostics().Report(diagID) 
<< getCurrentFileOrBufferName(); instance->getParsing().messages().Emit( llvm::errs(), instance->getAllCookedSources(), - /*echoSourceLines=*/true, &features, maxErrors); + /*echoSourceLines=*/true, &features, maxErrors, warningsAreErrors); return true; } if (instance->getParsing().parseTree().has_value() && @@ -249,7 +248,7 @@ bool FrontendAction::reportFatalErrors(const char (&message)[N]) { instance->getDiagnostics().Report(diagID) << getCurrentFileOrBufferName(); instance->getParsing().messages().Emit( llvm::errs(), instance->getAllCookedSources(), - /*echoSourceLine=*/true, &features, maxErrors); + /*echoSourceLine=*/true, &features, maxErrors, warningsAreErrors); instance->getParsing().EmitMessage( llvm::errs(), instance->getParsing().finalRestingPlace(), "parser FAIL (final position)", "error: ", llvm::raw_ostream::RED); diff --git a/flang/lib/Parser/message.cpp b/flang/lib/Parser/message.cpp index 909fba948a45a..2a8101dd0b810 100644 --- a/flang/lib/Parser/message.cpp +++ b/flang/lib/Parser/message.cpp @@ -453,7 +453,7 @@ void Messages::ResolveProvenances(const AllCookedSources &allCooked) { void Messages::Emit(llvm::raw_ostream &o, const AllCookedSources &allCooked, bool echoSourceLines, const common::LanguageFeatureControl *hintFlagPtr, - std::size_t maxErrorsToEmit) const { + std::size_t maxErrorsToEmit, bool warningsAreErrors) const { std::vector sorted; for (const auto &msg : messages_) { sorted.push_back(&msg); @@ -469,7 +469,7 @@ void Messages::Emit(llvm::raw_ostream &o, const AllCookedSources &allCooked, } msg->Emit(o, allCooked, echoSourceLines, hintFlagPtr); lastMsg = msg; - if (msg->IsFatal()) { + if (warningsAreErrors || msg->IsFatal()) { ++errorsEmitted; } // If maxErrorsToEmit is 0, emit all errors, otherwise break after @@ -491,7 +491,18 @@ void Messages::AttachTo(Message &msg, std::optional severity) { messages_.clear(); } -bool Messages::AnyFatalError() const { +bool Messages::AnyFatalError(bool warningsAreErrors) const { + // Short-circuit 
in the most common case. + if (messages_.empty()) { + return false; + } + // If warnings are errors and there are warnings or errors, this is fatal. + // This preserves the compiler's current behavior of treating any non-fatal + // message as a warning. We may want to refine this in the future. + if (warningsAreErrors) { + return true; + } + // Otherwise, check the message buffer for fatal errors. for (const auto &msg : messages_) { if (msg.IsFatal()) { return true; diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp index ab78605d01f4c..b15ed057b52f2 100644 --- a/flang/lib/Semantics/semantics.cpp +++ b/flang/lib/Semantics/semantics.cpp @@ -376,8 +376,7 @@ const DeclTypeSpec &SemanticsContext::MakeLogicalType(int kind) { } bool SemanticsContext::AnyFatalError() const { - return !messages_.empty() && - (warningsAreErrors_ || messages_.AnyFatalError()); + return messages_.AnyFatalError(warningsAreErrors_); } bool SemanticsContext::HasError(const Symbol &symbol) { return errorSymbols_.count(symbol) > 0; @@ -658,7 +657,7 @@ void Semantics::EmitMessages(llvm::raw_ostream &os) { context_.messages().ResolveProvenances(context_.allCookedSources()); context_.messages().Emit(os, context_.allCookedSources(), /*echoSourceLine=*/true, &context_.languageFeatures(), - /*maxErrorsToEmit=*/context_.maxErrors()); + context_.maxErrors(), context_.warningsAreErrors()); } void SemanticsContext::DumpSymbols(llvm::raw_ostream &os) { diff --git a/flang/test/Driver/fatal-errors-warnings.f90 b/flang/test/Driver/fatal-errors-warnings.f90 new file mode 100644 index 0000000000000..2de09c3ed0778 --- /dev/null +++ b/flang/test/Driver/fatal-errors-warnings.f90 @@ -0,0 +1,31 @@ +! RUN: %flang_fc1 -Wfatal-errors -pedantic %s 2>&1 | FileCheck %s --check-prefix=CHECK1 +! RUN: not %flang_fc1 -pedantic -Werror %s 2>&1 | FileCheck %s --check-prefix=CHECK2 +! 
RUN: not %flang_fc1 -Wfatal-errors -pedantic -Werror %s 2>&1 | FileCheck %s --check-prefix=CHECK3 + +module m + contains + subroutine foo(a) + real, intent(in), target :: a(:) + end subroutine +end module + +program test + use m + real, target :: a(1) + real :: b(1) + call foo(a) ! ok + !CHECK1: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK2: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK3: fatal-errors-warnings.f90:{{.*}} warning: + call foo(b) + !CHECK1: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK2: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK3-NOT: error: + !CHECK3-NOT: warning: + call foo((a)) + !CHECK1: fatal-errors-warnings.f90:{{.*}} warning: + !CHECK2: fatal-errors-warnings.f90:{{.*}} warning: + call foo(a([1])) + !! Hard error instead of warning if uncommented. + !call foo(a(1)) +end \ No newline at end of file From 362594a10fa5fd8e5f8d31eb5391370c928b639e Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Jul 2025 20:27:50 +0100 Subject: [PATCH 095/813] [TableGen] Remove unnecessary sortAndUniqueRegisters (#149125) Each of the SRSets is already sorted and unique because it is a filtered version of RC->getMembers() which is already sorted and unique. --- llvm/utils/TableGen/Common/CodeGenRegisters.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index c43cc9afe1e3c..1fb2918a72636 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -2295,9 +2295,6 @@ void CodeGenRegBank::inferSubClassWithSubReg(CodeGenRegisterClass *RC) { SRSets[I].push_back(R); } - for (auto I : SRSets) - sortAndUniqueRegisters(I.second); - // Find matching classes for all SRSets entries. Iterate in SubRegIndex // numerical order to visit synthetic indices last. 
for (const CodeGenSubRegIndex &SubIdx : SubRegIndices) { From 43f10639a18b2b8fb0976f3bde84a9d240647915 Mon Sep 17 00:00:00 2001 From: Daniel Bertalan Date: Wed, 16 Jul 2025 21:29:48 +0200 Subject: [PATCH 096/813] [lld-macho] Enable Linker Optimization Hints pass for arm64_32 (#148964) The backend emits `.loh` directives for arm64_32 as well. Our pass already handles 32-bit pointer loads correctly (there was an extraneous sanity check for 8-byte pointer sizes, I removed that here), so we can enable them for all arm64 subtargets, including our upcoming arm64e support. --- lld/MachO/LinkerOptimizationHints.cpp | 3 -- lld/MachO/Writer.cpp | 3 +- lld/test/MachO/loh-arm64-32.s | 64 +++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 lld/test/MachO/loh-arm64-32.s diff --git a/lld/MachO/LinkerOptimizationHints.cpp b/lld/MachO/LinkerOptimizationHints.cpp index 60c999b19ecfc..bae1a576eea57 100644 --- a/lld/MachO/LinkerOptimizationHints.cpp +++ b/lld/MachO/LinkerOptimizationHints.cpp @@ -351,9 +351,6 @@ static void applyAdrpLdrGotLdr(uint8_t *buf, const ConcatInputSection *isec, return; if (ldr3.baseRegister != ldr2.destRegister) return; - // Loads from the GOT must be pointer sized. 
- if (ldr2.p2Size != 3 || ldr2.isFloat) - return; applyAdrpLdr(buf, isec, offset1, offset2); } } diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp index a18f350ee69da..f288fadc0d14f 100644 --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -1210,7 +1210,8 @@ void Writer::writeSections() { } void Writer::applyOptimizationHints() { - if (config->arch() != AK_arm64 || config->ignoreOptimizationHints) + if (!is_contained({AK_arm64, AK_arm64e, AK_arm64_32}, config->arch()) || + config->ignoreOptimizationHints) return; uint8_t *buf = buffer->getBufferStart(); diff --git a/lld/test/MachO/loh-arm64-32.s b/lld/test/MachO/loh-arm64-32.s new file mode 100644 index 0000000000000..906d0e1ce9046 --- /dev/null +++ b/lld/test/MachO/loh-arm64-32.s @@ -0,0 +1,64 @@ +# REQUIRES: aarch64 + +# RUN: llvm-mc -filetype=obj -triple=arm64_32-apple-watchos %s -o %t.o +# RUN: %lld-watchos -U _external %t.o -o %t +# RUN: llvm-objdump -d --macho %t | FileCheck %s + +.text +.align 2 +.globl _foo +_foo: + ret +.globl _bar +_bar: + ret + +.globl _main +_main: +# CHECK-LABEL: _main: + +L1: adrp x0, _foo@PAGE +L2: add x0, x0, _foo@PAGEOFF +# CHECK-NEXT: adr x0 +# CHECK-NEXT: nop + +L3: adrp x0, _ptr@PAGE +L4: add x1, x0, _ptr@PAGEOFF +L5: ldr x2, [x1] +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: ldr x2 + +L6: adrp x0, _foo@PAGE +L7: adrp x0, _bar@PAGE +# CHECK-NEXT: adrp x0 +# CHECK-NEXT: nop + +L8: adrp x0, _ptr@PAGE +L9: ldr x0, [x0, _ptr@PAGEOFF] +# CHECK-NEXT: nop +# CHECK-NEXT: ldr x0 + +L10: adrp x0, _ptr@PAGE +L11: ldr w0, [x0, _ptr@PAGEOFF] +# CHECK-NEXT: nop +# CHECK-NEXT: ldr w0, _ptr + +L12: adrp x0, _external@PAGE +L13: ldr w1, [x0, _external@PAGEOFF] +L14: ldr x2, [x1] +# CHECK-NEXT: nop +# CHECK-NEXT: ldr w1, 0x{{.*}} +# CHECK-NEXT: ldr x2, [x1] + +.data +.align 4 +_ptr: + .quad 0 + +.loh AdrpAdd L1, L2 +.loh AdrpAddLdr L3, L4, L5 +.loh AdrpAdrp L6, L7 +.loh AdrpLdr L8, L9 +.loh AdrpLdrGot L10, L11 +.loh AdrpLdrGotLdr L12, L13, L14 From 
4355356d96de1e171f7511a6c41d056871dacc68 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Jul 2025 20:31:32 +0100 Subject: [PATCH 097/813] [TableGen] Add a bitvector of members of CodeGenRegisterClass (#149122) This makes CodeGenRegisterClass::contains fast. Use this to simplify inferMatchingSuperRegClass. --- .../TableGen/Common/CodeGenRegisters.cpp | 27 ++++++++----------- llvm/utils/TableGen/Common/CodeGenRegisters.h | 4 ++- llvm/utils/TableGen/RegisterInfoEmitter.cpp | 2 +- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index 1fb2918a72636..28b542f09e8c0 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -701,11 +701,13 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Orders.resize(1 + AltOrders->size()); // Default allocation order always contains all registers. + MemberBV.resize(RegBank.getRegisters().size()); Artificial = true; for (const Record *Element : *Elements) { Orders[0].push_back(Element); const CodeGenRegister *Reg = RegBank.getReg(Element); Members.push_back(Reg); + MemberBV.set(CodeGenRegBank::getRegIndex(Reg)); Artificial &= Reg->Artificial; if (!Reg->getSuperRegs().empty()) RegsWithSuperRegsTopoSigs.set(Reg->getTopoSig()); @@ -767,9 +769,11 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, RegsWithSuperRegsTopoSigs(RegBank.getNumTopoSigs()), EnumValue(-1), RSI(Props.RSI), CopyCost(0), Allocatable(true), AllocationPriority(0), GlobalPriority(false), TSFlags(0) { + MemberBV.resize(RegBank.getRegisters().size()); Artificial = true; GeneratePressureSet = false; for (const auto R : Members) { + MemberBV.set(CodeGenRegBank::getRegIndex(R)); if (!R->getSuperRegs().empty()) RegsWithSuperRegsTopoSigs.set(R->getTopoSig()); Artificial &= R->Artificial; @@ -833,7 +837,7 @@ bool CodeGenRegisterClass::hasType(const ValueTypeByHwMode 
&VT) const { } bool CodeGenRegisterClass::contains(const CodeGenRegister *Reg) const { - return llvm::binary_search(Members, Reg, deref>()); + return MemberBV.test(CodeGenRegBank::getRegIndex(Reg)); } unsigned CodeGenRegisterClass::getWeight(const CodeGenRegBank &RegBank) const { @@ -2329,8 +2333,7 @@ void CodeGenRegBank::inferMatchingSuperRegClass( CodeGenRegisterClass *RC, std::list::iterator FirstSubRegRC) { DenseSet ImpliedSubRegIndices; - std::vector> - SubToSuperRegs; + std::vector SubRegs; BitVector TopoSigs(getNumTopoSigs()); // Iterate subregister indices in topological order to visit larger indices @@ -2348,15 +2351,14 @@ void CodeGenRegBank::inferMatchingSuperRegClass( // Build list of (Sub, Super) pairs for this SubIdx, sorted by Sub. Note // that the list may contain entries with the same Sub but different Supers. - SubToSuperRegs.clear(); + SubRegs.clear(); TopoSigs.reset(); for (const CodeGenRegister *Super : RC->getMembers()) { const CodeGenRegister *Sub = Super->getSubRegs().find(SubIdx)->second; assert(Sub && "Missing sub-register"); - SubToSuperRegs.emplace_back(Sub, Super); + SubRegs.push_back(Sub); TopoSigs.set(Sub->getTopoSig()); } - sort(SubToSuperRegs, on_first>>()); // Iterate over sub-register class candidates. Ignore classes created by // this loop. They will never be useful. @@ -2371,16 +2373,10 @@ void CodeGenRegBank::inferMatchingSuperRegClass( // Topological shortcut: SubRC members have the wrong shape. if (!TopoSigs.anyCommon(SubRC.getRegsWithSuperRegsTopoSigs())) continue; - // Compute the subset of RC that maps into SubRC with a single linear scan - // through SubToSuperRegs and the members of SubRC. + // Compute the subset of RC that maps into SubRC. 
CodeGenRegister::Vec SubSetVec; - auto SubI = SubRC.getMembers().begin(), SubE = SubRC.getMembers().end(); - for (auto &[Sub, Super] : SubToSuperRegs) { - while (SubI != SubE && **SubI < *Sub) - ++SubI; - if (SubI == SubE) - break; - if (**SubI == *Sub) + for (const auto &[Sub, Super] : zip_equal(SubRegs, RC->getMembers())) { + if (SubRC.contains(Sub)) SubSetVec.push_back(Super); } @@ -2388,7 +2384,6 @@ void CodeGenRegBank::inferMatchingSuperRegClass( continue; // RC injects completely into SubRC. - sortAndUniqueRegisters(SubSetVec); if (SubSetVec.size() == RC->getMembers().size()) { SubRC.addSuperRegClass(SubIdx, RC); diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h index bbcd44ce2cc5b..5e6fff0f775ea 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.h +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h @@ -315,6 +315,8 @@ inline bool operator==(const CodeGenRegister &A, const CodeGenRegister &B) { class CodeGenRegisterClass { CodeGenRegister::Vec Members; + // Bit mask of members, indexed by getRegIndex. + BitVector MemberBV; // Allocation orders. Order[0] always contains all registers in Members. std::vector> Orders; // Bit mask of sub-classes including this, indexed by their EnumValue. @@ -752,7 +754,7 @@ class CodeGenRegBank { CodeGenRegister *getReg(const Record *); // Get a Register's index into the Registers array. 
- unsigned getRegIndex(const CodeGenRegister *Reg) const { + static unsigned getRegIndex(const CodeGenRegister *Reg) { return Reg->EnumValue - 1; } diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 7d24c0f80cddb..2a311b7ff96b8 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -1644,7 +1644,7 @@ void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS) { for (const CodeGenRegister &Reg : Regs) { const CodeGenRegisterClass *BaseRC = nullptr; for (const CodeGenRegisterClass *RC : BaseClasses) { - if (is_contained(RC->getMembers(), &Reg)) { + if (RC->contains(&Reg)) { BaseRC = RC; break; } From 3a6ef8b359fc3f2459ef60013b8938ebe847831b Mon Sep 17 00:00:00 2001 From: Muhammad Bassiouni <60100307+bassiounix@users.noreply.github.com> Date: Wed, 16 Jul 2025 22:48:29 +0300 Subject: [PATCH 098/813] [libc][math] Refactor exp10 implementation to header-only in src/__support/math folder. 
(#148400) Part of #147386 in preparation for: https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450 --- libc/shared/math.h | 1 + libc/shared/math/exp10.h | 23 + libc/src/__support/FPUtil/PolyEval.h | 2 +- libc/src/__support/FPUtil/double_double.h | 4 +- libc/src/__support/math/CMakeLists.txt | 21 + libc/src/__support/math/exp10.h | 501 ++++++++++++++++++ libc/src/math/generic/CMakeLists.txt | 15 +- libc/src/math/generic/exp10.cpp | 485 +---------------- .../llvm-project-overlay/libc/BUILD.bazel | 31 +- 9 files changed, 572 insertions(+), 511 deletions(-) create mode 100644 libc/shared/math/exp10.h create mode 100644 libc/src/__support/math/exp10.h diff --git a/libc/shared/math.h b/libc/shared/math.h index 3012cbb938816..b37aa46820523 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -12,6 +12,7 @@ #include "libc_common.h" #include "math/exp.h" +#include "math/exp10.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp10.h b/libc/shared/math/exp10.h new file mode 100644 index 0000000000000..3d36d9103705f --- /dev/null +++ b/libc/shared/math/exp10.h @@ -0,0 +1,23 @@ +//===-- Shared exp10 function -----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP10_H +#define LLVM_LIBC_SHARED_MATH_EXP10_H + +#include "shared/libc_common.h" +#include "src/__support/math/exp10.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp10; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_EXP10_H diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h index 41104620ed61d..7bec4e30a9960 100644 --- a/libc/src/__support/FPUtil/PolyEval.h +++ b/libc/src/__support/FPUtil/PolyEval.h @@ -37,7 +37,7 @@ LIBC_INLINE cpp::enable_if_t<(sizeof(T) <= sizeof(void *)), T> polyeval(T, } template -LIBC_INLINE cpp::enable_if_t<(sizeof(T) > sizeof(void *)), T> +LIBC_INLINE static constexpr cpp::enable_if_t<(sizeof(T) > sizeof(void *)), T> polyeval(const T &x, const T &a0, const Ts &...a) { return multiply_add(x, polyeval(x, a...), a0); } diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h index c27885aadc028..8e54e845de493 100644 --- a/libc/src/__support/FPUtil/double_double.h +++ b/libc/src/__support/FPUtil/double_double.h @@ -151,8 +151,8 @@ LIBC_INLINE DoubleDouble quick_mult(double a, const DoubleDouble &b) { } template -LIBC_INLINE DoubleDouble quick_mult(const DoubleDouble &a, - const DoubleDouble &b) { +LIBC_INLINE constexpr DoubleDouble quick_mult(const DoubleDouble &a, + const DoubleDouble &b) { DoubleDouble r = exact_mult(a.hi, b.hi); double t1 = multiply_add(a.hi, b.lo, r.lo); double t2 = multiply_add(a.lo, b.hi, t1); diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index f7ef9e7694fe6..0bfc996c44fc8 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -149,3 +149,24 @@ add_header_library( libc.src.__support.integer_literals 
libc.src.__support.macros.optimization ) + +add_header_library( + exp10 + HDRS + exp10.h + DEPENDS + .exp_constants + .exp_utils + libc.src.__support.CPP.bit + libc.src.__support.CPP.optional + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.FPUtil.triple_double + libc.src.__support.integer_literals + libc.src.__support.macros.optimization +) diff --git a/libc/src/__support/math/exp10.h b/libc/src/__support/math/exp10.h new file mode 100644 index 0000000000000..88748523deb3d --- /dev/null +++ b/libc/src/__support/math/exp10.h @@ -0,0 +1,501 @@ +//===-- Implementation header for exp10 ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_H + +#include "exp_constants.h" // Lookup tables EXP2_MID1 and EXP_M2. +#include "exp_utils.h" // ziv_test_denorm. 
+#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/optional.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/FPUtil/triple_double.h" +#include "src/__support/common.h" +#include "src/__support/integer_literals.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY + +namespace LIBC_NAMESPACE_DECL { + +using fputil::DoubleDouble; +using fputil::TripleDouble; +using Float128 = typename fputil::DyadicFloat<128>; + +using LIBC_NAMESPACE::operator""_u128; + +// log2(10) +static constexpr double LOG2_10 = 0x1.a934f0979a371p+1; + +// -2^-12 * log10(2) +// > a = -2^-12 * log10(2); +// > b = round(a, 32, RN); +// > c = round(a - b, 32, RN); +// > d = round(a - b - c, D, RN); +// Errors < 1.5 * 2^-144 +static constexpr double MLOG10_2_EXP2_M12_HI = -0x1.3441350ap-14; +static constexpr double MLOG10_2_EXP2_M12_MID = 0x1.0c0219dc1da99p-51; + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +static constexpr double MLOG10_2_EXP2_M12_MID_32 = 0x1.0c0219dcp-51; +static constexpr double MLOG10_2_EXP2_M12_LO = 0x1.da994fd20dba2p-87; +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// Error bounds: +// Errors when using double precision. +constexpr double ERR_D = 0x1.8p-63; + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +// Errors when using double-double precision. +static constexpr double ERR_DD = 0x1.8p-99; +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// Polynomial approximations with double precision. Generated by Sollya with: +// > P = fpminimax((10^x - 1)/x, 3, [|D...|], [-2^-14, 2^-14]); +// > P; +// Error bounds: +// | output - (10^dx - 1) / dx | < 2^-52. 
+LIBC_INLINE static double exp10_poly_approx_d(double dx) { + // dx^2 + double dx2 = dx * dx; + double c0 = + fputil::multiply_add(dx, 0x1.53524c73cea6ap+1, 0x1.26bb1bbb55516p+1); + double c1 = + fputil::multiply_add(dx, 0x1.2bd75cc6afc65p+0, 0x1.0470587aa264cp+1); + double p = fputil::multiply_add(dx2, c1, c0); + return p; +} + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +// Polynomial approximation with double-double precision. Generated by Solya +// with: +// > P = fpminimax((10^x - 1)/x, 5, [|DD...|], [-2^-14, 2^-14]); +// Error bounds: +// | output - 10^(dx) | < 2^-101 +static constexpr DoubleDouble exp10_poly_approx_dd(const DoubleDouble &dx) { + // Taylor polynomial. + constexpr DoubleDouble COEFFS[] = { + {0, 0x1p0}, + {-0x1.f48ad494e927bp-53, 0x1.26bb1bbb55516p1}, + {-0x1.e2bfab3191cd2p-53, 0x1.53524c73cea69p1}, + {0x1.80fb65ec3b503p-53, 0x1.0470591de2ca4p1}, + {0x1.338fc05e21e55p-54, 0x1.2bd7609fd98c4p0}, + {0x1.d4ea116818fbp-56, 0x1.1429ffd519865p-1}, + {-0x1.872a8ff352077p-57, 0x1.a7ed70847c8b3p-3}, + + }; + + DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2], + COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]); + return p; +} + +// Polynomial approximation with 128-bit precision: +// Return exp(dx) ~ 1 + a0 * dx + a1 * dx^2 + ... + a6 * dx^7 +// For |dx| < 2^-14: +// | output - 10^dx | < 1.5 * 2^-124. 
+static constexpr Float128 exp10_poly_approx_f128(const Float128 &dx) { + constexpr Float128 COEFFS_128[]{ + {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 + {Sign::POS, -126, 0x935d8ddd'aaa8ac16'ea56d62b'82d30a2d_u128}, + {Sign::POS, -126, 0xa9a92639'e753443a'80a99ce7'5f4d5bdb_u128}, + {Sign::POS, -126, 0x82382c8e'f1652304'6a4f9d7d'bf6c9635_u128}, + {Sign::POS, -124, 0x12bd7609'fd98c44c'34578701'9216c7af_u128}, + {Sign::POS, -127, 0x450a7ff4'7535d889'cc41ed7e'0d27aee5_u128}, + {Sign::POS, -130, 0xd3f6b844'702d636b'8326bb91'a6e7601d_u128}, + {Sign::POS, -130, 0x45b937f0'd05bb1cd'fa7b46df'314112a9_u128}, + }; + + Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2], + COEFFS_128[3], COEFFS_128[4], COEFFS_128[5], + COEFFS_128[6], COEFFS_128[7]); + return p; +} + +// Compute 10^(x) using 128-bit precision. +// TODO(lntue): investigate triple-double precision implementation for this +// step. +static Float128 exp10_f128(double x, double kd, int idx1, int idx2) { + double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact + double t2 = kd * MLOG10_2_EXP2_M12_MID_32; // exact + double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-144 + + Float128 dx = fputil::quick_add( + Float128(t1), fputil::quick_add(Float128(t2), Float128(t3))); + + // TODO: Skip recalculating exp_mid1 and exp_mid2. + Float128 exp_mid1 = + fputil::quick_add(Float128(EXP2_MID1[idx1].hi), + fputil::quick_add(Float128(EXP2_MID1[idx1].mid), + Float128(EXP2_MID1[idx1].lo))); + + Float128 exp_mid2 = + fputil::quick_add(Float128(EXP2_MID2[idx2].hi), + fputil::quick_add(Float128(EXP2_MID2[idx2].mid), + Float128(EXP2_MID2[idx2].lo))); + + Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2); + + Float128 p = exp10_poly_approx_f128(dx); + + Float128 r = fputil::quick_mul(exp_mid, p); + + r.exponent += static_cast(kd) >> 12; + + return r; +} + +// Compute 10^x with double-double precision. 
+static DoubleDouble exp10_double_double(double x, double kd, + const DoubleDouble &exp_mid) { + // Recalculate dx: + // dx = x - k * 2^-12 * log10(2) + double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact + double t2 = kd * MLOG10_2_EXP2_M12_MID_32; // exact + double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-140 + + DoubleDouble dx = fputil::exact_add(t1, t2); + dx.lo += t3; + + // Degree-6 polynomial approximation in double-double precision. + // | p - 10^x | < 2^-103. + DoubleDouble p = exp10_poly_approx_dd(dx); + + // Error bounds: 2^-102. + DoubleDouble r = fputil::quick_mult(exp_mid, p); + + return r; +} +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +// When output is denormal. +static double exp10_denorm(double x) { + // Range reduction. + double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21); + int k = static_cast(cpp::bit_cast(tmp) >> 19); + double kd = static_cast(k); + + uint32_t idx1 = (k >> 6) & 0x3f; + uint32_t idx2 = k & 0x3f; + + int hi = k >> 12; + + DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; + DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; + DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); + + // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14 + double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact + double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h); + + double mid_lo = dx * exp_mid.hi; + + // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. 
+ double p = exp10_poly_approx_d(dx); + + double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + return ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D) + .value(); +#else + if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); + LIBC_LIKELY(r.has_value())) + return r.value(); + + // Use double-double + DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); + + if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); + LIBC_LIKELY(r.has_value())) + return r.value(); + + // Use 128-bit precision + Float128 r_f128 = exp10_f128(x, kd, idx1, idx2); + + return static_cast(r_f128); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS +} + +// Check for exceptional cases when: +// * log10(1 - 2^-54) < x < log10(1 + 2^-53) +// * x >= log10(2^1024) +// * x <= log10(2^-1022) +// * x is inf or nan +static constexpr double exp10_set_exceptional(double x) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + + uint64_t x_u = xbits.uintval(); + uint64_t x_abs = xbits.abs().uintval(); + + // |x| < log10(1 + 2^-53) + if (x_abs <= 0x3c8bcb7b1526e50e) { + // 10^(x) ~ 1 + x/2 + return fputil::multiply_add(x, 0.5, 1.0); + } + + // x <= log10(2^-1022) || x >= log10(2^1024) or inf/nan. 
+ if (x_u >= 0xc0733a7146f72a42) { + // x <= log10(2^-1075) or -inf/nan + if (x_u > 0xc07439b746e36b52) { + // exp(-Inf) = 0 + if (xbits.is_inf()) + return 0.0; + + // exp(nan) = nan + if (xbits.is_nan()) + return x; + + if (fputil::quick_get_round() == FE_UPWARD) + return FPBits::min_subnormal().get_val(); + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_UNDERFLOW); + return 0.0; + } + + return exp10_denorm(x); + } + + // x >= log10(2^1024) or +inf/nan + // x is finite + if (x_u < 0x7ff0'0000'0000'0000ULL) { + int rounding = fputil::quick_get_round(); + if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) + return FPBits::max_normal().get_val(); + + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW); + } + // x is +inf or nan + return x + FPBits::inf().get_val(); +} + +namespace math { + +static constexpr double exp10(double x) { + using FPBits = typename fputil::FPBits; + FPBits xbits(x); + + uint64_t x_u = xbits.uintval(); + + // x <= log10(2^-1022) or x >= log10(2^1024) or + // log10(1 - 2^-54) < x < log10(1 + 2^-53). + if (LIBC_UNLIKELY(x_u >= 0xc0733a7146f72a42 || + (x_u <= 0xbc7bcb7b1526e50e && x_u >= 0x40734413509f79ff) || + x_u < 0x3c8bcb7b1526e50e)) { + return exp10_set_exceptional(x); + } + + // Now log10(2^-1075) < x <= log10(1 - 2^-54) or + // log10(1 + 2^-53) < x < log10(2^1024) + + // Range reduction: + // Let x = log10(2) * (hi + mid1 + mid2) + lo + // in which: + // hi is an integer + // mid1 * 2^6 is an integer + // mid2 * 2^12 is an integer + // then: + // 10^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 10^(lo). + // With this formula: + // - multiplying by 2^hi is exact and cheap, simply by adding the exponent + // field. + // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables. + // - 10^(lo) ~ 1 + a0*lo + a1 * lo^2 + ... + // + // We compute (hi + mid1 + mid2) together by perform the rounding on + // x * log2(10) * 2^12. 
+ // Since |x| < |log10(2^-1075)| < 2^9, + // |x * 2^12| < 2^9 * 2^12 < 2^21, + // So we can fit the rounded result round(x * 2^12) in int32_t. + // Thus, the goal is to be able to use an additional addition and fixed width + // shift to get an int32_t representing round(x * 2^12). + // + // Assuming int32_t using 2-complement representation, since the mantissa part + // of a double precision is unsigned with the leading bit hidden, if we add an + // extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 2^23 to the product, the + // part that are < 2^e2 in resulted mantissa of (x*2^12*L2E + C) can be + // considered as a proper 2-complement representations of x*2^12. + // + // One small problem with this approach is that the sum (x*2^12 + C) in + // double precision is rounded to the least significant bit of the dorminant + // factor C. In order to minimize the rounding errors from this addition, we + // want to minimize e1. Another constraint that we want is that after + // shifting the mantissa so that the least significant bit of int32_t + // corresponds to the unit bit of (x*2^12*L2E), the sign is correct without + // any adjustment. So combining these 2 requirements, we can choose + // C = 2^33 + 2^32, so that the sign bit corresponds to 2^31 bit, and hence + // after right shifting the mantissa, the resulting int32_t has correct sign. + // With this choice of C, the number of mantissa bits we need to shift to the + // right is: 52 - 33 = 19. + // + // Moreover, since the integer right shifts are equivalent to rounding down, + // we can add an extra 0.5 so that it will become round-to-nearest, tie-to- + // +infinity. 
So in particular, we can compute: + // hmm = x * 2^12 + C, + // where C = 2^33 + 2^32 + 2^-1, then if + // k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19), + // the reduced argument: + // lo = x - log10(2) * 2^-12 * k is bounded by: + // |lo| = |x - log10(2) * 2^-12 * k| + // = log10(2) * 2^-12 * | x * log2(10) * 2^12 - k | + // <= log10(2) * 2^-12 * (2^-1 + 2^-19) + // < 1.5 * 2^-2 * (2^-13 + 2^-31) + // = 1.5 * (2^-15 * 2^-31) + // + // Finally, notice that k only uses the mantissa of x * 2^12, so the + // exponent 2^12 is not needed. So we can simply define + // C = 2^(33 - 12) + 2^(32 - 12) + 2^(-13 - 12), and + // k = int32_t(lower 51 bits of double(x + C) >> 19). + + // Rounding errors <= 2^-31. + double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21); + int k = static_cast(cpp::bit_cast(tmp) >> 19); + double kd = static_cast(k); + + uint32_t idx1 = (k >> 6) & 0x3f; + uint32_t idx2 = k & 0x3f; + + int hi = k >> 12; + + DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; + DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; + DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); + + // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14 + double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact + double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h); + + // We use the degree-4 polynomial to approximate 10^(lo): + // 10^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4 + // = 1 + lo * P(lo) + // So that the errors are bounded by: + // |P(lo) - (10^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58 + // Let P_ be an evaluation of P where all intermediate computations are in + // double precision. Using either Horner's or Estrin's schemes, the evaluated + // errors can be bounded by: + // |P_(lo) - P(lo)| < 2^-51 + // => |lo * P_(lo) - (2^lo - 1) | < 2^-65 + // => 2^(mid1 + mid2) * |lo * P_(lo) - expm1(lo)| < 2^-64. 
+ // Since we approximate + // 2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo, + // We use the expression: + // (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~ + // ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo) + // with errors bounded by 2^-64. + + double mid_lo = dx * exp_mid.hi; + + // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. + double p = exp10_poly_approx_d(dx); + + double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; + double r = + cpp::bit_cast(exp_hi + cpp::bit_cast(exp_mid.hi + lo)); + return r; +#else + double upper = exp_mid.hi + (lo + ERR_D); + double lower = exp_mid.hi + (lo - ERR_D); + + if (LIBC_LIKELY(upper == lower)) { + // To multiply by 2^hi, a fast way is to simply add hi to the exponent + // field. + int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; + double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper)); + return r; + } + + // Exact outputs when x = 1, 2, ..., 22 + hard to round with x = 23. + // Quick check mask: 0x800f'ffffU = ~(bits of 1.0 | ... 
| bits of 23.0) + if (LIBC_UNLIKELY((x_u & 0x8000'ffff'ffff'ffffULL) == 0ULL)) { + switch (x_u) { + case 0x3ff0000000000000: // x = 1.0 + return 10.0; + case 0x4000000000000000: // x = 2.0 + return 100.0; + case 0x4008000000000000: // x = 3.0 + return 1'000.0; + case 0x4010000000000000: // x = 4.0 + return 10'000.0; + case 0x4014000000000000: // x = 5.0 + return 100'000.0; + case 0x4018000000000000: // x = 6.0 + return 1'000'000.0; + case 0x401c000000000000: // x = 7.0 + return 10'000'000.0; + case 0x4020000000000000: // x = 8.0 + return 100'000'000.0; + case 0x4022000000000000: // x = 9.0 + return 1'000'000'000.0; + case 0x4024000000000000: // x = 10.0 + return 10'000'000'000.0; + case 0x4026000000000000: // x = 11.0 + return 100'000'000'000.0; + case 0x4028000000000000: // x = 12.0 + return 1'000'000'000'000.0; + case 0x402a000000000000: // x = 13.0 + return 10'000'000'000'000.0; + case 0x402c000000000000: // x = 14.0 + return 100'000'000'000'000.0; + case 0x402e000000000000: // x = 15.0 + return 1'000'000'000'000'000.0; + case 0x4030000000000000: // x = 16.0 + return 10'000'000'000'000'000.0; + case 0x4031000000000000: // x = 17.0 + return 100'000'000'000'000'000.0; + case 0x4032000000000000: // x = 18.0 + return 1'000'000'000'000'000'000.0; + case 0x4033000000000000: // x = 19.0 + return 10'000'000'000'000'000'000.0; + case 0x4034000000000000: // x = 20.0 + return 100'000'000'000'000'000'000.0; + case 0x4035000000000000: // x = 21.0 + return 1'000'000'000'000'000'000'000.0; + case 0x4036000000000000: // x = 22.0 + return 10'000'000'000'000'000'000'000.0; + case 0x4037000000000000: // x = 23.0 + return 0x1.52d02c7e14af6p76 + x; + } + } + + // Use double-double + DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); + + double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); + double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); + + if (LIBC_LIKELY(upper_dd == lower_dd)) { + // To multiply by 2^hi, a fast way is to simply add hi to the exponent + // field. 
+ int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; + double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper_dd)); + return r; + } + + // Use 128-bit precision + Float128 r_f128 = exp10_f128(x, kd, idx1, idx2); + + return static_cast(r_f128); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index b59beacd94143..352c2ad4ab22a 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1457,20 +1457,7 @@ add_entrypoint_object( HDRS ../exp10.h DEPENDS - .common_constants - .explogxf - libc.src.__support.CPP.bit - libc.src.__support.CPP.optional - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.FPUtil.triple_double - libc.src.__support.integer_literals - libc.src.__support.macros.optimization + libc.src.__support.math.exp10 libc.src.errno.errno ) diff --git a/libc/src/math/generic/exp10.cpp b/libc/src/math/generic/exp10.cpp index c464979b092c3..5c36d28c166ae 100644 --- a/libc/src/math/generic/exp10.cpp +++ b/libc/src/math/generic/exp10.cpp @@ -7,491 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/exp10.h" -#include "common_constants.h" // Lookup tables EXP2_MID1 and EXP_M2. -#include "explogxf.h" // ziv_test_denorm. 
-#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/optional.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/dyadic_float.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/FPUtil/triple_double.h" -#include "src/__support/common.h" -#include "src/__support/integer_literals.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/exp10.h" namespace LIBC_NAMESPACE_DECL { -using fputil::DoubleDouble; -using fputil::TripleDouble; -using Float128 = typename fputil::DyadicFloat<128>; - -using LIBC_NAMESPACE::operator""_u128; - -// log2(10) -constexpr double LOG2_10 = 0x1.a934f0979a371p+1; - -// -2^-12 * log10(2) -// > a = -2^-12 * log10(2); -// > b = round(a, 32, RN); -// > c = round(a - b, 32, RN); -// > d = round(a - b - c, D, RN); -// Errors < 1.5 * 2^-144 -constexpr double MLOG10_2_EXP2_M12_HI = -0x1.3441350ap-14; -constexpr double MLOG10_2_EXP2_M12_MID = 0x1.0c0219dc1da99p-51; - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -constexpr double MLOG10_2_EXP2_M12_MID_32 = 0x1.0c0219dcp-51; -constexpr double MLOG10_2_EXP2_M12_LO = 0x1.da994fd20dba2p-87; -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -// Error bounds: -// Errors when using double precision. -constexpr double ERR_D = 0x1.8p-63; - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -// Errors when using double-double precision. -constexpr double ERR_DD = 0x1.8p-99; -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -namespace { - -// Polynomial approximations with double precision. Generated by Sollya with: -// > P = fpminimax((10^x - 1)/x, 3, [|D...|], [-2^-14, 2^-14]); -// > P; -// Error bounds: -// | output - (10^dx - 1) / dx | < 2^-52. 
-LIBC_INLINE double poly_approx_d(double dx) { - // dx^2 - double dx2 = dx * dx; - double c0 = - fputil::multiply_add(dx, 0x1.53524c73cea6ap+1, 0x1.26bb1bbb55516p+1); - double c1 = - fputil::multiply_add(dx, 0x1.2bd75cc6afc65p+0, 0x1.0470587aa264cp+1); - double p = fputil::multiply_add(dx2, c1, c0); - return p; -} - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -// Polynomial approximation with double-double precision. Generated by Solya -// with: -// > P = fpminimax((10^x - 1)/x, 5, [|DD...|], [-2^-14, 2^-14]); -// Error bounds: -// | output - 10^(dx) | < 2^-101 -DoubleDouble poly_approx_dd(const DoubleDouble &dx) { - // Taylor polynomial. - constexpr DoubleDouble COEFFS[] = { - {0, 0x1p0}, - {-0x1.f48ad494e927bp-53, 0x1.26bb1bbb55516p1}, - {-0x1.e2bfab3191cd2p-53, 0x1.53524c73cea69p1}, - {0x1.80fb65ec3b503p-53, 0x1.0470591de2ca4p1}, - {0x1.338fc05e21e55p-54, 0x1.2bd7609fd98c4p0}, - {0x1.d4ea116818fbp-56, 0x1.1429ffd519865p-1}, - {-0x1.872a8ff352077p-57, 0x1.a7ed70847c8b3p-3}, - - }; - - DoubleDouble p = fputil::polyeval(dx, COEFFS[0], COEFFS[1], COEFFS[2], - COEFFS[3], COEFFS[4], COEFFS[5], COEFFS[6]); - return p; -} - -// Polynomial approximation with 128-bit precision: -// Return exp(dx) ~ 1 + a0 * dx + a1 * dx^2 + ... + a6 * dx^7 -// For |dx| < 2^-14: -// | output - 10^dx | < 1.5 * 2^-124. 
-Float128 poly_approx_f128(const Float128 &dx) { - constexpr Float128 COEFFS_128[]{ - {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, // 1.0 - {Sign::POS, -126, 0x935d8ddd'aaa8ac16'ea56d62b'82d30a2d_u128}, - {Sign::POS, -126, 0xa9a92639'e753443a'80a99ce7'5f4d5bdb_u128}, - {Sign::POS, -126, 0x82382c8e'f1652304'6a4f9d7d'bf6c9635_u128}, - {Sign::POS, -124, 0x12bd7609'fd98c44c'34578701'9216c7af_u128}, - {Sign::POS, -127, 0x450a7ff4'7535d889'cc41ed7e'0d27aee5_u128}, - {Sign::POS, -130, 0xd3f6b844'702d636b'8326bb91'a6e7601d_u128}, - {Sign::POS, -130, 0x45b937f0'd05bb1cd'fa7b46df'314112a9_u128}, - }; - - Float128 p = fputil::polyeval(dx, COEFFS_128[0], COEFFS_128[1], COEFFS_128[2], - COEFFS_128[3], COEFFS_128[4], COEFFS_128[5], - COEFFS_128[6], COEFFS_128[7]); - return p; -} - -// Compute 10^(x) using 128-bit precision. -// TODO(lntue): investigate triple-double precision implementation for this -// step. -Float128 exp10_f128(double x, double kd, int idx1, int idx2) { - double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact - double t2 = kd * MLOG10_2_EXP2_M12_MID_32; // exact - double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-144 - - Float128 dx = fputil::quick_add( - Float128(t1), fputil::quick_add(Float128(t2), Float128(t3))); - - // TODO: Skip recalculating exp_mid1 and exp_mid2. - Float128 exp_mid1 = - fputil::quick_add(Float128(EXP2_MID1[idx1].hi), - fputil::quick_add(Float128(EXP2_MID1[idx1].mid), - Float128(EXP2_MID1[idx1].lo))); - - Float128 exp_mid2 = - fputil::quick_add(Float128(EXP2_MID2[idx2].hi), - fputil::quick_add(Float128(EXP2_MID2[idx2].mid), - Float128(EXP2_MID2[idx2].lo))); - - Float128 exp_mid = fputil::quick_mul(exp_mid1, exp_mid2); - - Float128 p = poly_approx_f128(dx); - - Float128 r = fputil::quick_mul(exp_mid, p); - - r.exponent += static_cast(kd) >> 12; - - return r; -} - -// Compute 10^x with double-double precision. 
-DoubleDouble exp10_double_double(double x, double kd, - const DoubleDouble &exp_mid) { - // Recalculate dx: - // dx = x - k * 2^-12 * log10(2) - double t1 = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact - double t2 = kd * MLOG10_2_EXP2_M12_MID_32; // exact - double t3 = kd * MLOG10_2_EXP2_M12_LO; // Error < 2^-140 - - DoubleDouble dx = fputil::exact_add(t1, t2); - dx.lo += t3; - - // Degree-6 polynomial approximation in double-double precision. - // | p - 10^x | < 2^-103. - DoubleDouble p = poly_approx_dd(dx); - - // Error bounds: 2^-102. - DoubleDouble r = fputil::quick_mult(exp_mid, p); - - return r; -} -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -// When output is denormal. -double exp10_denorm(double x) { - // Range reduction. - double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21); - int k = static_cast(cpp::bit_cast(tmp) >> 19); - double kd = static_cast(k); - - uint32_t idx1 = (k >> 6) & 0x3f; - uint32_t idx2 = k & 0x3f; - - int hi = k >> 12; - - DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; - DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; - DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); - - // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14 - double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact - double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h); - - double mid_lo = dx * exp_mid.hi; - - // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. 
- double p = poly_approx_d(dx); - - double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - return ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D) - .value(); -#else - if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); - LIBC_LIKELY(r.has_value())) - return r.value(); - - // Use double-double - DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); - - if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); - LIBC_LIKELY(r.has_value())) - return r.value(); - - // Use 128-bit precision - Float128 r_f128 = exp10_f128(x, kd, idx1, idx2); - - return static_cast(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} - -// Check for exceptional cases when: -// * log10(1 - 2^-54) < x < log10(1 + 2^-53) -// * x >= log10(2^1024) -// * x <= log10(2^-1022) -// * x is inf or nan -double set_exceptional(double x) { - using FPBits = typename fputil::FPBits; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - uint64_t x_abs = xbits.abs().uintval(); - - // |x| < log10(1 + 2^-53) - if (x_abs <= 0x3c8bcb7b1526e50e) { - // 10^(x) ~ 1 + x/2 - return fputil::multiply_add(x, 0.5, 1.0); - } - - // x <= log10(2^-1022) || x >= log10(2^1024) or inf/nan. 
- if (x_u >= 0xc0733a7146f72a42) { - // x <= log10(2^-1075) or -inf/nan - if (x_u > 0xc07439b746e36b52) { - // exp(-Inf) = 0 - if (xbits.is_inf()) - return 0.0; - - // exp(nan) = nan - if (xbits.is_nan()) - return x; - - if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_subnormal().get_val(); - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_UNDERFLOW); - return 0.0; - } - - return exp10_denorm(x); - } - - // x >= log10(2^1024) or +inf/nan - // x is finite - if (x_u < 0x7ff0'0000'0000'0000ULL) { - int rounding = fputil::quick_get_round(); - if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal().get_val(); - - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW); - } - // x is +inf or nan - return x + FPBits::inf().get_val(); -} - -} // namespace - -LLVM_LIBC_FUNCTION(double, exp10, (double x)) { - using FPBits = typename fputil::FPBits; - FPBits xbits(x); - - uint64_t x_u = xbits.uintval(); - - // x <= log10(2^-1022) or x >= log10(2^1024) or - // log10(1 - 2^-54) < x < log10(1 + 2^-53). - if (LIBC_UNLIKELY(x_u >= 0xc0733a7146f72a42 || - (x_u <= 0xbc7bcb7b1526e50e && x_u >= 0x40734413509f79ff) || - x_u < 0x3c8bcb7b1526e50e)) { - return set_exceptional(x); - } - - // Now log10(2^-1075) < x <= log10(1 - 2^-54) or - // log10(1 + 2^-53) < x < log10(2^1024) - - // Range reduction: - // Let x = log10(2) * (hi + mid1 + mid2) + lo - // in which: - // hi is an integer - // mid1 * 2^6 is an integer - // mid2 * 2^12 is an integer - // then: - // 10^(x) = 2^hi * 2^(mid1) * 2^(mid2) * 10^(lo). - // With this formula: - // - multiplying by 2^hi is exact and cheap, simply by adding the exponent - // field. - // - 2^(mid1) and 2^(mid2) are stored in 2 x 64-element tables. - // - 10^(lo) ~ 1 + a0*lo + a1 * lo^2 + ... - // - // We compute (hi + mid1 + mid2) together by perform the rounding on - // x * log2(10) * 2^12. 
- // Since |x| < |log10(2^-1075)| < 2^9, - // |x * 2^12| < 2^9 * 2^12 < 2^21, - // So we can fit the rounded result round(x * 2^12) in int32_t. - // Thus, the goal is to be able to use an additional addition and fixed width - // shift to get an int32_t representing round(x * 2^12). - // - // Assuming int32_t using 2-complement representation, since the mantissa part - // of a double precision is unsigned with the leading bit hidden, if we add an - // extra constant C = 2^e1 + 2^e2 with e1 > e2 >= 2^23 to the product, the - // part that are < 2^e2 in resulted mantissa of (x*2^12*L2E + C) can be - // considered as a proper 2-complement representations of x*2^12. - // - // One small problem with this approach is that the sum (x*2^12 + C) in - // double precision is rounded to the least significant bit of the dorminant - // factor C. In order to minimize the rounding errors from this addition, we - // want to minimize e1. Another constraint that we want is that after - // shifting the mantissa so that the least significant bit of int32_t - // corresponds to the unit bit of (x*2^12*L2E), the sign is correct without - // any adjustment. So combining these 2 requirements, we can choose - // C = 2^33 + 2^32, so that the sign bit corresponds to 2^31 bit, and hence - // after right shifting the mantissa, the resulting int32_t has correct sign. - // With this choice of C, the number of mantissa bits we need to shift to the - // right is: 52 - 33 = 19. - // - // Moreover, since the integer right shifts are equivalent to rounding down, - // we can add an extra 0.5 so that it will become round-to-nearest, tie-to- - // +infinity. 
So in particular, we can compute: - // hmm = x * 2^12 + C, - // where C = 2^33 + 2^32 + 2^-1, then if - // k = int32_t(lower 51 bits of double(x * 2^12 + C) >> 19), - // the reduced argument: - // lo = x - log10(2) * 2^-12 * k is bounded by: - // |lo| = |x - log10(2) * 2^-12 * k| - // = log10(2) * 2^-12 * | x * log2(10) * 2^12 - k | - // <= log10(2) * 2^-12 * (2^-1 + 2^-19) - // < 1.5 * 2^-2 * (2^-13 + 2^-31) - // = 1.5 * (2^-15 * 2^-31) - // - // Finally, notice that k only uses the mantissa of x * 2^12, so the - // exponent 2^12 is not needed. So we can simply define - // C = 2^(33 - 12) + 2^(32 - 12) + 2^(-13 - 12), and - // k = int32_t(lower 51 bits of double(x + C) >> 19). - - // Rounding errors <= 2^-31. - double tmp = fputil::multiply_add(x, LOG2_10, 0x1.8000'0000'4p21); - int k = static_cast(cpp::bit_cast(tmp) >> 19); - double kd = static_cast(k); - - uint32_t idx1 = (k >> 6) & 0x3f; - uint32_t idx2 = k & 0x3f; - - int hi = k >> 12; - - DoubleDouble exp_mid1{EXP2_MID1[idx1].mid, EXP2_MID1[idx1].hi}; - DoubleDouble exp_mid2{EXP2_MID2[idx2].mid, EXP2_MID2[idx2].hi}; - DoubleDouble exp_mid = fputil::quick_mult(exp_mid1, exp_mid2); - - // |dx| < 1.5 * 2^-15 + 2^-31 < 2^-14 - double lo_h = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_HI, x); // exact - double dx = fputil::multiply_add(kd, MLOG10_2_EXP2_M12_MID, lo_h); - - // We use the degree-4 polynomial to approximate 10^(lo): - // 10^(lo) ~ 1 + a0 * lo + a1 * lo^2 + a2 * lo^3 + a3 * lo^4 - // = 1 + lo * P(lo) - // So that the errors are bounded by: - // |P(lo) - (10^lo - 1)/lo| < |lo|^4 / 64 < 2^(-13 * 4) / 64 = 2^-58 - // Let P_ be an evaluation of P where all intermediate computations are in - // double precision. Using either Horner's or Estrin's schemes, the evaluated - // errors can be bounded by: - // |P_(lo) - P(lo)| < 2^-51 - // => |lo * P_(lo) - (2^lo - 1) | < 2^-65 - // => 2^(mid1 + mid2) * |lo * P_(lo) - expm1(lo)| < 2^-64. 
- // Since we approximate - // 2^(mid1 + mid2) ~ exp_mid.hi + exp_mid.lo, - // We use the expression: - // (exp_mid.hi + exp_mid.lo) * (1 + dx * P_(dx)) ~ - // ~ exp_mid.hi + (exp_mid.hi * dx * P_(dx) + exp_mid.lo) - // with errors bounded by 2^-64. - - double mid_lo = dx * exp_mid.hi; - - // Approximate (10^dx - 1)/dx ~ 1 + a0*dx + a1*dx^2 + a2*dx^3 + a3*dx^4. - double p = poly_approx_d(dx); - - double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; - double r = - cpp::bit_cast(exp_hi + cpp::bit_cast(exp_mid.hi + lo)); - return r; -#else - double upper = exp_mid.hi + (lo + ERR_D); - double lower = exp_mid.hi + (lo - ERR_D); - - if (LIBC_LIKELY(upper == lower)) { - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. - int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; - double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper)); - return r; - } - - // Exact outputs when x = 1, 2, ..., 22 + hard to round with x = 23. - // Quick check mask: 0x800f'ffffU = ~(bits of 1.0 | ... 
| bits of 23.0) - if (LIBC_UNLIKELY((x_u & 0x8000'ffff'ffff'ffffULL) == 0ULL)) { - switch (x_u) { - case 0x3ff0000000000000: // x = 1.0 - return 10.0; - case 0x4000000000000000: // x = 2.0 - return 100.0; - case 0x4008000000000000: // x = 3.0 - return 1'000.0; - case 0x4010000000000000: // x = 4.0 - return 10'000.0; - case 0x4014000000000000: // x = 5.0 - return 100'000.0; - case 0x4018000000000000: // x = 6.0 - return 1'000'000.0; - case 0x401c000000000000: // x = 7.0 - return 10'000'000.0; - case 0x4020000000000000: // x = 8.0 - return 100'000'000.0; - case 0x4022000000000000: // x = 9.0 - return 1'000'000'000.0; - case 0x4024000000000000: // x = 10.0 - return 10'000'000'000.0; - case 0x4026000000000000: // x = 11.0 - return 100'000'000'000.0; - case 0x4028000000000000: // x = 12.0 - return 1'000'000'000'000.0; - case 0x402a000000000000: // x = 13.0 - return 10'000'000'000'000.0; - case 0x402c000000000000: // x = 14.0 - return 100'000'000'000'000.0; - case 0x402e000000000000: // x = 15.0 - return 1'000'000'000'000'000.0; - case 0x4030000000000000: // x = 16.0 - return 10'000'000'000'000'000.0; - case 0x4031000000000000: // x = 17.0 - return 100'000'000'000'000'000.0; - case 0x4032000000000000: // x = 18.0 - return 1'000'000'000'000'000'000.0; - case 0x4033000000000000: // x = 19.0 - return 10'000'000'000'000'000'000.0; - case 0x4034000000000000: // x = 20.0 - return 100'000'000'000'000'000'000.0; - case 0x4035000000000000: // x = 21.0 - return 1'000'000'000'000'000'000'000.0; - case 0x4036000000000000: // x = 22.0 - return 10'000'000'000'000'000'000'000.0; - case 0x4037000000000000: // x = 23.0 - return 0x1.52d02c7e14af6p76 + x; - } - } - - // Use double-double - DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); - - double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); - double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); - - if (LIBC_LIKELY(upper_dd == lower_dd)) { - // To multiply by 2^hi, a fast way is to simply add hi to the exponent - // field. 
- int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; - double r = cpp::bit_cast(exp_hi + cpp::bit_cast(upper_dd)); - return r; - } - - // Use 128-bit precision - Float128 r_f128 = exp10_f128(x, kd, idx1, idx2); - - return static_cast(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} +LLVM_LIBC_FUNCTION(double, exp10, (double x)) { return math::exp10(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index fe843d3207ceb..8b60ca13562f6 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2245,6 +2245,24 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_math_exp10", + hdrs = ["src/__support/math/exp10.h"], + deps = [ + ":__support_math_exp_constants", + ":__support_math_exp_utils", + ":__support_fputil_double_double", + ":__support_fputil_dyadic_float", + ":__support_fputil_multiply_add", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_fputil_rounding_mode", + ":__support_fputil_triple_double", + ":__support_integer_literals", + ":__support_macros_optimization", + ], +) + ############################### complex targets ################################ libc_function( @@ -2849,17 +2867,8 @@ libc_math_function( libc_math_function( name = "exp10", additional_deps = [ - ":__support_fputil_double_double", - ":__support_fputil_dyadic_float", - ":__support_fputil_multiply_add", - ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", - ":__support_fputil_rounding_mode", - ":__support_fputil_triple_double", - ":__support_integer_literals", - ":__support_macros_optimization", - ":common_constants", - ":explogxf", + ":__support_math_exp10", + ":errno", ], ) From 94382c8e56e878d1b6a8cf317e3632df5352e23e Mon Sep 17 00:00:00 2001 From: Leonard Chan Date: Wed, 16 Jul 2025 13:09:04 -0700 Subject: [PATCH 099/813] 
[llvm][StackProtector] Add noreturn to __stack_chk_fail call (#143976) This is a reland for 99e53cb4139eda491f97cb33ee42ea424d352200 with the appropriate test fixes. It's possible for __stack_chk_fail to be an alias when using CrossDSOCFI since it will make a jump table entry for this function and replace it with an alias. StackProtector can crash since it always expects this to be a regular function. Instead add the noreturn attribute to the call. --- llvm/lib/CodeGen/StackProtector.cpp | 4 +-- .../cross-dso-cfi-stack-chk-fail.ll | 34 +++++++++++++++++++ .../StackProtector/stack-chk-fail-alias.ll | 22 ++++++++++++ 3 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll create mode 100644 llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 9cc9af88c5e4f..b79911bcf3c49 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -731,8 +731,8 @@ BasicBlock *CreateFailBB(Function *F, const TargetLowering &TLI) { } if (StackChkFail) { - cast(StackChkFail.getCallee())->addFnAttr(Attribute::NoReturn); - B.CreateCall(StackChkFail, Args); + CallInst *Call = B.CreateCall(StackChkFail, Args); + Call->addFnAttr(Attribute::NoReturn); } B.CreateUnreachable(); diff --git a/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll b/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll new file mode 100644 index 0000000000000..9a102768b1277 --- /dev/null +++ b/llvm/test/Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll @@ -0,0 +1,34 @@ +;; This is a minimal reproducer that caused StackProtector to crash with a bad cast when +;; CrossDSOCFI is used. This test just needs to not crash. 
+; REQUIRES: x86-registered-target +; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=lowertypetests,cross-dso-cfi,stack-protector + +define hidden void @__stack_chk_fail() !type !1{ + unreachable +} + +define void @store_captures() sspstrong { +entry: + %a = alloca i32, align 4 + %j = alloca ptr, align 8 + store ptr %a, ptr %j, align 8 + ret void +} + +define void @func(ptr %0) { +entry: + %1 = call i1 @llvm.type.test(ptr %0, metadata !"typeid") + br i1 %1, label %cont, label %trap + +trap: ; preds = %entry + call void @llvm.trap() + unreachable + +cont: ; preds = %entry + call void %0() + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"Cross-DSO CFI", i32 1} +!1 = !{i64 0, !"typeid"} diff --git a/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll b/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll new file mode 100644 index 0000000000000..def3e014797de --- /dev/null +++ b/llvm/test/Transforms/StackProtector/stack-chk-fail-alias.ll @@ -0,0 +1,22 @@ +;; __stack_chk_fail should have the noreturn attr even if it is an alias +; REQUIRES: x86-registered-target +; RUN: opt -mtriple=x86_64-pc-linux-gnu %s -passes=stack-protector -S | FileCheck %s + +define hidden void @__stack_chk_fail_impl() { + unreachable +} + +@__stack_chk_fail = hidden alias void (), ptr @__stack_chk_fail_impl + +; CHECK-LABEL: @store_captures( +; CHECK: CallStackCheckFailBlk: +; CHECK-NEXT: call void @__stack_chk_fail() [[ATTRS:#.*]] +define void @store_captures() sspstrong { +entry: + %a = alloca i32, align 4 + %j = alloca ptr, align 8 + store ptr %a, ptr %j, align 8 + ret void +} + +; CHECK: attributes [[ATTRS]] = { noreturn } From ececa877083fcbe19aa0394b280630b9d807cd6d Mon Sep 17 00:00:00 2001 From: "Mikhail R. Gadelha" Date: Wed, 16 Jul 2025 17:25:27 -0300 Subject: [PATCH 100/813] [RISCV][VLOPT] Add support for vrgather (#148249) This PR adds support for the vrgather.vi, vrgather.vx, vrgather.vv, vrgatherei16.vv instructions in the RISC-V VLOptimizer. 
To support vrgatherei16.vv I also needed to add support for it in getOperandLog2EEW. --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 13 ++++ llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll | 12 ++-- .../test/CodeGen/RISCV/rvv/vl-opt-op-info.mir | 59 +++++++++++++++++++ 3 files changed, 76 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index c2b5e0135caea..e656e8bb99d86 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -747,6 +747,14 @@ getOperandLog2EEW(const MachineOperand &MO, const MachineRegisterInfo *MRI) { return TwoTimes ? MILog2SEW + 1 : MILog2SEW; } + // Vector Register Gather with 16-bit Index Elements Instruction + // Dest and source data EEW=SEW. Index vector EEW=16. + case RISCV::VRGATHEREI16_VV: { + if (MO.getOperandNo() == 2) + return 4; + return MILog2SEW; + } + default: return std::nullopt; } @@ -1058,6 +1066,11 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VSLIDEDOWN_VI: case RISCV::VSLIDE1UP_VX: case RISCV::VFSLIDE1UP_VF: + // Vector Register Gather Instructions + case RISCV::VRGATHER_VI: + case RISCV::VRGATHER_VV: + case RISCV::VRGATHER_VX: + case RISCV::VRGATHEREI16_VV: // Vector Single-Width Floating-Point Add/Subtract Instructions case RISCV::VFADD_VF: case RISCV::VFADD_VV: diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index a5bc04d66e49d..4883a4dcfcf67 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -5468,9 +5468,8 @@ define @vrgather_vi( %a, ; ; VLOPT-LABEL: vrgather_vi: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; VLOPT-NEXT: vrgather.vi v12, v8, 5 ; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vrgather.vi v12, v8, 5 ; VLOPT-NEXT: vadd.vv v8, v12, v10 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vrgather.vx.nxv4i32.iXLen( 
poison, %a, iXLen 5, iXLen -1) @@ -5489,9 +5488,8 @@ define @vrgather_vv( %a, ; ; VLOPT-LABEL: vrgather_vv: ; VLOPT: # %bb.0: -; VLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma -; VLOPT-NEXT: vrgather.vv v12, v8, v10 ; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vrgather.vv v12, v8, v10 ; VLOPT-NEXT: vadd.vv v8, v12, v8 ; VLOPT-NEXT: ret %1 = call @llvm.riscv.vrgather.vv.nxv4i32( poison, %a, %idx, iXLen -1) @@ -5510,9 +5508,8 @@ define @vrgather_vx( %a, iXLen %idx, @llvm.riscv.vrgather.vx.nxv4i32.iXLen( poison, %a, iXLen %idx, iXLen -1) @@ -5531,9 +5528,8 @@ define @vrgatherei16_vv( %a, @llvm.riscv.vrgatherei16.vv.nxv4i32( poison, %a, %idx, iXLen -1) diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir index b39ba422bd349..52cd3e35e6eb8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir @@ -1801,4 +1801,63 @@ body: | ; CHECK-NEXT: %y:vr = PseudoVMAND_MM_B16 $noreg, %x, 1, 0 /* e8 */ %x:vr = PseudoVMSET_M_B8 -1, 0 %y:vr = PseudoVMAND_MM_B16 $noreg, %x, 1, 0 +... +--- +name: vrgatherei16_vv +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv + ; CHECK: early-clobber %x:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 + %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... 
+--- +name: vrgatherei16_vv_incompatible_data_eew +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv_incompatible_data_eew + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... --- +name: vrgatherei16_vv_incompatible_index_eew +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv_incompatible_index_eew + ; CHECK: %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, %x, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, %x, 1, 5 /* e32 */, 0 +... +--- +name: vrgatherei16_vv_incompatible_dest_emul +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv_incompatible_dest_emul + ; CHECK: early-clobber %x:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 + %y:vr = PseudoVADD_VV_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... 
+--- +name: vrgatherei16_vv_incompatible_source_emul +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv_incompatible_source_emul + ; CHECK: %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_MF2 $noreg, $noreg, $noreg, -1, 5 /* e32 */, 0 + %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... +--- +name: vrgatherei16_vv_incompatible_index_emul +body: | + bb.0: + ; CHECK-LABEL: name: vrgatherei16_vv_incompatible_index_emul + ; CHECK: %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 /* tu, mu */ + ; CHECK-NEXT: early-clobber %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, %x, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 4 /* e16 */, 0 + %y:vr = PseudoVRGATHEREI16_VV_M1_E32_MF2 $noreg, $noreg, %x, 1, 5 /* e32 */, 0 From 6824bcfdb4c8315a990f4b5ce2cb9f528281a823 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Wed, 16 Jul 2025 13:46:02 -0700 Subject: [PATCH 101/813] [IA] Relax the requirement of having ExtractValue users on deinterleave intrinsic (#148716) There are cases where InstCombine / InstSimplify might sink extractvalue instructions that use a deinterleave intrinsic into successor blocks, which prevents InterleavedAccess from kicking in because the current pattern requires deinterleave intrinsic to be used by extractvalue. However, this requirement is bit too strict while we could have just replaced the users of deinterleave intrinsic with whatever generated by the target TLI hooks. 
--- llvm/include/llvm/Analysis/VectorUtils.h | 5 + llvm/include/llvm/CodeGen/TargetLowering.h | 7 +- llvm/lib/Analysis/VectorUtils.cpp | 9 ++ llvm/lib/CodeGen/InterleavedAccessPass.cpp | 23 +--- .../Target/AArch64/AArch64ISelLowering.cpp | 31 ++--- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 5 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 5 +- .../Target/RISCV/RISCVInterleavedAccess.cpp | 21 +--- .../rvv/fixed-vectors-deinterleave-load.ll | 53 +++++++++ .../RISCV/rvv/vp-vector-interleaved-access.ll | 109 ++++++++++++++++++ .../AArch64/fixed-deinterleave-intrinsics.ll | 32 ++--- .../scalable-deinterleave-intrinsics.ll | 36 +++--- .../AArch64/sve-deinterleave4.ll | 27 +++-- .../AArch64/sve-interleaved-accesses.ll | 4 + 14 files changed, 263 insertions(+), 104 deletions(-) diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index af1e0d7251a4f..9a2773c06bae6 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -24,6 +24,7 @@ namespace llvm { class TargetLibraryInfo; +class IntrinsicInst; /// The Vector Function Database. /// @@ -188,6 +189,10 @@ LLVM_ABI unsigned getInterleaveIntrinsicFactor(Intrinsic::ID ID); /// Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics. LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID); +/// Given a deinterleaveN intrinsic, return the (narrow) vector type of each +/// factor. +LLVM_ABI VectorType *getDeinterleavedVectorType(IntrinsicInst *DI); + /// Given a vector and an element number, see if the scalar value is /// already around as a register, for example if it were inserted then extracted /// from the vector. 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 72594c7f9783c..238d07a20eec8 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3251,10 +3251,9 @@ class LLVM_ABI TargetLoweringBase { /// /// \p Load is the accompanying load instruction. Can be either a plain load /// instruction or a vp.load intrinsic. - /// \p DeinterleaveValues contains the deinterleaved values. - virtual bool - lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, - ArrayRef DeinterleaveValues) const { + /// \p DI represents the deinterleaveN intrinsic. + virtual bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const { return false; } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 7f0ed0b60a785..1b3da590cff7f 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -306,6 +306,15 @@ unsigned llvm::getDeinterleaveIntrinsicFactor(Intrinsic::ID ID) { } } +VectorType *llvm::getDeinterleavedVectorType(IntrinsicInst *DI) { + [[maybe_unused]] unsigned Factor = + getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); + ArrayRef DISubtypes = DI->getType()->subtypes(); + assert(Factor && Factor == DISubtypes.size() && + "unexpected deinterleave factor or result type"); + return cast(DISubtypes[0]); +} + /// Given a vector and an element number, see if the scalar value is /// already around as a register, for example if it were inserted then extracted /// from the vector. 
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 95599837e1bfc..68a956921c8e0 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -618,29 +618,13 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); assert(Factor && "unexpected deinterleave intrinsic"); - SmallVector DeinterleaveValues(Factor, nullptr); - Value *LastFactor = nullptr; - for (auto *User : DI->users()) { - auto *Extract = dyn_cast(User); - if (!Extract || Extract->getNumIndices() != 1) - return false; - unsigned Idx = Extract->getIndices()[0]; - if (DeinterleaveValues[Idx]) - return false; - DeinterleaveValues[Idx] = Extract; - LastFactor = Extract; - } - - if (!LastFactor) - return false; - Value *Mask = nullptr; if (auto *VPLoad = dyn_cast(LoadedVal)) { if (VPLoad->getIntrinsicID() != Intrinsic::vp_load) return false; // Check mask operand. Handle both all-true/false and interleaved mask. Value *WideMask = VPLoad->getOperand(1); - Mask = getMask(WideMask, Factor, cast(LastFactor->getType())); + Mask = getMask(WideMask, Factor, getDeinterleavedVectorType(DI)); if (!Mask) return false; @@ -657,12 +641,9 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic( // Try and match this with target specific intrinsics. if (!TLI->lowerDeinterleaveIntrinsicToLoad(cast(LoadedVal), Mask, - DeinterleaveValues)) + DI)) return false; - for (Value *V : DeinterleaveValues) - if (V) - DeadInsts.insert(cast(V)); DeadInsts.insert(DI); // We now have a target-specific load, so delete the old one. 
DeadInsts.insert(cast(LoadedVal)); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 4f13a14d24649..46c53843ba3a4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17486,9 +17486,8 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, } bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef DeinterleavedValues) const { - unsigned Factor = DeinterleavedValues.size(); + Instruction *Load, Value *Mask, IntrinsicInst *DI) const { + const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n"); return false; @@ -17498,9 +17497,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( return false; assert(!Mask && "Unexpected mask on a load\n"); - Value *FirstActive = *llvm::find_if(DeinterleavedValues, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = cast(FirstActive->getType()); + VectorType *VTy = getDeinterleavedVectorType(DI); const DataLayout &DL = LI->getModule()->getDataLayout(); bool UseScalable; @@ -17528,6 +17525,7 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue()); Value *BaseAddr = LI->getPointerOperand(); + Value *Result = nullptr; if (NumLoads > 1) { // Create multiple legal small ldN. SmallVector ExtractedLdValues(Factor, PoisonValue::get(VTy)); @@ -17548,25 +17546,20 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( } LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump()); } - // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned J = 0; J < Factor; ++J) { - if (DeinterleavedValues[J]) - DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]); - } + + // Merge the values from different factors. 
+ Result = PoisonValue::get(DI->getType()); + for (unsigned J = 0; J < Factor; ++J) + Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J); } else { - Value *Result; if (UseScalable) Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN"); else Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN"); - // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 - for (unsigned I = 0; I < Factor; I++) { - if (DeinterleavedValues[I]) { - Value *NewExtract = Builder.CreateExtractValue(Result, I); - DeinterleavedValues[I]->replaceAllUsesWith(NewExtract); - } - } } + + // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4 + DI->replaceAllUsesWith(Result); return true; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 6afb3c330d25b..a19bf19387a8c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -218,9 +218,8 @@ class AArch64TargetLowering : public TargetLowering { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef DeinterleaveValues) const override; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 3af729aaba2ae..e8adf561c9c35 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -437,9 +437,8 @@ class RISCVTargetLowering : public TargetLowering { bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; - bool lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef DeinterleaveValues) const override; + 
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, + IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( StoreInst *SI, ArrayRef InterleaveValues) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index ddfacd970e950..025054d5a2a60 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -14,6 +14,7 @@ #include "RISCVISelLowering.h" #include "RISCVSubtarget.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" @@ -256,17 +257,14 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { } bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( - Instruction *Load, Value *Mask, - ArrayRef DeinterleaveValues) const { - const unsigned Factor = DeinterleaveValues.size(); + Instruction *Load, Value *Mask, IntrinsicInst *DI) const { + const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); if (Factor > 8) return false; IRBuilder<> Builder(Load); - Value *FirstActive = - *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; }); - VectorType *ResVTy = cast(FirstActive->getType()); + VectorType *ResVTy = getDeinterleavedVectorType(DI); const DataLayout &DL = Load->getDataLayout(); auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); @@ -346,16 +344,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( } } - for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) { - if (!DIV) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. 
- Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast(Idx)}); - DIV->replaceAllUsesWith(NewEV); - } - + DI->replaceAllUsesWith(Return); return true; } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll index 3e822d357b667..807651c9b40c6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll @@ -274,6 +274,59 @@ define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2 } +define { <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_partial(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_factor3_partial: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vlseg3e8.v v7, (a0) +; CHECK-NEXT: vmv1r.v v8, v7 +; CHECK-NEXT: ret + %vec = load <24 x i8>, ptr %p + %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec) + %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0 + %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2 + %res0 = insertvalue { <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8> } %res0, <8 x i8> %t2, 1 + ret { <8 x i8>, <8 x i8> } %res1 +} + +; InterleavedAccess should kick in even if the users of deinterleave intrinsic are not extractvalue. 
+define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3_no_extract(ptr %p, ptr %p1, i1 %c) { +; CHECK-LABEL: vector_deinterleave_load_factor3_no_extract: +; CHECK: # %bb.0: +; CHECK-NEXT: andi a2, a2, 1 +; CHECK-NEXT: beqz a2, .LBB17_2 +; CHECK-NEXT: # %bb.1: # %bb0 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vlseg3e8.v v6, (a0) +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB17_2: # %bb1 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vlseg3e8.v v6, (a1) +; CHECK-NEXT: ret + br i1 %c, label %bb0, label %bb1 + +bb0: + %vec0 = load <24 x i8>, ptr %p + %d0.0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec0) + br label %merge + +bb1: + %vec1 = load <24 x i8>, ptr %p1 + %d0.1 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec1) + br label %merge + +merge: + %d0 = phi {<8 x i8>, <8 x i8>, <8 x i8>} [%d0.0, %bb0], [%d0.1, %bb1] + %t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0 + %t1 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 1 + %t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2 + %res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0 + %res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 0 + %res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 0 + ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2 +} + define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) { ; CHECK-LABEL: vector_deinterleave_load_factor4: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 7fb822d20f892..27ecbe56bda42 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -66,6 +66,115 @@ define {, , } @load_factor ret { , , } %res1 } +define {, } @load_factor3_partial(ptr %ptr, i32 %evl) { +; 
RV32-LABEL: load_factor3_partial: +; RV32: # %bb.0: +; RV32-NEXT: slli a2, a1, 1 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: lui a2, 699051 +; RV32-NEXT: addi a2, a2, -1365 +; RV32-NEXT: mulhu a1, a1, a2 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg3e32.v v7, (a0) +; RV32-NEXT: vmv1r.v v8, v7 +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor3_partial: +; RV64: # %bb.0: +; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: lui a2, 699051 +; RV64-NEXT: addi a2, a2, -1365 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: slli a2, a2, 32 +; RV64-NEXT: mulhu a1, a1, a2 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg3e32.v v7, (a0) +; RV64-NEXT: vmv1r.v v8, v7 +; RV64-NEXT: ret + %rvl = mul i32 %evl, 3 + %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) + %deinterleaved.results = call { , , } @llvm.vector.deinterleave3( %wide.masked.load) + %t0 = extractvalue { , , } %deinterleaved.results, 0 + %t2 = extractvalue { , , } %deinterleaved.results, 2 + %res0 = insertvalue { , } poison, %t0, 0 + %res1 = insertvalue { , } %res0, %t2, 1 + ret { , } %res1 +} + +; InterleavedAccess should kick in even if the users of deinterleave intrinsic are not extractvalue. 
+define {, } @load_factor3_no_extract(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor3_no_extract: +; RV32: # %bb.0: +; RV32-NEXT: li a2, 12 +; RV32-NEXT: beq a1, a2, .LBB3_2 +; RV32-NEXT: # %bb.1: # %bb0 +; RV32-NEXT: slli a2, a1, 1 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: lui a2, 699051 +; RV32-NEXT: addi a2, a2, -1365 +; RV32-NEXT: mulhu a1, a1, a2 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg3e32.v v7, (a0) +; RV32-NEXT: j .LBB3_3 +; RV32-NEXT: .LBB3_2: # %bb1 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vlseg3e32.v v7, (a0) +; RV32-NEXT: .LBB3_3: # %merge +; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32-NEXT: vmv1r.v v8, v7 +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor3_no_extract: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a2, a1 +; RV64-NEXT: li a3, 12 +; RV64-NEXT: beq a2, a3, .LBB3_2 +; RV64-NEXT: # %bb.1: # %bb0 +; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: lui a2, 699051 +; RV64-NEXT: addi a2, a2, -1365 +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: slli a2, a2, 32 +; RV64-NEXT: mulhu a1, a1, a2 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg3e32.v v7, (a0) +; RV64-NEXT: j .LBB3_3 +; RV64-NEXT: .LBB3_2: # %bb1 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vlseg3e32.v v7, (a0) +; RV64-NEXT: .LBB3_3: # %merge +; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64-NEXT: vmv1r.v v8, v7 +; RV64-NEXT: ret + %p = icmp ne i32 %evl, 12 + br i1 %p, label %bb0, label %bb1 + +bb0: + %rvl.0 = mul i32 %evl, 3 + %wide.load.0 = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl.0) + %deinterleaved.results.0 = call { , , } @llvm.vector.deinterleave3( %wide.load.0) + br label %merge + +bb1: + %wide.load.1 = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 12) + %deinterleaved.results.1 = call { , , } @llvm.vector.deinterleave3( %wide.load.1) + br label %merge + +merge: + %deinterleaved.results = phi { , , 
} [%deinterleaved.results.0, %bb0], [%deinterleaved.results.1, %bb1] + %t0 = extractvalue { , , } %deinterleaved.results, 0 + %t2 = extractvalue { , , } %deinterleaved.results, 2 + %res0 = insertvalue { , } poison, %t0, 0 + %res1 = insertvalue { , } %res0, %t2, 1 + ret { , } %res1 +} + define {, , , } @load_factor4_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor4_v2: ; RV32: # %bb.0: diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll index 09e2c53465cd7..6c81d9a4d2ed6 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll @@ -10,8 +10,8 @@ define void @deinterleave_i8_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_i8_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_i8_factor2 @@ -33,8 +33,8 @@ define void @deinterleave_i16_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_i16_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } 
[[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_i16_factor2 @@ -56,8 +56,8 @@ define void @deinterleave_8xi32_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_8xi32_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_8xi32_factor2 @@ -79,8 +79,8 @@ define void @deinterleave_i64_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_i64_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_i64_factor2 @@ -102,8 +102,8 @@ define void @deinterleave_float_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_float_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <4 x float>, <4 x float> } 
[[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_float_factor2 @@ -125,8 +125,8 @@ define void @deinterleave_double_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_double_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_double_factor2 @@ -148,8 +148,8 @@ define void @deinterleave_ptr_factor2(ptr %ptr) { ; NEON-LABEL: define void @deinterleave_ptr_factor2 ; NEON-SAME: (ptr [[PTR:%.*]]) { ; NEON-NEXT: [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]]) -; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 0 -; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_ptr_factor2 @@ -301,6 +301,10 @@ define void @deinterleave_wide_i16_factor2(ptr %ptr) #0 { ; NEON-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8) ; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1 ; NEON-NEXT: [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8) +; NEON-NEXT: [[TMP11:%.*]] = insertvalue { <16 x i16>, <16 x i16> } poison, <16 x i16> [[TMP8]], 0 +; NEON-NEXT: [[TMP12:%.*]] = insertvalue { 
<16 x i16>, <16 x i16> } [[TMP11]], <16 x i16> [[TMP10]], 1 +; NEON-NEXT: [[EXTRACT1:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[TMP12]], 0 +; NEON-NEXT: [[EXTRACT2:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[TMP12]], 1 ; NEON-NEXT: ret void ; ; SVE-FIXED-LABEL: define void @deinterleave_wide_i16_factor2 diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll index 436389ba5b991..d7649801ea2fc 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll @@ -8,8 +8,8 @@ define void @deinterleave_nxi8_factor2(ptr %ptr) #0 { ; CHECK-LABEL: define void @deinterleave_nxi8_factor2 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv16i8( splat (i1 true), ptr [[PTR]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[LDN]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[LDN]], 0 ; CHECK-NEXT: ret void ; %load = load , ptr %ptr, align 1 @@ -23,8 +23,8 @@ define void @deinterleave_nxi16_factor2(ptr %ptr) #0 { ; CHECK-LABEL: define void @deinterleave_nxi16_factor2 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv8i16( splat (i1 true), ptr [[PTR]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[LDN]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[LDN]], 1 ; CHECK-NEXT: ret void ; %load = load , ptr %ptr, align 2 @@ -38,8 +38,8 @@ define void @deinterleave_nx8xi32_factor2(ptr %ptr) #0 { ; CHECK-LABEL: define void 
@deinterleave_nx8xi32_factor2 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( splat (i1 true), ptr [[PTR]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[LDN]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[LDN]], 1 ; CHECK-NEXT: ret void ; %load = load , ptr %ptr, align 4 @@ -53,8 +53,8 @@ define void @deinterleave_nxi64_factor2(ptr %ptr) #0 { ; CHECK-LABEL: define void @deinterleave_nxi64_factor2 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2i64( splat (i1 true), ptr [[PTR]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[LDN]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[LDN]], 1 ; CHECK-NEXT: ret void ; %load = load , ptr %ptr, align 8 @@ -68,8 +68,8 @@ define void @deinterleave_nxfloat_factor2(ptr %ptr) #0 { ; CHECK-LABEL: define void @deinterleave_nxfloat_factor2 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4f32( splat (i1 true), ptr [[PTR]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[LDN]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[LDN]], 1 ; CHECK-NEXT: ret void ; %load = load , ptr %ptr, align 4 @@ -83,8 +83,8 @@ define void @deinterleave_nxdouble_factor2(ptr %ptr) #0 { ; CHECK-LABEL: define void @deinterleave_nxdouble_factor2 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( splat (i1 true), ptr [[PTR]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[LDN]], 
0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[LDN]], 1 ; CHECK-NEXT: ret void ; %load = load , ptr %ptr, align 8 @@ -98,8 +98,8 @@ define void @deinterleave_nxptr_factor2(ptr %ptr) #0 { ; CHECK-LABEL: define void @deinterleave_nxptr_factor2 ; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2p0( splat (i1 true), ptr [[PTR]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[LDN]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[LDN]], 1 ; CHECK-NEXT: ret void ; %load = load , ptr %ptr, align 8 @@ -215,6 +215,10 @@ define void @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 { ; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP13]], [[TMP17]], i64 12) ; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[LDN3]], 1 ; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP15]], [[TMP19]], i64 12) +; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { , } poison, [[TMP18]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { , } [[TMP21]], [[TMP20]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[TMP22]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[TMP22]], 1 ; CHECK-NEXT: ret void ; %load = load , ptr %ptr, align 4 @@ -239,6 +243,10 @@ define void @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP7]], i64 2) ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN1]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP5]], [[TMP9]], i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , } poison, [[TMP8]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , } [[TMP11]], [[TMP10]], 1 +; 
CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[TMP12]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[TMP12]], 1 ; CHECK-NEXT: ret void ; %load = load , ptr %ptr, align 8 diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll index c565066541d1d..58c0bccc3be38 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll @@ -49,8 +49,16 @@ define void @wide_deinterleave4(ptr %src) { ; CHECK-NEXT: [[TMP16:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP7]], [[TMP15]], i64 4) ; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[LDN1]], 3 ; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP17]], i64 4) -; CHECK-NEXT: [[SUM:%.*]] = add [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[SUB:%.*]] = sub [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { , , , } poison, [[TMP12]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { , , , } [[TMP19]], [[TMP14]], 1 +; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { , , , } [[TMP20]], [[TMP16]], 2 +; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { , , , } [[TMP21]], [[TMP18]], 3 +; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { , , , } [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { , , , } [[TMP22]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { , , , } [[TMP22]], 2 +; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { , , , } [[TMP22]], 3 +; CHECK-NEXT: [[SUM:%.*]] = add [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[SUB:%.*]] = sub [[TMP25]], [[TMP26]] ; CHECK-NEXT: ret void ; %load = load , ptr %src, align 4 @@ -73,8 +81,8 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) { ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[LDN]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[LDN]], 3 ; CHECK-NEXT: [[LDN1:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( splat (i1 true), 
ptr [[SRC]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[LDN1]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[LDN1]], 1 +; CHECK-NEXT: [[LD2_1:%.*]] = extractvalue { , } [[LDN1]], 0 +; CHECK-NEXT: [[LD2_2:%.*]] = extractvalue { , } [[LDN1]], 1 ; CHECK-NEXT: ret void ; @@ -95,12 +103,11 @@ define void @mix_deinterleave4_deinterleave2(ptr %src) { define void @negative_deinterleave4_test(ptr %src) { ; CHECK-LABEL: define void @negative_deinterleave4_test ; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[LOAD:%.*]] = load , ptr [[SRC]], align 4 -; CHECK-NEXT: [[DEINTERLEAVE:%.*]] = tail call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[LOAD]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[DEINTERLEAVE]], 2 +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv4i32( splat (i1 true), ptr [[SRC]]) +; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[LDN]], 2 ; CHECK-NEXT: ret void ; %load = load , ptr %src, align 4 diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll index b109448bd5d7c..1418ca09c0d61 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll @@ -606,6 +606,10 @@ define void @deinterleave_nxptr_factor2(ptr %ptr) #2 { ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP7]], i64 2) ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN2]], 1 ; 
CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP5]], [[TMP9]], i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , } poison, [[TMP8]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , } [[TMP11]], [[TMP10]], 1 +; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { , } [[TMP12]], 0 +; CHECK-NEXT: [[EXTRACT2:%.*]] = extractvalue { , } [[TMP12]], 1 ; CHECK-NEXT: ret void ; %wide.vec = load , ptr %ptr, align 8 From 8c28f4920dfda2e3d91c58e8eb5b568dd396fa2d Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Wed, 16 Jul 2025 21:47:13 +0100 Subject: [PATCH 102/813] [lldb] Print children-count warning for dwim-print and expr (#149088) When dumping variables, LLDB will print a one-time warning about truncating children (when the children count exceeds the default `target.max-children-count`). But we only do this for `frame variable`. So if we use `dwim-print` or `expression`, the output gets truncated but we don't print a warning. But because we store the fact that we truncated some output on the `CommandInterpreter`, we fire the warning next time we use `frame variable`. E.g.,: ``` (lldb) p arr (int[1000]) { [0] = -5 [1] = 0 [2] = 0 <-- snipped --> [253] = 0 [254] = 0 [255] = 0 ... } (lldb) v someLocal (int) someLocal = 10 *** Some of the displayed variables have more members than the debugger will show by default. To show all of them, you can either use the --show-all-children option to frame variable or raise the limit by changing the target.max-children-count setting. ``` This patch prints the warning for `dwim-print` and `expression`. I only added a test for the `target.max-children-count` for now because it seems the `target.max-children-depth` warning is broken (I can't get it to fire). 
--- .../Commands/CommandObjectDWIMPrint.cpp | 2 + .../Commands/CommandObjectExpression.cpp | 3 + .../Settings/TestChildCountTruncation.test | 67 +++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 lldb/test/Shell/Settings/TestChildCountTruncation.test diff --git a/lldb/source/Commands/CommandObjectDWIMPrint.cpp b/lldb/source/Commands/CommandObjectDWIMPrint.cpp index a110eececf4d6..a2c004d0ee97f 100644 --- a/lldb/source/Commands/CommandObjectDWIMPrint.cpp +++ b/lldb/source/Commands/CommandObjectDWIMPrint.cpp @@ -150,6 +150,8 @@ void CommandObjectDWIMPrint::DoExecute(StringRef command, return; } } + m_interpreter.PrintWarningsIfNecessary(result.GetOutputStream(), + m_cmd_name); result.SetStatus(eReturnStatusSuccessFinishResult); }; diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp index a95dea63720ac..c5b91678103d5 100644 --- a/lldb/source/Commands/CommandObjectExpression.cpp +++ b/lldb/source/Commands/CommandObjectExpression.cpp @@ -470,6 +470,9 @@ bool CommandObjectExpression::EvaluateExpression(llvm::StringRef expr, return false; } + m_interpreter.PrintWarningsIfNecessary(result.GetOutputStream(), + m_cmd_name); + if (suppress_result) if (auto result_var_sp = target.GetPersistentVariable(result_valobj_sp->GetName())) { diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test new file mode 100644 index 0000000000000..3b75498297b05 --- /dev/null +++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test @@ -0,0 +1,67 @@ +# Test that we warn the user about truncated output +# when target.max-children-count wasn't explicitly set. 
+ +# RUN: split-file %s %t +# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=DWIM +# +# RUN: %lldb -x -b -s %t/expr-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=EXPR +# +# RUN: %lldb -x -b -s %t/frame-var-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=VAR +# +# RUN: %lldb -x -b -s %t/with-setting-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=SETTING + +#--- main.cpp + +int main() { + int arr[512] = { 3 }; + __builtin_debugtrap(); +} + +#--- dwim-commands.input + +run +dwim-print arr +frame variable arr + +DWIM: (lldb) dwim-print arr +DWIM: *** Some of the displayed variables have more members +DWIM-SAME: use the --show-all-children option to dwim-print +DWIM: (lldb) frame variable arr +DWIM-NOT: *** Some of the displayed variables have more members + +#--- expr-commands.input + +run +expression arr +frame variable arr + +EXPR: (lldb) expression arr +EXPR: *** Some of the displayed variables have more members +EXPR-SAME: use the --show-all-children option to expression +EXPR: (lldb) frame variable arr +EXPR-NOT: *** Some of the displayed variables have more members + +#--- frame-var-commands.input + +run +frame variable arr + +VAR: (lldb) frame variable arr +VAR: *** Some of the displayed variables have more members +VAR-SAME: use the --show-all-children option to frame variable +VAR: (lldb) frame variable arr +VAR-NOT: *** Some of the displayed variables have more members + +#--- with-setting-commands.input + +run +settings set target.max-children-count 1 +frame variable arr + +SETTING: (lldb) frame variable arr +SETTING-NOT: *** Some of the displayed variables have more members From b9f5b39e04d22e9a6ad451bdc0779bed31015372 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Jul 2025 21:54:49 +0100 Subject: [PATCH 103/813] [TableGen] Remove explicit recursion in LexToken 
(#143697) When profiling a Release+Asserts build of llvm-tblgen I noticed that it was recursing hundreds of times to lex a sequence of hundreds of space characters. --- llvm/lib/TableGen/TGLexer.cpp | 267 ++++++++++++++++++++-------------- 1 file changed, 156 insertions(+), 111 deletions(-) diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index c8e020d791e09..aea1bb0c6d75e 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -174,129 +174,174 @@ int TGLexer::peekNextChar(int Index) const { } tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) { - TokStart = CurPtr; - // This always consumes at least one character. - int CurChar = getNextChar(); + while (true) { + TokStart = CurPtr; + // This always consumes at least one character. + int CurChar = getNextChar(); - switch (CurChar) { - default: - // Handle letters: [a-zA-Z_] - if (isValidIDChar(CurChar, /*First=*/true)) - return LexIdentifier(); - - // Unknown character, emit an error. - return ReturnError(TokStart, "unexpected character"); - case EOF: - // Lex next token, if we just left an include file. - // Note that leaving an include file means that the next - // symbol is located at the end of the 'include "..."' - // construct, so LexToken() is called with default - // false parameter. - if (processEOF()) - return LexToken(); + switch (CurChar) { + default: + // Handle letters: [a-zA-Z_] + if (isValidIDChar(CurChar, /*First=*/true)) + return LexIdentifier(); - // Return EOF denoting the end of lexing. 
- return tgtok::Eof; - - case ':': return tgtok::colon; - case ';': return tgtok::semi; - case ',': return tgtok::comma; - case '<': return tgtok::less; - case '>': return tgtok::greater; - case ']': return tgtok::r_square; - case '{': return tgtok::l_brace; - case '}': return tgtok::r_brace; - case '(': return tgtok::l_paren; - case ')': return tgtok::r_paren; - case '=': return tgtok::equal; - case '?': return tgtok::question; - case '#': - if (FileOrLineStart) { - tgtok::TokKind Kind = prepIsDirective(); - if (Kind != tgtok::Error) - return lexPreprocessor(Kind); - } + // Unknown character, emit an error. + return ReturnError(TokStart, "unexpected character"); + case EOF: + // Lex next token, if we just left an include file. + if (processEOF()) { + // Leaving an include file means that the next symbol is located at the + // end of the 'include "..."' construct. + FileOrLineStart = false; + break; + } - return tgtok::paste; + // Return EOF denoting the end of lexing. + return tgtok::Eof; + + case ':': + return tgtok::colon; + case ';': + return tgtok::semi; + case ',': + return tgtok::comma; + case '<': + return tgtok::less; + case '>': + return tgtok::greater; + case ']': + return tgtok::r_square; + case '{': + return tgtok::l_brace; + case '}': + return tgtok::r_brace; + case '(': + return tgtok::l_paren; + case ')': + return tgtok::r_paren; + case '=': + return tgtok::equal; + case '?': + return tgtok::question; + case '#': + if (FileOrLineStart) { + tgtok::TokKind Kind = prepIsDirective(); + if (Kind != tgtok::Error) + return lexPreprocessor(Kind); + } + + return tgtok::paste; - // The period is a separate case so we can recognize the "..." - // range punctuator. - case '.': - if (peekNextChar(0) == '.') { - ++CurPtr; // Eat second dot. + // The period is a separate case so we can recognize the "..." + // range punctuator. + case '.': if (peekNextChar(0) == '.') { - ++CurPtr; // Eat third dot. - return tgtok::dotdotdot; + ++CurPtr; // Eat second dot. 
+ if (peekNextChar(0) == '.') { + ++CurPtr; // Eat third dot. + return tgtok::dotdotdot; + } + return ReturnError(TokStart, "invalid '..' punctuation"); } - return ReturnError(TokStart, "invalid '..' punctuation"); - } - return tgtok::dot; + return tgtok::dot; - case '\r': - llvm_unreachable("getNextChar() must never return '\r'"); + case '\r': + llvm_unreachable("getNextChar() must never return '\r'"); - case ' ': - case '\t': - // Ignore whitespace. - return LexToken(FileOrLineStart); - case '\n': - // Ignore whitespace, and identify the new line. - return LexToken(true); - case '/': - // If this is the start of a // comment, skip until the end of the line or - // the end of the buffer. - if (*CurPtr == '/') - SkipBCPLComment(); - else if (*CurPtr == '*') { - if (SkipCComment()) - return tgtok::Error; - } else // Otherwise, this is an error. - return ReturnError(TokStart, "unexpected character"); - return LexToken(FileOrLineStart); - case '-': case '+': - case '0': case '1': case '2': case '3': case '4': case '5': case '6': - case '7': case '8': case '9': { - int NextChar = 0; - if (isDigit(CurChar)) { - // Allow identifiers to start with a number if it is followed by - // an identifier. This can happen with paste operations like - // foo#8i. - int i = 0; - do { - NextChar = peekNextChar(i++); - } while (isDigit(NextChar)); - - if (NextChar == 'x' || NextChar == 'b') { - // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most - // likely a number. - int NextNextChar = peekNextChar(i); - switch (NextNextChar) { - default: - break; - case '0': case '1': - if (NextChar == 'b') - return LexNumber(); - [[fallthrough]]; - case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - if (NextChar == 'x') - return LexNumber(); - break; + case ' ': + case '\t': + // Ignore whitespace. 
+ break; + case '\n': + // Ignore whitespace, and identify the new line. + FileOrLineStart = true; + break; + case '/': + // If this is the start of a // comment, skip until the end of the line or + // the end of the buffer. + if (*CurPtr == '/') + SkipBCPLComment(); + else if (*CurPtr == '*') { + if (SkipCComment()) + return tgtok::Error; + } else // Otherwise, this is an error. + return ReturnError(TokStart, "unexpected character"); + break; + case '-': + case '+': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': { + int NextChar = 0; + if (isDigit(CurChar)) { + // Allow identifiers to start with a number if it is followed by + // an identifier. This can happen with paste operations like + // foo#8i. + int i = 0; + do { + NextChar = peekNextChar(i++); + } while (isDigit(NextChar)); + + if (NextChar == 'x' || NextChar == 'b') { + // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most + // likely a number. + int NextNextChar = peekNextChar(i); + switch (NextNextChar) { + default: + break; + case '0': + case '1': + if (NextChar == 'b') + return LexNumber(); + [[fallthrough]]; + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + if (NextChar == 'x') + return LexNumber(); + break; + } } } - } - if (isValidIDChar(NextChar, /*First=*/true)) - return LexIdentifier(); + if (isValidIDChar(NextChar, /*First=*/true)) + return LexIdentifier(); - return LexNumber(); - } - case '"': return LexString(); - case '$': return LexVarName(); - case '[': return LexBracket(); - case '!': return LexExclaim(); + return LexNumber(); + } + case '"': + return LexString(); + case '$': + return LexVarName(); + case '[': + return LexBracket(); + case '!': + return LexExclaim(); + } } } From 7caa0c9a55b33d8d627975e94c3367aa68dc37c7 Mon 
Sep 17 00:00:00 2001 From: Udit Kumar Agarwal Date: Wed, 16 Jul 2025 14:02:51 -0700 Subject: [PATCH 104/813] Revert "[CI] Make email check workflow fail when author's email is private in Github UI" (#149186) Reverts llvm/llvm-project#148694 The workflow is failing if user's email is not listed publicly on your GH profile. This is different from not having your email public on Github (in Github email settings page vs. email field in Github profile/email settings). --- .github/workflows/email-check.yaml | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml index 3339b1eed667b..904ad718f97dd 100644 --- a/.github/workflows/email-check.yaml +++ b/.github/workflows/email-check.yaml @@ -20,30 +20,14 @@ jobs: - name: Extract author email id: author - env: - GH_TOKEN: ${{ github.token }} run: | - # Use Github GraphQL APIs to get the email associated with the PR author because this takes into account the GitHub settings for email privacy. - query=' - query($login: String!) { - user(login: $login) { - email - } - }' - - PR_AUTHOR=${{ github.event.pull_request.user.login }} - - email=$(gh api graphql -f login="$PR_AUTHOR" -f query="$query" --jq '.data.user.email') - echo "EMAIL_AUTHOR_GH_UI=$email" >> "$GITHUB_OUTPUT" - + git log -1 + echo "EMAIL=$(git show -s --format='%ae' HEAD~0)" >> $GITHUB_OUTPUT # Create empty comment file echo "[]" > comments - # When EMAIL_AUTHOR_GH_UI is NULL, author's email is hidden in GitHub UI. - # In this case, we warn the user to turn off "Keep my email addresses private" - # setting in their account. - name: Validate author email - if: ${{ steps.author.outputs.EMAIL_AUTHOR_GH_UI == '' }} + if: ${{ endsWith(steps.author.outputs.EMAIL, 'noreply.github.com') }} env: COMMENT: >- ⚠️ We detected that you are using a GitHub private e-mail address to contribute to the repo.
From 1e4e2b332dc08e01498c677a6a375fcbc9d5e9f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 16 Jul 2025 14:12:27 -0700 Subject: [PATCH 105/813] [flang][cuda] Import type descriptor in the gpu module when needed (#149157) --- .../Optimizer/Transforms/CUFDeviceGlobal.cpp | 42 +++++++++++++------ flang/test/Fir/CUDA/cuda-device-global.f90 | 23 ++++++++++ 2 files changed, 53 insertions(+), 12 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp index bfb0daeacb8c3..35badb6eadb1c 100644 --- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp +++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp @@ -53,21 +53,26 @@ static void processAddrOfOp(fir::AddrOfOp addrOfOp, } } +static void processTypeDescriptor(fir::RecordType recTy, + mlir::SymbolTable &symbolTable, + llvm::DenseSet &candidates) { + if (auto globalOp = symbolTable.lookup( + fir::NameUniquer::getTypeDescriptorName(recTy.getName()))) { + if (!candidates.contains(globalOp)) { + globalOp.walk([&](fir::AddrOfOp op) { + processAddrOfOp(op, symbolTable, candidates, + /*recurseInGlobal=*/true); + }); + candidates.insert(globalOp); + } + } +} + static void processEmboxOp(fir::EmboxOp emboxOp, mlir::SymbolTable &symbolTable, llvm::DenseSet &candidates) { if (auto recTy = mlir::dyn_cast( - fir::unwrapRefType(emboxOp.getMemref().getType()))) { - if (auto globalOp = symbolTable.lookup( - fir::NameUniquer::getTypeDescriptorName(recTy.getName()))) { - if (!candidates.contains(globalOp)) { - globalOp.walk([&](fir::AddrOfOp op) { - processAddrOfOp(op, symbolTable, candidates, - /*recurseInGlobal=*/true); - }); - candidates.insert(globalOp); - } - } - } + fir::unwrapRefType(emboxOp.getMemref().getType()))) + processTypeDescriptor(recTy, symbolTable, candidates); } static void @@ -85,6 +90,17 @@ 
prepareImplicitDeviceGlobals(mlir::func::FuncOp funcOp, } } +static void +processPotentialTypeDescriptor(mlir::Type candidateType, + mlir::SymbolTable &symbolTable, + llvm::DenseSet &candidates) { + if (auto boxTy = mlir::dyn_cast(candidateType)) + candidateType = boxTy.getEleTy(); + candidateType = fir::unwrapSequenceType(fir::unwrapRefType(candidateType)); + if (auto recTy = mlir::dyn_cast(candidateType)) + processTypeDescriptor(recTy, symbolTable, candidates); +} + class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase { public: void runOnOperation() override { @@ -115,6 +131,8 @@ class CUFDeviceGlobal : public fir::impl::CUFDeviceGlobalBase { for (auto globalOp : mod.getOps()) { if (cuf::isRegisteredDeviceGlobal(globalOp)) { candidates.insert(globalOp); + processPotentialTypeDescriptor(globalOp.getType(), parentSymTable, + candidates); } else if (globalOp.getConstant() && mlir::isa( fir::unwrapRefType(globalOp.resultType()))) { diff --git a/flang/test/Fir/CUDA/cuda-device-global.f90 b/flang/test/Fir/CUDA/cuda-device-global.f90 index 4c634513745fd..35c025dad3000 100644 --- a/flang/test/Fir/CUDA/cuda-device-global.f90 +++ b/flang/test/Fir/CUDA/cuda-device-global.f90 @@ -24,3 +24,26 @@ module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.conta // CHECK: gpu.module @cuda_device_mod // CHECK-DAG: fir.global @_QMm2ECc // CHECK-DAG: fir.global @_QMm1ECb + +// ----- + +module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module} { + fir.global @_QMmEddarrays {data_attr = #cuf.cuda} : !fir.box>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>> { + %c0 = arith.constant 0 : index + %0 = fir.zero_bits !fir.heap>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>> + %1 = fir.shape %c0 : (index) -> !fir.shape<1> + %2 = fir.embox %0(%1) {allocator_idx = 3 : i32} : 
(!fir.heap>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>, !fir.shape<1>) -> !fir.box>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>> + fir.has_value %2 : !fir.box>>,phi_i:!fir.box>>,phi0_r:!fir.box>>,phi0_i:!fir.box>>,buf_r:!fir.box>>,buf_i:!fir.box>>}>>>> + } + fir.global linkonce_odr @_QMmE.dt.devicearrays constant target : !fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{genre:i8,__padding0:!fir.array<7xi8>,value:i64}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}>>>>,bounds:!fir.box,value:i64}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}> { + %0 = fir.undefined 
!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{genre:i8,__padding0:!fir.array<7xi8>,value:i64}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}>>>>,bounds:!fir.box,value:i64}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}> + fir.has_value %0 : !fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{genre:i8,__padding0:!fir.array<7xi8>,value:i64}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}>>>>,bounds:!fir.box,value:i64}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}> + } +} + + +// CHECK-NAG: fir.global @_QMmEddarrays +// CHECK-NAG: fir.global linkonce_odr @_QMmE.dt.devicearrays +// CHECK: 
gpu.module @cuda_device_mod +// CHECK-NAG: fir.global @_QMmEddarrays +// CHECK-NAG: fir.global linkonce_odr @_QMmE.dt.devicearrays From b7f6abdd052412bebfedc9cac26fc58b9edb618d Mon Sep 17 00:00:00 2001 From: Daniil Fukalov Date: Wed, 16 Jul 2025 23:18:44 +0200 Subject: [PATCH 106/813] [AMDGPU] Try to reuse register with the constant from compare in v_cndmask (#148740) For some targets, the optimization X == Const ? X : Y -> X == Const ? Const : Y can cause extra register usage or redundant immediate encoding for the constant in cndmask generated from the ternary operation. This patch detects such cases and reuses the register from the compare instruction that already holds the constant, instead of materializing it again for cndmask. The optimization avoids immediates that can be encoded into cndmask instruction (including +-0.0), as well as !isNormal() constants. The change is reworked on the base of #131146 --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 83 + .../AMDGPU/select-cmp-shared-constant-fp.ll | 1429 +++++++++++++++++ .../AMDGPU/select-cmp-shared-constant-int.ll | 955 +++++++++++ 3 files changed, 2467 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-fp.ll create mode 100644 llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-int.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 280f87b82b7fd..3d040fb705a8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4843,11 +4843,94 @@ AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +// Detect when CMP and SELECT use the same constant and fold them to avoid +// loading the constant twice. 
Specifically handles patterns like: +// %cmp = icmp eq i32 %val, 4242 +// %sel = select i1 %cmp, i32 4242, i32 %other +// It can be optimized to reuse %val instead of 4242 in select. +static SDValue +foldCmpSelectWithSharedConstant(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const AMDGPUSubtarget *ST) { + SDValue Cond = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + // Check if condition is a comparison. + if (Cond.getOpcode() != ISD::SETCC) + return SDValue(); + + SDValue LHS = Cond.getOperand(0); + SDValue RHS = Cond.getOperand(1); + ISD::CondCode CC = cast(Cond.getOperand(2))->get(); + + bool isFloatingPoint = LHS.getValueType().isFloatingPoint(); + bool isInteger = LHS.getValueType().isInteger(); + + // Handle simple floating-point and integer types only. + if (!isFloatingPoint && !isInteger) + return SDValue(); + + bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ); + bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE); + if (!isEquality && !isNonEquality) + return SDValue(); + + SDValue ArgVal, ConstVal; + if ((isFloatingPoint && isa(RHS)) || + (isInteger && isa(RHS))) { + ConstVal = RHS; + ArgVal = LHS; + } else if ((isFloatingPoint && isa(LHS)) || + (isInteger && isa(LHS))) { + ConstVal = LHS; + ArgVal = RHS; + } else { + return SDValue(); + } + + // Check if constant should not be optimized - early return if not. + if (isFloatingPoint) { + const APFloat &Val = cast(ConstVal)->getValueAPF(); + const GCNSubtarget *GCNST = static_cast(ST); + + // Only optimize normal floating-point values (finite, non-zero, and + // non-subnormal as per IEEE 754), skip optimization for inlinable + // floating-point constants. + if (!Val.isNormal() || GCNST->getInstrInfo()->isInlineConstant(Val)) + return SDValue(); + } else { + int64_t IntVal = cast(ConstVal)->getSExtValue(); + + // Skip optimization for inlinable integer immediates. 
+ // Inlinable immediates include: -16 to 64 (inclusive). + if (IntVal >= -16 && IntVal <= 64) + return SDValue(); + } + + // For equality and non-equality comparisons, patterns: + // select (setcc x, const), const, y -> select (setcc x, const), x, y + // select (setccinv x, const), y, const -> select (setccinv x, const), y, x + if (!(isEquality && TrueVal == ConstVal) && + !(isNonEquality && FalseVal == ConstVal)) + return SDValue(); + + SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal; + SDValue SelectRHS = + (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal; + return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond, + SelectLHS, SelectRHS); +} + SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0))) return Folded; + // Try to fold CMP + SELECT patterns with shared constants (both FP and + // integer). + if (SDValue Folded = foldCmpSelectWithSharedConstant(N, DCI, Subtarget)) + return Folded; + SDValue Cond = N->getOperand(0); if (Cond.getOpcode() != ISD::SETCC) return SDValue(); diff --git a/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-fp.ll b/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-fp.ll new file mode 100644 index 0000000000000..11af704d30973 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-fp.ll @@ -0,0 +1,1429 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX1010 %s + +; Test the CMP+SELECT optimization that folds shared constants to reduce +; register pressure. 
+ +;------------------------------------------------------------------------------ +; F32 Tests +;------------------------------------------------------------------------------ + +; Should be folded: fcmp oeq + select with constant in true value +define float @fcmp_select_fold_oeq_f32_imm(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_f32_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x42487ed8 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_f32_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42487ed8, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0x40490FDB00000000 + %sel = select i1 %cmp, float 0x40490FDB00000000, float %other + ret float %sel +} + +; Should be folded: fcmp oeq + select with constant in true value (commutative) +define float @fcmp_select_fold_oeq_imm_f32(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_imm_f32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x42487ed8 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_imm_f32: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42487ed8, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float 0x40490FDB00000000, %arg + %sel = select i1 %cmp, float 0x40490FDB00000000, float %other + ret float %sel +} + +; Should be folded: fcmp one + select with constant in 
false value +define float @fcmp_select_fold_one_f32_imm(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_fold_one_f32_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x402df850 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_f32_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x402df850, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float %arg, 0x4005BF0A00000000 + %sel = select i1 %cmp, float %other, float 0x4005BF0A00000000 + ret float %sel +} + +; Should be folded: fcmp one + select with constant in false value (commutative) +define float @fcmp_select_fold_one_imm_f32(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_fold_one_imm_f32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x402df850 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_imm_f32: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x402df850, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float 0x4005BF0A00000000, %arg + %sel = select i1 %cmp, float %other, float 0x4005BF0A00000000 + ret float %sel +} + +; Should NOT be folded: different constants +define float @fcmp_select_no_fold_f32_different_const(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_f32_different_const: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
s_mov_b32 s4, 0x42487ed8 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x46487ed8 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f32_different_const: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x42487ed8, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x46487ed8, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0x40490FDB00000000 + %sel = select i1 %cmp, float 0x40C90FDB00000000, float %other + ret float %sel +} + +; Should NOT be folded: fcmp oeq with constant in other position +define float @fcmp_select_no_fold_f32_other_pos(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_f32_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x42487ed8 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x42487ed8 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f32_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42487ed8, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x42487ed8, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0x40490FDB00000000 + %sel = select i1 %cmp, float %other, float 0x40490FDB00000000 + ret float %sel +} + +; Should NOT be folded: unsupported comparison type +define float @fcmp_select_no_fold_f32_unsupported_cmp(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_f32_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x42487ed8 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x42487ed8 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, 
s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f32_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x42487ed8, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x42487ed8, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp olt float %arg, 0x40490FDB00000000 + %sel = select i1 %cmp, float %other, float 0x40490FDB00000000 + ret float %sel +} + +; Should NOT be folded: imm can be encoded into cndmask +define float @fcmp_select_no_fold_f32_enc_imm(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_f32_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, 1.0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f32_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 1.0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 1.0 + %sel = select i1 %cmp, float 1.0, float %other + ret float %sel +} + +; Should NOT be folded: imm can be encoded into cndmask +define float @fcmp_select_no_fold_f32_enc_imm_2(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_f32_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, -4.0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f32_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, -4.0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc_lo 
+; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float -4.0, %arg + %sel = select i1 %cmp, float %other, float -4.0 + ret float %sel +} + +; Should NOT be folded: fcmp oeq with zero constant +define float @fcmp_select_no_fold_oeq_f32_zero(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_oeq_f32_zero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_oeq_f32_zero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0.0 + %sel = select i1 %cmp, float 0.0, float %other + ret float %sel +} + +; Should NOT be folded: fcmp one with negative zero constant +define float @fcmp_select_no_fold_one_f32_negzero(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_one_f32_negzero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_brev_b32 s4, 1 +; GFX900-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_one_f32_negzero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x80000000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x80000000, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float -0.0, %arg ; 0x8000000000000000 + %sel = select i1 %cmp, float %other, float -0.0 ;0x8000000000000000 + ret float %sel +} + +; NaN values should bypass the optimization due to special IEEE 754 behavior +; fcmp oeq with NaN always returns 
false, so select always chooses %other +define float @fcmp_select_no_fold_oeq_f32_nan(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_oeq_f32_nan: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_oeq_f32_nan: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, v1 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0x7FF8000000000000 + %sel = select i1 %cmp, float 0x7FF8000000000000, float %other + ret float %sel +} + +; NaN values should bypass the optimization due to special IEEE 754 behavior +; fcmp one with NaN always returns false, so select always chooses the NaN constant +define float @fcmp_select_no_fold_one_f32_nan(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_one_f32_nan: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_one_f32_nan: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, 0x7fc00000 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float 0x7FF8000000000000, %arg + %sel = select i1 %cmp, float %other, float 0x7FF8000000000000 + ret float %sel +} + +; Should NOT be folded: fcmp oeq with positive infinity +; Infinity values should bypass the optimization, generating unfolded code +define float @fcmp_select_no_fold_posinf_oeq_f32(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_posinf_oeq_f32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; 
GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_posinf_oeq_f32: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x7f800000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq float %arg, 0x7FF0000000000000 + %sel = select i1 %cmp, float 0x7FF0000000000000, float %other + ret float %sel +} + +; Should NOT be folded: fcmp one with negative infinity +; Infinity values should bypass the optimization, generating unfolded code +define float @fcmp_select_no_fold_neginf_f32_one(float %arg, float %other) { +; GFX900-LABEL: fcmp_select_no_fold_neginf_f32_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xff800000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_neginf_f32_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0xff800000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xff800000, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one float 0xFFF0000000000000, %arg + %sel = select i1 %cmp, float %other, float 0xFFF0000000000000 + ret float %sel +} + +;------------------------------------------------------------------------------ +; F64 Tests +;------------------------------------------------------------------------------ + +; Should be folded: f64 fcmp oeq + select with constant in true value +define double @fcmp_select_fold_oeq_f64_imm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_f64_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX900-NEXT: s_mov_b32 s5, 0x400921fb +; GFX900-NEXT: v_cmp_eq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_f64_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX1010-NEXT: s_mov_b32 s5, 0x400921fb +; GFX1010-NEXT: v_cmp_eq_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 3.141592653589793 + %sel = select i1 %cmp, double 3.141592653589793, double %other + ret double %sel +} +; Should be folded: f64 fcmp oeq + select with constant in true value (commutative) +define double @fcmp_select_fold_oeq_imm_f64(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_imm_f64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX900-NEXT: s_mov_b32 s5, 0x400921fb +; GFX900-NEXT: v_cmp_eq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_imm_f64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX1010-NEXT: s_mov_b32 s5, 0x400921fb +; GFX1010-NEXT: v_cmp_eq_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double 3.141592653589793, %arg + %sel = select i1 %cmp, double 3.141592653589793, double %other + ret double %sel +} + +; Should be folded: f64 
fcmp one + select with constant in false value +define double @fcmp_select_fold_one_f64_imm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_fold_one_f64_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x8b145769 +; GFX900-NEXT: s_mov_b32 s5, 0x4005bf0a +; GFX900-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_f64_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x8b145769 +; GFX1010-NEXT: s_mov_b32 s5, 0x4005bf0a +; GFX1010-NEXT: v_cmp_lg_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double %arg, 2.718281828459045 + %sel = select i1 %cmp, double %other, double 2.718281828459045 + ret double %sel +} +; Should be folded: f64 fcmp one + select with constant in false value (commutative) +define double @fcmp_select_fold_one_imm_f64(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_fold_one_imm_f64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x8b145769 +; GFX900-NEXT: s_mov_b32 s5, 0x4005bf0a +; GFX900-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_imm_f64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x8b145769 +; GFX1010-NEXT: s_mov_b32 s5, 0x4005bf0a +; GFX1010-NEXT: v_cmp_lg_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; 
GFX1010-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double 2.718281828459045, %arg + %sel = select i1 %cmp, double %other, double 2.718281828459045 + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with constant in other position +define double @fcmp_select_no_fold_f64_other_pos(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_f64_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX900-NEXT: s_mov_b32 s5, 0x400921fb +; GFX900-NEXT: v_cmp_eq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x54442d18 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x400921fb +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f64_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX1010-NEXT: s_mov_b32 s5, 0x400921fb +; GFX1010-NEXT: v_cmp_eq_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x54442d18, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x400921fb, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 3.141592653589793 + %sel = select i1 %cmp, double %other, double 3.141592653589793 + ret double %sel +} + +; Should NOT be folded: f64 fcmp unsupported comparison type +define double @fcmp_select_no_fold_f64_unsupported_cmp(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_f64_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX900-NEXT: s_mov_b32 s5, 0x400921fb +; GFX900-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x54442d18 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x400921fb +; 
GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f64_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX1010-NEXT: s_mov_b32 s5, 0x400921fb +; GFX1010-NEXT: v_cmp_gt_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x54442d18, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x400921fb, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp olt double %arg, 3.141592653589793 + %sel = select i1 %cmp, double %other, double 3.141592653589793 + ret double %sel +} + +; Should NOT be folded: imm can be encoded into cndmask +define double @fcmp_select_no_fold_f64_enc_imm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_f64_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, 1.0, v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x3ff00000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f64_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 1.0, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x3ff00000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 1.0 + %sel = select i1 %cmp, double 1.0, double %other + ret double %sel +} + +; Should NOT be folded: imm can be encoded into cndmask +define double @fcmp_select_no_fold_f64_enc_imm_2(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_f64_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_cmp_lg_f64_e32 vcc, -4.0, v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0xc0100000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f64_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f64_e32 vcc_lo, -4.0, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0xc0100000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double -4.0, %arg + %sel = select i1 %cmp, double %other, double -4.0 + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with zero constant +define double @fcmp_select_no_fold_oeq_f64_zero(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_oeq_f64_zero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, 0, v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_oeq_f64_zero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 0.0 + %sel = select i1 %cmp, double 0.0, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp one with negative zero constant +define double @fcmp_select_no_fold_one_f64_negzero(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_one_f64_negzero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_brev_b32 s5, 1 +; GFX900-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], 
v[0:1] +; GFX900-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_one_f64_negzero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f64_e32 vcc_lo, 0x80000000, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x80000000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double -0.0, %arg + %sel = select i1 %cmp, double %other, double -0.0 + ret double %sel +} + +; Should NOT be folded: f64 different constants +define double @fcmp_select_no_fold_f64_different_const(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_f64_different_const: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX900-NEXT: s_mov_b32 s5, 0x400921fb +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x8b145769 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x4005bf0a +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f64_different_const: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0x54442d18 +; GFX1010-NEXT: s_mov_b32 s5, 0x400921fb +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x8b145769, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x4005bf0a, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 3.141592653589793 + %sel = select i1 %cmp, double 2.718281828459045, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with NaN constant +; fcmp oeq with NaN always returns false, so 
select always chooses %other +define double @fcmp_select_no_fold_nan_f64(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v1, v3 +; GFX1010-NEXT: v_mov_b32_e32 v0, v2 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 0x7FF8000000000000 + %sel = select i1 %cmp, double 0x7FF8000000000000, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with NaN constant (commutative variant) +; fcmp oeq with NaN always returns false, so select always chooses %other +define double @fcmp_select_no_fold_nan_f64_comm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f64_comm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v1, v3 +; GFX900-NEXT: v_mov_b32_e32 v0, v2 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f64_comm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v1, v3 +; GFX1010-NEXT: v_mov_b32_e32 v0, v2 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double 0x7FF8000000000000, %arg + %sel = select i1 %cmp, double 0x7FF8000000000000, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp one with NaN constant +; fcmp one with NaN always returns false, so select always chooses the NaN constant +define double @fcmp_select_no_fold_nan_f64_one(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f64_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f64_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double %arg, 0x7FF8000000000000 + %sel = select i1 %cmp, double %other, double 0x7FF8000000000000 + ret double %sel +} + +; Should NOT be folded: f64 fcmp one with NaN constant (commutative variant) +; fcmp one with NaN always returns false, so select always chooses the NaN constant +define double @fcmp_select_no_fold_nan_f64_one_comm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f64_one_comm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f64_one_comm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: v_mov_b32_e32 v1, 0x7ff80000 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one double 0x7FF8000000000000, %arg + %sel = select i1 %cmp, double %other, double 0x7FF8000000000000 + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with positive infinity +; Infinity values should bypass the optimization, generating unfolded code +define double @fcmp_select_no_fold_posinf_f64(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_posinf_f64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff00000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc 
+; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_posinf_f64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x7ff00000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 0x7FF0000000000000 + %sel = select i1 %cmp, double 0x7FF0000000000000, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with negative infinity +; Infinity values should bypass the optimization, generating unfolded code +define double @fcmp_select_no_fold_neginf_f64(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_neginf_f64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_mov_b32 s5, 0xfff00000 +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0xfff00000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_neginf_f64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0xfff00000, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0xfff00000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double %arg, 0xFFF0000000000000 + %sel = select i1 %cmp, double 0xFFF0000000000000, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with positive infinity (commutative variant) +; Infinity values should bypass the optimization, generating unfolded code +define double @fcmp_select_no_fold_posinf_f64_comm(double %arg, double %other) { +; 
GFX900-LABEL: fcmp_select_no_fold_posinf_f64_comm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0x7ff00000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_posinf_f64_comm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x7ff00000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq double 0x7FF0000000000000, %arg + %sel = select i1 %cmp, double 0x7FF0000000000000, double %other + ret double %sel +} + +; Should NOT be folded: f64 fcmp oeq with negative infinity (commutative variant) +; Infinity values should bypass the optimization, generating unfolded code +define double @fcmp_select_no_fold_neginf_f64_comm(double %arg, double %other) { +; GFX900-LABEL: fcmp_select_no_fold_neginf_f64_comm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0 +; GFX900-NEXT: s_mov_b32 s5, 0xfff00000 +; GFX900-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v1, 0xfff00000 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_neginf_f64_comm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0xfff00000, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0xfff00000, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] 
+entry: + %cmp = fcmp oeq double 0xFFF0000000000000, %arg + %sel = select i1 %cmp, double 0xFFF0000000000000, double %other + ret double %sel +} + +;------------------------------------------------------------------------------ +; F16 Tests +;------------------------------------------------------------------------------ + +; Should be folded: f16 fcmp oeq + select with constant in true value +define half @fcmp_select_fold_oeq_f16_imm(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_f16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4248 +; GFX900-NEXT: v_cmp_eq_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_f16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f16_e32 vcc_lo, 0x4248, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half %arg, 0xH4248 + %sel = select i1 %cmp, half 0xH4248, half %other + ret half %sel +} + +; Should be folded: f16 fcmp oeq + select with constant in true value (commutative) +define half @fcmp_select_fold_oeq_imm_f16(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_imm_f16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4248 +; GFX900-NEXT: v_cmp_eq_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_imm_f16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f16_e32 vcc_lo, 0x4248, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half 0xH4248, %arg + %sel = select i1 %cmp, half 0xH4248, half 
%other + ret half %sel +} + +; Should be folded: f16 fcmp one + select with constant in false value +define half @fcmp_select_fold_one_f16_imm(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_fold_one_f16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4020 +; GFX900-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_f16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x4020, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half %arg, 0xH4020 + %sel = select i1 %cmp, half %other, half 0xH4020 + ret half %sel +} + +; Should be folded: f16 fcmp one + select with constant in false value (commutative) +define half @fcmp_select_fold_one_imm_f16(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_fold_one_imm_f16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4020 +; GFX900-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_imm_f16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x4020, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half 0xH4020, %arg + %sel = select i1 %cmp, half %other, half 0xH4020 + ret half %sel +} + +; Should NOT be folded: different constants +define half @fcmp_select_no_fold_f16_different_const(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_f16_different_const: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: s_movk_i32 s4, 0x4248 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4300 +; GFX900-NEXT: v_cmp_neq_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f16_different_const: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0x4248, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4300, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half %arg, 0xH4248 + %sel = select i1 %cmp, half 0xH4300, half %other + ret half %sel +} + +; Should NOT be folded: NaN values bypass optimization +define half @fcmp_select_no_fold_nan_f16(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, v1 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half %arg, 0xH7e00 + %sel = select i1 %cmp, half 0xH7e00, half %other + ret half %sel +} + +; Should NOT be folded: f16 fcmp one with NaN constant +define half @fcmp_select_no_fold_nan_f16_one(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_f16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0x7e00 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_f16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, 0x7e00 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half %arg, 0xH7e00 + %sel = select i1 %cmp, half %other, half 0xH7e00 + ret half %sel +} + +; Should NOT be folded: f16 fcmp 
one with +Inf constant +define half @fcmp_select_no_fold_posinf_f16_one(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_posinf_f16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7c00 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7c00 +; GFX900-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_posinf_f16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x7c00, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half %arg, 0xH7c00 + %sel = select i1 %cmp, half %other, half 0xH7c00 + ret half %sel +} + +; Should NOT be folded: f16 fcmp one with -Inf constant +define half @fcmp_select_no_fold_neginf_f16_one(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_neginf_f16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xfc00 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xfc00 +; GFX900-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_neginf_f16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0xfc00, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xfc00, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half %arg, 0xHfc00 + %sel = select i1 %cmp, half %other, half 0xHfc00 + ret half %sel +} +; Should NOT be folded: f16 fcmp oeq with zero constant +define half @fcmp_select_no_fold_oeq_f16_zero(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_oeq_f16_zero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_cmp_neq_f16_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_oeq_f16_zero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_neq_f16_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half %arg, 0xH0000 + %sel = select i1 %cmp, half 0xH0000, half %other + ret half %sel +} +; Should NOT be folded: f16 fcmp one with negative zero constant +define half @fcmp_select_no_fold_one_f16_negzero(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_one_f16_negzero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0x8000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x8000 +; GFX900-NEXT: v_cmp_lg_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_one_f16_negzero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_lg_f16_e32 vcc_lo, 0x8000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x8000, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one half 0xH8000, %arg + %sel = select i1 %cmp, half %other, half 0xH8000 + ret half %sel +} + +; Should NOT be folded: f16 fcmp oeq with constant in other position +define half @fcmp_select_no_fold_f16_other_pos(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_f16_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4248 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4248 +; GFX900-NEXT: v_cmp_eq_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f16_other_pos: +; 
GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_f16_e32 vcc_lo, 0x4248, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4248, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq half %arg, 0xH4248 + %sel = select i1 %cmp, half %other, half 0xH4248 + ret half %sel +} + +; Should NOT be folded: f16 unsupported comparison type +define half @fcmp_select_no_fold_f16_unsupported_cmp(half %arg, half %other) { +; GFX900-LABEL: fcmp_select_no_fold_f16_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x4248 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4248 +; GFX900-NEXT: v_cmp_gt_f16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_f16_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x4248, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4248, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp olt half %arg, 0xH4248 + %sel = select i1 %cmp, half %other, half 0xH4248 + ret half %sel +} + +;------------------------------------------------------------------------------ +; BF16 Tests +;------------------------------------------------------------------------------ + +; Should be folded: bfloat fcmp oeq + select with constant in true value +define bfloat @fcmp_select_fold_oeq_bf16_imm(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_bf16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42480000 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_bf16_imm: +; GFX1010: 
; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42480000, v2 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat %arg, 0xR4248 + %sel = select i1 %cmp, bfloat 0xR4248, bfloat %other + ret bfloat %sel +} + +; Should be folded: bfloat fcmp oeq + select with constant in true value (commutative) +define bfloat @fcmp_select_fold_oeq_imm_bf16(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_fold_oeq_imm_bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42480000 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_oeq_imm_bf16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42480000, v2 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat 0xR4248, %arg + %sel = select i1 %cmp, bfloat 0xR4248, bfloat %other + ret bfloat %sel +} + +; Should be folded: bfloat fcmp one + select with constant in false value +define bfloat @fcmp_select_fold_one_bf16_imm(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_fold_one_bf16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x40200000 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_bf16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x40200000, v2 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat %arg, 0xR4020 + %sel = select i1 %cmp, bfloat %other, bfloat 0xR4020 + ret bfloat %sel +} + +; Should be folded: bfloat fcmp one + select with constant in false value (commutative) +define bfloat @fcmp_select_fold_one_imm_bf16(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_fold_one_imm_bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x40200000 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v2 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_fold_one_imm_bf16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x40200000, v2 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat 0xR4020, %arg + %sel = select i1 %cmp, bfloat %other, bfloat 0xR4020 + ret bfloat %sel +} + +; Should NOT be folded: different constants +define bfloat @fcmp_select_no_fold_bf16_different_const(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_bf16_different_const: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42480000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4300 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_bf16_different_const: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0x42480000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4300, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat %arg, 0xR4248 + %sel = select i1 %cmp, bfloat 0xR4300, bfloat %other + ret bfloat %sel +} + +; Should NOT be folded: NaN values bypass optimization +define bfloat @fcmp_select_no_fold_nan_bf16(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_bf16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_bf16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, v1 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat %arg, 0xR7FC0 + %sel = select i1 %cmp, bfloat 0xR7FC0, bfloat %other + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp one with NaN constant +define bfloat @fcmp_select_no_fold_nan_bf16_one(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_nan_bf16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0x7fc0 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_nan_bf16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v0, 0x7fc0 +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat %arg, 0xR7FC0 + %sel = select i1 %cmp, bfloat %other, bfloat 0xR7FC0 + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp one with +Inf constant +define bfloat @fcmp_select_no_fold_posinf_bf16_one(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_posinf_bf16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x7f800000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7f80 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_posinf_bf16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x7f800000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7f80, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat %arg, 0xR7F80 + %sel = select i1 %cmp, bfloat %other, bfloat 0xR7F80 + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp one with -Inf constant +define bfloat @fcmp_select_no_fold_neginf_bf16_one(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_neginf_bf16_one: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0xff800000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xffffff80 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_neginf_bf16_one: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0xff800000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xffffff80, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat %arg, 0xRFF80 + %sel = select i1 %cmp, bfloat %other, bfloat 0xRFF80 + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp oeq with zero constant +define bfloat @fcmp_select_no_fold_oeq_bf16_zero(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_oeq_bf16_zero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_oeq_bf16_zero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat %arg, 0xR0000 + %sel = select i1 %cmp, bfloat 0xR0000, bfloat %other + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp one with negative zero constant +define bfloat @fcmp_select_no_fold_one_bf16_negzero(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_one_bf16_negzero: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_brev_b32 s4, 1 +; GFX900-NEXT: v_mov_b32_e32 v2, 0xffff8000 +; GFX900-NEXT: v_cmp_lg_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_one_bf16_negzero: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_lg_f32_e32 vcc_lo, 0x80000000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xffff8000, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp one bfloat 0xR8000, %arg + %sel = select i1 %cmp, bfloat %other, bfloat 0xR8000 + ret bfloat %sel +} + +; Should NOT be folded: bfloat fcmp oeq with constant in other position +define bfloat @fcmp_select_no_fold_bf16_other_pos(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_bf16_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 
+; GFX900-NEXT: s_mov_b32 s4, 0x42480000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4248 +; GFX900-NEXT: v_cmp_eq_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_bf16_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0x42480000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4248, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp oeq bfloat %arg, 0xR4248 + %sel = select i1 %cmp, bfloat %other, bfloat 0xR4248 + ret bfloat %sel +} + +; Should NOT be folded: bfloat unsupported comparison type +define bfloat @fcmp_select_no_fold_bf16_unsupported_cmp(bfloat %arg, bfloat %other) { +; GFX900-LABEL: fcmp_select_no_fold_bf16_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX900-NEXT: s_mov_b32 s4, 0x42480000 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x4248 +; GFX900-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: fcmp_select_no_fold_bf16_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1010-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x42480000, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x4248, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = fcmp olt bfloat %arg, 0xR4248 + %sel = select i1 %cmp, bfloat %other, bfloat 0xR4248 + ret bfloat %sel +} diff --git a/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-int.ll b/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-int.ll new file mode 100644 index 0000000000000..4383cfd36f945 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select-cmp-shared-constant-int.ll @@ -0,0 +1,955 @@ +; NOTE: 
Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX1010 %s + +;------------------------------------------------------------------------------ +; I32 Tests +;------------------------------------------------------------------------------ + +; Should be folded: icmp eq + select with constant in true value +define i32 @icmp_select_fold_eq_i32_imm(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_fold_eq_i32_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_i32_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 %arg, 4242 + %sel = select i1 %cmp, i32 4242, i32 %other + ret i32 %sel +} + +; Should be folded: icmp eq + select with constant in true value (commutative) +define i32 @icmp_select_fold_eq_imm_i32(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_fold_eq_imm_i32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_imm_i32: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 4242, 
%arg + %sel = select i1 %cmp, i32 4242, i32 %other + ret i32 %sel +} + +; Should be folded: icmp ne + select with constant in false value +define i32 @icmp_select_fold_ne_i32_imm(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_fold_ne_i32_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_i32_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i32 %arg, 4242 + %sel = select i1 %cmp, i32 %other, i32 4242 + ret i32 %sel +} + +; Should be folded: icmp ne + select with constant in false value (commutative) +define i32 @icmp_select_fold_ne_imm_i32(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_fold_ne_imm_i32: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_imm_i32: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i32 4242, %arg + %sel = select i1 %cmp, i32 %other, i32 4242 + ret i32 %sel +} + +; Should NOT be folded: icmp eq with different constants +define i32 @icmp_select_no_fold_i32_different(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_different: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x978 +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i32_different: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x978, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 %arg, 4242 + %sel = select i1 %cmp, i32 2424, i32 %other + ret i32 %sel +} + +; Should NOT be folded: icmp eq with constant in other position +define i32 @icmp_select_no_fold_i32_other_pos(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x1092 +; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i32_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 %arg, 4242 + %sel = select i1 %cmp, i32 %other, i32 4242 + ret i32 %sel +} + +; Should NOT be folded: unsupported comparison type +define i32 @icmp_select_no_fold_i32_unsupported_cmp(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1094 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x102d +; GFX900-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: 
icmp_select_no_fold_i32_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x1094, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x102d, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ugt i32 %arg, 4243 + %sel = select i1 %cmp, i32 4141, i32 %other + ret i32 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i32 @icmp_select_no_fold_i32_enc_imm(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i32_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 %arg, 0 + %sel = select i1 %cmp, i32 0, i32 %other + ret i32 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i32 @icmp_select_no_fold_i32_enc_imm_2(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, 64, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i32_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, 64, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i32 64, %arg + %sel = select i1 %cmp, i32 64, i32 %other + ret i32 %sel +} + +; Should NOT be folded: immediate can be encoded 
into cndmask +define i32 @icmp_select_no_fold_i32_enc_imm_3(i32 %arg, i32 %other) { +; GFX900-LABEL: icmp_select_no_fold_i32_enc_imm_3: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u32_e32 vcc, -16, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, -16, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i32_enc_imm_3: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u32_e32 vcc_lo, -16, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, -16, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i32 %arg, -16 + %sel = select i1 %cmp, i32 %other, i32 -16 + ret i32 %sel +} + +;------------------------------------------------------------------------------ +; I64 Tests +;------------------------------------------------------------------------------ + +; Should be folded: icmp eq + select with constant in true value +define i64 @icmp_select_fold_eq_i64_imm(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_fold_eq_i64_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_i64_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 %arg, 424242424242 + %sel = select i1 %cmp, i64 424242424242, i64 %other + ret i64 %sel +} + +; 
Should be folded: icmp eq + select with constant in true value (commutative) +define i64 @icmp_select_fold_eq_imm_i64(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_fold_eq_imm_i64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_imm_i64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 424242424242, %arg + %sel = select i1 %cmp, i64 424242424242, i64 %other + ret i64 %sel +} + +; Should be folded: icmp ne + select with constant in false value +define i64 @icmp_select_fold_ne_i64_imm(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_fold_ne_i64_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_i64_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; 
GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i64 %arg, 424242424242 + %sel = select i1 %cmp, i64 %other, i64 424242424242 + ret i64 %sel +} + +; Should be folded: icmp ne + select with constant in false value (commutative) +define i64 @icmp_select_fold_ne_imm_i64(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_fold_ne_imm_i64: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_imm_i64: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i64 424242424242, %arg + %sel = select i1 %cmp, i64 %other, i64 424242424242 + ret i64 %sel +} + +; Should NOT be folded: icmp eq with different constants +define i64 @icmp_select_no_fold_i64_different(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_different: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0x719c60f8 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_different: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 
0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x719c60f8, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 %arg, 424242424242 + %sel = select i1 %cmp, i64 242424242424, i64 %other + ret i64 %sel +} + +; Should NOT be folded: icmp eq with constant in other position +define i64 @icmp_select_no_fold_i64_other_pos(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 0xc6d1a9b2 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x62 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b2 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xc6d1a9b2, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x62, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 %arg, 424242424242 + %sel = select i1 %cmp, i64 %other, i64 424242424242 + ret i64 %sel +} + +; Should NOT be folded: unsupported comparison type +define i64 @icmp_select_no_fold_i64_unsupported_cmp(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_mov_b32 s4, 0xc6d1a9b3 +; GFX900-NEXT: s_movk_i32 s5, 0x62 +; GFX900-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] +; GFX900-NEXT: v_mov_b32_e32 v4, 
0xc6d1a9b2 +; GFX900-NEXT: v_mov_b32_e32 v1, 0x62 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: s_mov_b32 s4, 0xc6d1a9b3 +; GFX1010-NEXT: s_movk_i32 s5, 0x62 +; GFX1010-NEXT: v_cmp_gt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0xc6d1a9b2, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0x62, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ugt i64 %arg, 424242424242 + %sel = select i1 %cmp, i64 424242424242, i64 %other + ret i64 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i64 @icmp_select_no_fold_i64_enc_imm(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 %arg, 0 + %sel = select i1 %cmp, i64 0, i64 %other + ret i64 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i64 @icmp_select_no_fold_i64_enc_imm_2(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, 32, v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, 32, 
v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, 32, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 32, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i64 32, %arg + %sel = select i1 %cmp, i64 32, i64 %other + ret i64 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i64 @icmp_select_no_fold_i64_enc_imm_3(i64 %arg, i64 %other) { +; GFX900-LABEL: icmp_select_no_fold_i64_enc_imm_3: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u64_e32 vcc, -8, v[0:1] +; GFX900-NEXT: v_cndmask_b32_e32 v0, -8, v2, vcc +; GFX900-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i64_enc_imm_3: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u64_e32 vcc_lo, -8, v[0:1] +; GFX1010-NEXT: v_cndmask_b32_e32 v0, -8, v2, vcc_lo +; GFX1010-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i64 %arg, -8 + %sel = select i1 %cmp, i64 %other, i64 -8 + ret i64 %sel +} + +;------------------------------------------------------------------------------ +; I16 Tests +;------------------------------------------------------------------------------ + +; Should be folded: icmp eq + select with constant in true value +define i16 @icmp_select_fold_eq_i16_imm(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_fold_eq_i16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 
v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_i16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 %arg, 4242 + %sel = select i1 %cmp, i16 4242, i16 %other + ret i16 %sel +} + +; Should be folded: icmp eq + select with constant in true value (commutative) +define i16 @icmp_select_fold_eq_imm_i16(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_fold_eq_imm_i16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_imm_i16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 4242, %arg + %sel = select i1 %cmp, i16 4242, i16 %other + ret i16 %sel +} + +; Should be folded: icmp ne + select with constant in false value +define i16 @icmp_select_fold_ne_i16_imm(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_fold_ne_i16_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_i16_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + 
%cmp = icmp ne i16 %arg, 4242 + %sel = select i1 %cmp, i16 %other, i16 4242 + ret i16 %sel +} + +; Should be folded: icmp ne + select with constant in false value (commutative) +define i16 @icmp_select_fold_ne_imm_i16(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_fold_ne_imm_i16: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_imm_i16: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i16 4242, %arg + %sel = select i1 %cmp, i16 %other, i16 4242 + ret i16 %sel +} + +; Should NOT be folded: icmp eq with different constants +define i16 @icmp_select_no_fold_i16_different(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_different: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x978 +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_different: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x978, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 %arg, 4242 + %sel = select i1 %cmp, i16 2424, i16 %other + ret i16 %sel +} + +; Should NOT be folded: icmp eq with constant in other position +define i16 @icmp_select_no_fold_i16_other_pos(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_other_pos: +; 
GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1092 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x1092 +; GFX900-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x1092, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 %arg, 4242 + %sel = select i1 %cmp, i16 %other, i16 4242 + ret i16 %sel +} + +; Should NOT be folded: unsupported comparison type +define i16 @icmp_select_no_fold_i16_unsupported_cmp(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x1093 +; GFX900-NEXT: v_mov_b32_e32 v2, 0x1092 +; GFX900-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x1093, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x1092, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ugt i16 %arg, 4242 + %sel = select i1 %cmp, i16 4242, i16 %other + ret i16 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i16 @icmp_select_no_fold_i16_enc_imm(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: 
icmp_select_no_fold_i16_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 %arg, 0 + %sel = select i1 %cmp, i16 0, i16 %other + ret i16 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i16 @icmp_select_no_fold_i16_enc_imm_2(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, 45, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, 45, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, 45, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 45, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i16 45, %arg + %sel = select i1 %cmp, i16 45, i16 %other + ret i16 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i16 @icmp_select_no_fold_i16_enc_imm_3(i16 %arg, i16 %other) { +; GFX900-LABEL: icmp_select_no_fold_i16_enc_imm_3: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_cmp_ne_u16_e32 vcc, -12, v0 +; GFX900-NEXT: v_cndmask_b32_e32 v0, -12, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i16_enc_imm_3: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_cmp_ne_u16_e32 vcc_lo, -12, v0 +; GFX1010-NEXT: v_cndmask_b32_e32 v0, -12, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i16 %arg, -12 + %sel = select i1 %cmp, i16 %other, i16 -12 + ret i16 %sel +} + 
+;------------------------------------------------------------------------------ +; I8 Tests +;------------------------------------------------------------------------------ + +; Should be folded: icmp eq + select with constant in true value +define i8 @icmp_select_fold_eq_i8_imm(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_fold_eq_i8_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_cmp_eq_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_i8_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_eq_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 %arg, 123 + %sel = select i1 %cmp, i8 123, i8 %other + ret i8 %sel +} + +; Should be folded: icmp eq + select with constant in true value (commutative) +define i8 @icmp_select_fold_eq_imm_i8(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_fold_eq_imm_i8: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_cmp_eq_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_eq_imm_i8: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_eq_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 123, %arg + %sel = select i1 %cmp, i8 123, i8 %other + ret i8 %sel +} 
+ +; Should be folded: icmp ne + select with constant in false value +define i8 @icmp_select_fold_ne_i8_imm(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_fold_ne_i8_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_i8_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i8 %arg, 123 + %sel = select i1 %cmp, i8 %other, i8 123 + ret i8 %sel +} + +; Should be folded: icmp ne + select with constant in false value (commutative) +define i8 @icmp_select_fold_ne_imm_i8(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_fold_ne_imm_i8: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_fold_ne_imm_i8: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i8 123, %arg + %sel = select i1 %cmp, i8 %other, i8 123 + ret i8 %sel +} + +; Should NOT be folded: icmp eq with different constants +define i8 @icmp_select_no_fold_i8_different(i8 %arg, i8 %other) { +; GFX900-LABEL: 
icmp_select_no_fold_i8_different: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7c +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_different: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7c, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 %arg, 123 + %sel = select i1 %cmp, i8 124, i8 %other + ret i8 %sel +} + +; Should NOT be folded: icmp eq with constant in other position +define i8 @icmp_select_no_fold_i8_other_pos(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_other_pos: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7b +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX900-NEXT: v_cmp_eq_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_other_pos: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX1010-NEXT: v_cmp_eq_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7b, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 %arg, 123 + %sel = select i1 %cmp, i8 %other, i8 123 + ret i8 %sel +} + +; Should NOT be folded: unsupported comparison type +define i8 @icmp_select_no_fold_i8_unsupported_cmp(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_unsupported_cmp: +; GFX900: ; %bb.0: ; %entry +; 
GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0x7c +; GFX900-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX900-NEXT: v_cmp_lt_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_unsupported_cmp: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0x7c +; GFX1010-NEXT: v_cmp_lt_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0x7b, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ugt i8 %arg, 123 + %sel = select i1 %cmp, i8 123, i8 %other + ret i8 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i8 @icmp_select_no_fold_i8_enc_imm(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_enc_imm: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 0 +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_enc_imm: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0 +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 %arg, 0 + %sel = select i1 %cmp, i8 0, i8 %other + ret i8 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i8 @icmp_select_no_fold_i8_enc_imm_2(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_enc_imm_2: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v2, 25 +; GFX900-NEXT: v_cmp_ne_u16_sdwa 
vcc, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, 25, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_enc_imm_2: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 25 +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, 25, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp eq i8 25, %arg + %sel = select i1 %cmp, i8 25, i8 %other + ret i8 %sel +} + +; Should NOT be folded: immediate can be encoded into cndmask +define i8 @icmp_select_no_fold_i8_enc_imm_3(i8 %arg, i8 %other) { +; GFX900-LABEL: icmp_select_no_fold_i8_enc_imm_3: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_movk_i32 s4, 0xfb +; GFX900-NEXT: v_cmp_ne_u16_sdwa vcc, v0, s4 src0_sel:BYTE_0 src1_sel:DWORD +; GFX900-NEXT: v_cndmask_b32_e32 v0, -5, v1, vcc +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX1010-LABEL: icmp_select_no_fold_i8_enc_imm_3: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1010-NEXT: v_mov_b32_e32 v2, 0xfb +; GFX1010-NEXT: v_cmp_ne_u16_sdwa vcc_lo, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD +; GFX1010-NEXT: v_cndmask_b32_e32 v0, -5, v1, vcc_lo +; GFX1010-NEXT: s_setpc_b64 s[30:31] +entry: + %cmp = icmp ne i8 %arg, -5 + %sel = select i1 %cmp, i8 %other, i8 -5 + ret i8 %sel +} From cb6d1bbfcd01866ef7a3027968e6480ef1c2c992 Mon Sep 17 00:00:00 2001 From: Guy David <49722543+guy-david@users.noreply.github.com> Date: Thu, 17 Jul 2025 00:29:38 +0300 Subject: [PATCH 107/813] [PowerPC] Test SPE incompatibility with VSX (#147184) PPCSubtarget is not always initialized, depending on which passes are running, and in our downstream fork, -enable-matrix is the default configuration (regardless of whether matrix intrinsics are present in the IR), which triggers a fatal error in 
builtins-ppc-fpconstrained.c. --- clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c | 2 +- llvm/test/CodeGen/PowerPC/spe-vsx-incompatibility.ll | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/PowerPC/spe-vsx-incompatibility.ll diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c b/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c index 838db02415fe5..b46fa9f2cf157 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-fpconstrained.c @@ -11,7 +11,7 @@ // RUN: -S -ffp-exception-behavior=strict \ // RUN: -o - %s | FileCheck --check-prefix=CHECK-ASM \ // RUN: --check-prefix=FIXME-CHECK %s -// RUN: %clang_cc1 -triple powerpcspe -ffp-exception-behavior=strict \ +// RUN: %clang_cc1 -triple powerpc -ffp-exception-behavior=strict \ // RUN: -target-feature +vsx -fexperimental-strict-floating-point -emit-llvm \ // RUN: %s -o - | FileCheck --check-prefix=CHECK-CONSTRAINED %s diff --git a/llvm/test/CodeGen/PowerPC/spe-vsx-incompatibility.ll b/llvm/test/CodeGen/PowerPC/spe-vsx-incompatibility.ll new file mode 100644 index 0000000000000..06c8f9a3b4bb6 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/spe-vsx-incompatibility.ll @@ -0,0 +1,8 @@ +; Adding -enable-matrix, which is disabled by default, forces the initialization +; of the PPCSubtarget which verifies the incompatible CPU features. 
+; RUN: not llc -mtriple=powerpcspe -mattr=+vsx -enable-matrix < %s 2>&1 | FileCheck %s + +; CHECK: SPE and traditional floating point cannot both be enabled +define void @test() { + ret void +} From 9912ccb0b4d17a4dd4ef8df718b63e3a907ad7c5 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 16 Jul 2025 14:35:07 -0700 Subject: [PATCH 108/813] [AMDGPU] gfx1250 MC support for FLAT GVS addressing (#149173) --- llvm/lib/Target/AMDGPU/AMDGPU.td | 10 + llvm/lib/Target/AMDGPU/FLATInstructions.td | 296 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s | 132 + .../AMDGPU/gfx1250_dasm_vflat.txt | 2826 +++++++++++++++++ 5 files changed, 3144 insertions(+), 123 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index b2b2b3721a00c..faf59c1541fc0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -89,6 +89,12 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch", "Use scratch_* flat memory instructions to access scratch" >; +def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode", + "FlatGVSMode", + "true", + "Have GVS addressing mode with flat_* instructions" +>; + def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts", "AddNoCarryInsts", "true", @@ -1954,6 +1960,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureShaderCyclesHiLoRegisters, FeatureArchitectedFlatScratch, FeatureArchitectedSGPRs, + FeatureFlatGVSMode, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, FeatureAtomicDsPkAdd16Insts, @@ -2381,6 +2388,9 @@ def HasFlatScratchSTMode : Predicate<"Subtarget->hasFlatScratchSTMode()">, def HasFlatScratchSVSMode : Predicate<"Subtarget->hasFlatScratchSVSMode()">, AssemblerPredicate<(any_of FeatureGFX940Insts, FeatureGFX11Insts)>; +def HasFlatGVSMode : Predicate<"Subtarget->hasFlatGVSMode()">, + AssemblerPredicate<(all_of FeatureFlatGVSMode)>; + def HasGFX10_AEncoding : 
Predicate<"Subtarget->hasGFX10_AEncoding()">, AssemblerPredicate<(all_of FeatureGFX10_AEncoding)>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 3625db9a4791f..06e23dbb92450 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -237,10 +237,18 @@ class FLAT_Load_Pseudo< let DisableEncoding = !if(HasTiedOutput, "$vdst_in", ""); } -multiclass FLAT_Load_Pseudo_t16 { - def "" : FLAT_Load_Pseudo; +multiclass FLAT_Flat_Load_Pseudo { + def "" : FLAT_Load_Pseudo, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Load_Pseudo, + GlobalSaddrTable<1, opName>; +} + +multiclass FLAT_Flat_Load_Pseudo_t16 { + defm "" : FLAT_Flat_Load_Pseudo; let True16Predicate = UseRealTrue16Insts in - def _t16 : FLAT_Load_Pseudo, True16D16Table; + defm _t16 : FLAT_Flat_Load_Pseudo, True16D16Table; } class FLAT_Store_Pseudo { - def "" : FLAT_Store_Pseudo; - let OtherPredicates = [HasTrue16BitInsts] in - def _t16 : FLAT_Store_Pseudo, True16D16Table; +multiclass FLAT_Flat_Store_Pseudo { + def "" : FLAT_Store_Pseudo, + GlobalSaddrTable<0, opName>; + let OtherPredicates = [HasFlatGVSMode] in + def _SADDR : FLAT_Store_Pseudo, + GlobalSaddrTable<1, opName>; +} + +multiclass FLAT_Flat_Store_Pseudo_t16 { + defm "" : FLAT_Flat_Store_Pseudo; + + defvar Name16 = opName#"_t16"; + let OtherPredicates = [HasFlatGVSMode, HasTrue16BitInsts] in { + def _t16 : FLAT_Store_Pseudo, + GlobalSaddrTable<0, Name16>, + True16D16Table; + def _SADDR_t16 : FLAT_Store_Pseudo, + GlobalSaddrTable<1, Name16>, + True16D16Table; + } } multiclass FLAT_Global_Load_Pseudo { @@ -657,6 +681,18 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN< let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + def _SADDR : FLAT_AtomicNoRet_Pseudo , + GlobalSaddrTable<1, opName> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let 
enabled_saddr = 1; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo_RTN< @@ -665,15 +701,29 @@ multiclass FLAT_Atomic_Pseudo_RTN< ValueType vt, ValueType data_vt = vt, RegisterClass data_rc = vdst_rc, - RegisterOperand data_op = getLdStRegisterOperand.ret> { + RegisterOperand data_op = getLdStRegisterOperand.ret, + RegisterOperand vdst_op = getLdStRegisterOperand.ret> { def _RTN : FLAT_AtomicRet_Pseudo .ret:$vdst), + (outs vdst_op:$vdst), (ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol), " $vdst, $vaddr, $vdata$offset$cpol">, GlobalSaddrTable<0, opName#"_rtn"> { let FPAtomic = data_vt.isFP; let AddedComplexity = -1; // Prefer global atomics if available } + + def _SADDR_RTN : FLAT_AtomicRet_Pseudo , + GlobalSaddrTable<1, opName#"_rtn"> { + let OtherPredicates = [HasFlatGVSMode]; + let has_saddr = 1; + let enabled_saddr = 1; + let PseudoInstr = NAME#"_SADDR_RTN"; + let FPAtomic = data_vt.isFP; + let AddedComplexity = -1; // Prefer global atomics if available + } } multiclass FLAT_Atomic_Pseudo< @@ -762,36 +812,36 @@ multiclass FLAT_Global_Atomic_Pseudo< // Flat Instructions //===----------------------------------------------------------------------===// -def FLAT_LOAD_UBYTE : FLAT_Load_Pseudo <"flat_load_ubyte", VGPR_32>; -def FLAT_LOAD_SBYTE : FLAT_Load_Pseudo <"flat_load_sbyte", VGPR_32>; -def FLAT_LOAD_USHORT : FLAT_Load_Pseudo <"flat_load_ushort", VGPR_32>; -def FLAT_LOAD_SSHORT : FLAT_Load_Pseudo <"flat_load_sshort", VGPR_32>; -def FLAT_LOAD_DWORD : FLAT_Load_Pseudo <"flat_load_dword", VGPR_32>; -def FLAT_LOAD_DWORDX2 : FLAT_Load_Pseudo <"flat_load_dwordx2", VReg_64>; -def FLAT_LOAD_DWORDX4 : FLAT_Load_Pseudo <"flat_load_dwordx4", VReg_128>; -def FLAT_LOAD_DWORDX3 : FLAT_Load_Pseudo <"flat_load_dwordx3", VReg_96>; +defm FLAT_LOAD_UBYTE : FLAT_Flat_Load_Pseudo <"flat_load_ubyte", VGPR_32>; +defm FLAT_LOAD_SBYTE : FLAT_Flat_Load_Pseudo <"flat_load_sbyte", 
VGPR_32>; +defm FLAT_LOAD_USHORT : FLAT_Flat_Load_Pseudo <"flat_load_ushort", VGPR_32>; +defm FLAT_LOAD_SSHORT : FLAT_Flat_Load_Pseudo <"flat_load_sshort", VGPR_32>; +defm FLAT_LOAD_DWORD : FLAT_Flat_Load_Pseudo <"flat_load_dword", VGPR_32>; +defm FLAT_LOAD_DWORDX2 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx2", VReg_64>; +defm FLAT_LOAD_DWORDX4 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx4", VReg_128>; +defm FLAT_LOAD_DWORDX3 : FLAT_Flat_Load_Pseudo <"flat_load_dwordx3", VReg_96>; -def FLAT_STORE_DWORD : FLAT_Store_Pseudo <"flat_store_dword", VGPR_32>; -def FLAT_STORE_DWORDX2 : FLAT_Store_Pseudo <"flat_store_dwordx2", VReg_64>; -def FLAT_STORE_DWORDX4 : FLAT_Store_Pseudo <"flat_store_dwordx4", VReg_128>; -def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>; +defm FLAT_STORE_DWORD : FLAT_Flat_Store_Pseudo <"flat_store_dword", VGPR_32>; +defm FLAT_STORE_DWORDX2 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx2", VReg_64>; +defm FLAT_STORE_DWORDX4 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx4", VReg_128>; +defm FLAT_STORE_DWORDX3 : FLAT_Flat_Store_Pseudo <"flat_store_dwordx3", VReg_96>; let SubtargetPredicate = HasD16LoadStore in { let TiedSourceNotRead = 1 in { -def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_ubyte_d16">; -def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo_t16 <"flat_load_sbyte_d16">; -def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; -defm FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo_t16 <"flat_load_short_d16">; +defm FLAT_LOAD_UBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_UBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_ubyte_d16">; +defm FLAT_LOAD_SBYTE_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SBYTE_D16 : FLAT_Flat_Load_Pseudo_t16 
<"flat_load_sbyte_d16">; +defm FLAT_LOAD_SHORT_D16_HI : FLAT_Flat_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; +defm FLAT_LOAD_SHORT_D16 : FLAT_Flat_Load_Pseudo_t16 <"flat_load_short_d16">; } -def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; -def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; +defm FLAT_STORE_BYTE_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; +defm FLAT_STORE_SHORT_D16_HI : FLAT_Flat_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; } -defm FLAT_STORE_BYTE : FLAT_Store_Pseudo_t16 <"flat_store_byte">; -defm FLAT_STORE_SHORT : FLAT_Store_Pseudo_t16 <"flat_store_short">; +defm FLAT_STORE_BYTE : FLAT_Flat_Store_Pseudo_t16 <"flat_store_byte">; +defm FLAT_STORE_SHORT : FLAT_Flat_Store_Pseudo_t16 <"flat_store_short">; defm FLAT_ATOMIC_CMPSWAP : FLAT_Atomic_Pseudo <"flat_atomic_cmpswap", VGPR_32, i32, v2i32, VReg_64>; @@ -2832,11 +2882,11 @@ multiclass VFLAT_Real_Base_gfx12 op, VFLAT_Aliases_gfx12, VFLAT_Real_gfx12; -multiclass VFLAT_Real_Atomics_gfx12 op, - string name = get_FLAT_ps.Mnemonic, - string alias = name> : +multiclass VFLAT_Real_AllAddr_gfx12 op, + string name = get_FLAT_ps.Mnemonic, + string alias = name> : VFLAT_Real_Base_gfx12 { - defm _RTN : VFLAT_Real_gfx12; + defm _SADDR : VFLAT_Real_gfx12; } multiclass VGLOBAL_Real_AllAddr_gfx12 op, @@ -2853,7 +2903,7 @@ multiclass VGLOBAL_Real_AllAddr_gfx1200 op> { } } -multiclass VGLOBAL_Real_AllAddr_gfx12_w64 op, +multiclass VFLAT_Real_AllAddr_gfx12_w64 op, string name = get_FLAT_ps.Mnemonic> : VFLAT_Aliases_gfx12 { let DecoderNamespace = "GFX12W64" in { @@ -2862,10 +2912,10 @@ multiclass VGLOBAL_Real_AllAddr_gfx12_w64 op, } } -multiclass VGLOBAL_Real_Atomics_gfx12 op, +multiclass VFLAT_Real_Atomics_gfx12 op, string name = get_FLAT_ps.Mnemonic, string alias = name> : - VGLOBAL_Real_AllAddr_gfx12 { + VFLAT_Real_AllAddr_gfx12 { defm _RTN : VFLAT_Real_gfx12; defm _SADDR_RTN : VFLAT_Real_gfx12; } @@ 
-2879,28 +2929,28 @@ multiclass VSCRATCH_Real_AllAddr_gfx12 op, } // ENC_VFLAT. -defm FLAT_LOAD_UBYTE : VFLAT_Real_Base_gfx12<0x010, "flat_load_u8">; -defm FLAT_LOAD_SBYTE : VFLAT_Real_Base_gfx12<0x011, "flat_load_i8">; -defm FLAT_LOAD_USHORT : VFLAT_Real_Base_gfx12<0x012, "flat_load_u16">; -defm FLAT_LOAD_SSHORT : VFLAT_Real_Base_gfx12<0x013, "flat_load_i16">; -defm FLAT_LOAD_DWORD : VFLAT_Real_Base_gfx12<0x014, "flat_load_b32">; -defm FLAT_LOAD_DWORDX2 : VFLAT_Real_Base_gfx12<0x015, "flat_load_b64">; -defm FLAT_LOAD_DWORDX3 : VFLAT_Real_Base_gfx12<0x016, "flat_load_b96">; -defm FLAT_LOAD_DWORDX4 : VFLAT_Real_Base_gfx12<0x017, "flat_load_b128">; -defm FLAT_STORE_BYTE : VFLAT_Real_Base_gfx12<0x018, "flat_store_b8">; -defm FLAT_STORE_SHORT : VFLAT_Real_Base_gfx12<0x019, "flat_store_b16">; -defm FLAT_STORE_DWORD : VFLAT_Real_Base_gfx12<0x01a, "flat_store_b32">; -defm FLAT_STORE_DWORDX2 : VFLAT_Real_Base_gfx12<0x01b, "flat_store_b64">; -defm FLAT_STORE_DWORDX3 : VFLAT_Real_Base_gfx12<0x01c, "flat_store_b96">; -defm FLAT_STORE_DWORDX4 : VFLAT_Real_Base_gfx12<0x01d, "flat_store_b128">; -defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_Base_gfx12<0x01e, "flat_load_d16_u8">; -defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_Base_gfx12<0x01f, "flat_load_d16_i8">; -defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_Base_gfx12<0x020, "flat_load_d16_b16">; -defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x021, "flat_load_d16_hi_u8">; -defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_Base_gfx12<0x022, "flat_load_d16_hi_i8">; -defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x023, "flat_load_d16_hi_b16">; -defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_Base_gfx12<0x024, "flat_store_d16_hi_b8">; -defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_Base_gfx12<0x025, "flat_store_d16_hi_b16">; +defm FLAT_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "flat_load_u8">; +defm FLAT_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "flat_load_i8">; +defm FLAT_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "flat_load_u16">; +defm 
FLAT_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "flat_load_i16">; +defm FLAT_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "flat_load_b32">; +defm FLAT_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "flat_load_b64">; +defm FLAT_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "flat_load_b96">; +defm FLAT_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "flat_load_b128">; +defm FLAT_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "flat_store_b8">; +defm FLAT_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "flat_store_b16">; +defm FLAT_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "flat_store_b32">; +defm FLAT_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "flat_store_b64">; +defm FLAT_STORE_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x01c, "flat_store_b96">; +defm FLAT_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "flat_store_b128">; +defm FLAT_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "flat_load_d16_u8">; +defm FLAT_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "flat_load_d16_i8">; +defm FLAT_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "flat_load_d16_b16">; +defm FLAT_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "flat_load_d16_hi_u8">; +defm FLAT_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "flat_load_d16_hi_i8">; +defm FLAT_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "flat_load_d16_hi_b16">; +defm FLAT_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "flat_store_d16_hi_b8">; +defm FLAT_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "flat_store_d16_hi_b16">; defm FLAT_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "flat_atomic_swap_b32">; defm FLAT_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "flat_atomic_cmpswap_b32">; defm FLAT_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "flat_atomic_add_u32">; @@ -2936,74 +2986,74 @@ defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; // ENC_VGLOBAL. 
-defm GLOBAL_LOAD_UBYTE : VGLOBAL_Real_AllAddr_gfx12<0x010, "global_load_u8">; -defm GLOBAL_LOAD_SBYTE : VGLOBAL_Real_AllAddr_gfx12<0x011, "global_load_i8">; -defm GLOBAL_LOAD_USHORT : VGLOBAL_Real_AllAddr_gfx12<0x012, "global_load_u16">; -defm GLOBAL_LOAD_SSHORT : VGLOBAL_Real_AllAddr_gfx12<0x013, "global_load_i16">; -defm GLOBAL_LOAD_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x014, "global_load_b32">; -defm GLOBAL_LOAD_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x015, "global_load_b64">; -defm GLOBAL_LOAD_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x016, "global_load_b96">; -defm GLOBAL_LOAD_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x017, "global_load_b128">; -defm GLOBAL_STORE_BYTE : VGLOBAL_Real_AllAddr_gfx12<0x018, "global_store_b8">; -defm GLOBAL_STORE_SHORT : VGLOBAL_Real_AllAddr_gfx12<0x019, "global_store_b16">; -defm GLOBAL_STORE_DWORD : VGLOBAL_Real_AllAddr_gfx12<0x01a, "global_store_b32">; -defm GLOBAL_STORE_DWORDX2 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "global_store_b64">; -defm GLOBAL_STORE_DWORDX3 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "global_store_b96">; -defm GLOBAL_STORE_DWORDX4 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "global_store_b128">; -defm GLOBAL_LOAD_UBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; -defm GLOBAL_LOAD_SBYTE_D16 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; -defm GLOBAL_LOAD_SHORT_D16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; -defm GLOBAL_LOAD_UBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; -defm GLOBAL_LOAD_SBYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; -defm GLOBAL_LOAD_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; -defm GLOBAL_STORE_BYTE_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">; -defm GLOBAL_STORE_SHORT_D16_HI : VGLOBAL_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; -defm GLOBAL_LOAD_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; -defm 
GLOBAL_STORE_DWORD_ADDTID : VGLOBAL_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; -defm GLOBAL_LOAD_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x053>; -defm GLOBAL_STORE_BLOCK : VGLOBAL_Real_AllAddr_gfx12<0x054>; - -defm GLOBAL_ATOMIC_SWAP : VGLOBAL_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; -defm GLOBAL_ATOMIC_CMPSWAP : VGLOBAL_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; -defm GLOBAL_ATOMIC_ADD : VGLOBAL_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; -defm GLOBAL_ATOMIC_SUB : VGLOBAL_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; -defm GLOBAL_ATOMIC_CSUB : VGLOBAL_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; -defm GLOBAL_ATOMIC_SMIN : VGLOBAL_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; -defm GLOBAL_ATOMIC_UMIN : VGLOBAL_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">; -defm GLOBAL_ATOMIC_SMAX : VGLOBAL_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; -defm GLOBAL_ATOMIC_UMAX : VGLOBAL_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; -defm GLOBAL_ATOMIC_AND : VGLOBAL_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; -defm GLOBAL_ATOMIC_OR : VGLOBAL_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; -defm GLOBAL_ATOMIC_XOR : VGLOBAL_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">; -defm GLOBAL_ATOMIC_INC : VGLOBAL_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; -defm GLOBAL_ATOMIC_DEC : VGLOBAL_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; -defm GLOBAL_ATOMIC_SWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; -defm GLOBAL_ATOMIC_CMPSWAP_X2 : VGLOBAL_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; -defm GLOBAL_ATOMIC_ADD_X2 : VGLOBAL_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; -defm GLOBAL_ATOMIC_SUB_X2 : VGLOBAL_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; -defm GLOBAL_ATOMIC_SMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; -defm GLOBAL_ATOMIC_UMIN_X2 : VGLOBAL_Real_Atomics_gfx12<0x046, 
"global_atomic_min_u64">; -defm GLOBAL_ATOMIC_SMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; -defm GLOBAL_ATOMIC_UMAX_X2 : VGLOBAL_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; -defm GLOBAL_ATOMIC_AND_X2 : VGLOBAL_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; -defm GLOBAL_ATOMIC_OR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; -defm GLOBAL_ATOMIC_XOR_X2 : VGLOBAL_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; -defm GLOBAL_ATOMIC_INC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; -defm GLOBAL_ATOMIC_DEC_X2 : VGLOBAL_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; -defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050>; -defm GLOBAL_ATOMIC_FMIN : VGLOBAL_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; -defm GLOBAL_ATOMIC_FMAX : VGLOBAL_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">; -defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>; +defm GLOBAL_LOAD_UBYTE : VFLAT_Real_AllAddr_gfx12<0x010, "global_load_u8">; +defm GLOBAL_LOAD_SBYTE : VFLAT_Real_AllAddr_gfx12<0x011, "global_load_i8">; +defm GLOBAL_LOAD_USHORT : VFLAT_Real_AllAddr_gfx12<0x012, "global_load_u16">; +defm GLOBAL_LOAD_SSHORT : VFLAT_Real_AllAddr_gfx12<0x013, "global_load_i16">; +defm GLOBAL_LOAD_DWORD : VFLAT_Real_AllAddr_gfx12<0x014, "global_load_b32">; +defm GLOBAL_LOAD_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x015, "global_load_b64">; +defm GLOBAL_LOAD_DWORDX3 : VFLAT_Real_AllAddr_gfx12<0x016, "global_load_b96">; +defm GLOBAL_LOAD_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x017, "global_load_b128">; +defm GLOBAL_STORE_BYTE : VFLAT_Real_AllAddr_gfx12<0x018, "global_store_b8">; +defm GLOBAL_STORE_SHORT : VFLAT_Real_AllAddr_gfx12<0x019, "global_store_b16">; +defm GLOBAL_STORE_DWORD : VFLAT_Real_AllAddr_gfx12<0x01a, "global_store_b32">; +defm GLOBAL_STORE_DWORDX2 : VFLAT_Real_AllAddr_gfx12<0x01b, "global_store_b64">; +defm GLOBAL_STORE_DWORDX3 : 
VFLAT_Real_AllAddr_gfx12<0x01c, "global_store_b96">; +defm GLOBAL_STORE_DWORDX4 : VFLAT_Real_AllAddr_gfx12<0x01d, "global_store_b128">; +defm GLOBAL_LOAD_UBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01e, "global_load_d16_u8">; +defm GLOBAL_LOAD_SBYTE_D16 : VFLAT_Real_AllAddr_gfx12<0x01f, "global_load_d16_i8">; +defm GLOBAL_LOAD_SHORT_D16 : VFLAT_Real_AllAddr_gfx12<0x020, "global_load_d16_b16">; +defm GLOBAL_LOAD_UBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x021, "global_load_d16_hi_u8">; +defm GLOBAL_LOAD_SBYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x022, "global_load_d16_hi_i8">; +defm GLOBAL_LOAD_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x023, "global_load_d16_hi_b16">; +defm GLOBAL_STORE_BYTE_D16_HI : VFLAT_Real_AllAddr_gfx12<0x024, "global_store_d16_hi_b8">; +defm GLOBAL_STORE_SHORT_D16_HI : VFLAT_Real_AllAddr_gfx12<0x025, "global_store_d16_hi_b16">; +defm GLOBAL_LOAD_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x028, "global_load_addtid_b32">; +defm GLOBAL_STORE_DWORD_ADDTID : VFLAT_Real_AllAddr_gfx12<0x029, "global_store_addtid_b32">; +defm GLOBAL_LOAD_BLOCK : VFLAT_Real_AllAddr_gfx12<0x053>; +defm GLOBAL_STORE_BLOCK : VFLAT_Real_AllAddr_gfx12<0x054>; + +defm GLOBAL_ATOMIC_SWAP : VFLAT_Real_Atomics_gfx12<0x033, "global_atomic_swap_b32">; +defm GLOBAL_ATOMIC_CMPSWAP : VFLAT_Real_Atomics_gfx12<0x034, "global_atomic_cmpswap_b32">; +defm GLOBAL_ATOMIC_ADD : VFLAT_Real_Atomics_gfx12<0x035, "global_atomic_add_u32">; +defm GLOBAL_ATOMIC_SUB : VFLAT_Real_Atomics_gfx12<0x036, "global_atomic_sub_u32">; +defm GLOBAL_ATOMIC_CSUB : VFLAT_Real_Atomics_gfx12<0x037, "global_atomic_sub_clamp_u32", "global_atomic_csub_u32">; +defm GLOBAL_ATOMIC_SMIN : VFLAT_Real_Atomics_gfx12<0x038, "global_atomic_min_i32">; +defm GLOBAL_ATOMIC_UMIN : VFLAT_Real_Atomics_gfx12<0x039, "global_atomic_min_u32">; +defm GLOBAL_ATOMIC_SMAX : VFLAT_Real_Atomics_gfx12<0x03a, "global_atomic_max_i32">; +defm GLOBAL_ATOMIC_UMAX : VFLAT_Real_Atomics_gfx12<0x03b, "global_atomic_max_u32">; +defm GLOBAL_ATOMIC_AND : 
VFLAT_Real_Atomics_gfx12<0x03c, "global_atomic_and_b32">; +defm GLOBAL_ATOMIC_OR : VFLAT_Real_Atomics_gfx12<0x03d, "global_atomic_or_b32">; +defm GLOBAL_ATOMIC_XOR : VFLAT_Real_Atomics_gfx12<0x03e, "global_atomic_xor_b32">; +defm GLOBAL_ATOMIC_INC : VFLAT_Real_Atomics_gfx12<0x03f, "global_atomic_inc_u32">; +defm GLOBAL_ATOMIC_DEC : VFLAT_Real_Atomics_gfx12<0x040, "global_atomic_dec_u32">; +defm GLOBAL_ATOMIC_SWAP_X2 : VFLAT_Real_Atomics_gfx12<0x041, "global_atomic_swap_b64">; +defm GLOBAL_ATOMIC_CMPSWAP_X2 : VFLAT_Real_Atomics_gfx12<0x042, "global_atomic_cmpswap_b64">; +defm GLOBAL_ATOMIC_ADD_X2 : VFLAT_Real_Atomics_gfx12<0x043, "global_atomic_add_u64">; +defm GLOBAL_ATOMIC_SUB_X2 : VFLAT_Real_Atomics_gfx12<0x044, "global_atomic_sub_u64">; +defm GLOBAL_ATOMIC_SMIN_X2 : VFLAT_Real_Atomics_gfx12<0x045, "global_atomic_min_i64">; +defm GLOBAL_ATOMIC_UMIN_X2 : VFLAT_Real_Atomics_gfx12<0x046, "global_atomic_min_u64">; +defm GLOBAL_ATOMIC_SMAX_X2 : VFLAT_Real_Atomics_gfx12<0x047, "global_atomic_max_i64">; +defm GLOBAL_ATOMIC_UMAX_X2 : VFLAT_Real_Atomics_gfx12<0x048, "global_atomic_max_u64">; +defm GLOBAL_ATOMIC_AND_X2 : VFLAT_Real_Atomics_gfx12<0x049, "global_atomic_and_b64">; +defm GLOBAL_ATOMIC_OR_X2 : VFLAT_Real_Atomics_gfx12<0x04a, "global_atomic_or_b64">; +defm GLOBAL_ATOMIC_XOR_X2 : VFLAT_Real_Atomics_gfx12<0x04b, "global_atomic_xor_b64">; +defm GLOBAL_ATOMIC_INC_X2 : VFLAT_Real_Atomics_gfx12<0x04c, "global_atomic_inc_u64">; +defm GLOBAL_ATOMIC_DEC_X2 : VFLAT_Real_Atomics_gfx12<0x04d, "global_atomic_dec_u64">; +defm GLOBAL_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050>; +defm GLOBAL_ATOMIC_FMIN : VFLAT_Real_Atomics_gfx12<0x051, "global_atomic_min_num_f32", "global_atomic_min_f32">; +defm GLOBAL_ATOMIC_FMAX : VFLAT_Real_Atomics_gfx12<0x052, "global_atomic_max_num_f32", "global_atomic_max_f32">; +defm GLOBAL_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>; defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx1200<0x057>; defm GLOBAL_LOAD_TR_B64_w32 : 
VGLOBAL_Real_AllAddr_gfx1200<0x058>; -defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x057>; -defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12_w64<0x058>; +defm GLOBAL_LOAD_TR_B128_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x057>; +defm GLOBAL_LOAD_TR_B64_w64 : VFLAT_Real_AllAddr_gfx12_w64<0x058>; -defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>; -defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>; -defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>; +defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VFLAT_Real_Atomics_gfx12<0x073>; +defm GLOBAL_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; +defm GLOBAL_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>; defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 68430526dba26..67c6daaa24c2a 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -214,6 +214,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool FlatInstOffsets = false; bool FlatGlobalInsts = false; bool FlatScratchInsts = false; + bool FlatGVSMode = false; bool ScalarFlatScratchInsts = false; bool HasArchitectedFlatScratch = false; bool EnableFlatScratch = false; @@ -1160,6 +1161,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasLshlAddU64Inst() const { return HasLshlAddU64Inst; } + bool hasFlatGVSMode() const { return FlatGVSMode; } + bool enableSIScheduler() const { return EnableSIScheduler; } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s index 07b4055f0ab9c..737d7b3de4e92 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s @@ -20,3 +20,135 @@ tensor_stop tensor_stop th:TH_STORE_BYPASS scope:SCOPE_SYS // GFX1250: tensor_stop th:TH_STORE_BYPASS scope:SCOPE_SYS ; encoding: 
[0x7c,0xc0,0x1b,0xee,0x00,0x00,0x3c,0x00,0x00,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU + +flat_atomic_add_f32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_add_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_add_u32 v2, v3, s[2:3] offset:-64 +// GFX1250: flat_atomic_add_u32 v2, v3, s[2:3] offset:-64 ; encoding: [0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_and_b32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_and_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3] +// GFX1250: flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3] ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00] +// GFX12-ERR: 
:[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_dec_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_dec_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_inc_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_inc_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_max_i32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_max_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU 
or mode + +flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_max_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_max_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_min_i32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_min_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_min_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_min_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64 +// 
GFX1250: flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_or_b32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_or_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_sub_u32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_swap_b32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_swap_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64 ; 
encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_xor_b32 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_xor_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64 +// GFX1250: flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode + +flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64 +// GFX1250: flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt index 6421c6f30e177..55bc3e7a5746c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt @@ -1,5 +1,2831 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s +# GFX1250: flat_atomic_add_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x15,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_u32 v2, v3, s[2:3] offset:-64 ; encoding: 
[0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff] +0x02,0x40,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b32 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b64 v2, v[2:5], s[2:3] ; encoding: [0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00] +0x02,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00 + +# GFX1250: flat_atomic_cond_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x10,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: 
[0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_num_f32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x14,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x0e,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: 
[0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_bf16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_f16 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x16,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_clamp_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_u32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x0d,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_u64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_swap_b32 v2, v3, s[2:3] offset:64 ; encoding: [0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x0c,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_swap_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_xor_b32 v2, v3, s[2:3] offset:64 ; encoding: 
[0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00] +0x02,0x80,0x0f,0xec,0x00,0x00,0x80,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_xor_b64 v2, v[2:3], s[2:3] offset:64 ; encoding: [0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00] +0x02,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x02,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_f32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_f32 v1, v[0:1], v2 offset:-8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85] +0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85 + +# GFX1250: flat_atomic_add_f32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_f32 v1, v[0:1], v2 offset:8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a] +0x7c,0x80,0x15,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a + +# GFX1250: flat_atomic_add_f32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x15,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 
+ +# GFX1250: flat_atomic_add_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_u32 v[4:5], v5 ; encoding: [0x7c,0x40,0x0d,0xec,0x00,0x00,0x80,0x02,0x04,0x00,0x00,0x00] +0x7c,0x40,0x0d,0xec,0x00,0x00,0x80,0x02,0x04,0x00,0x00,0x00 + +# GFX1250: flat_atomic_add_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_add_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_add_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_and_b32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0x00,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_and_b32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_and_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_and_b64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_and_b64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b32 v0, v[2:3], v[4:5] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xec,0x00,0x00,0x10,0x02,0x02,0xff,0x07,0x00] +0x7c,0x00,0x0d,0xec,0x00,0x00,0x10,0x02,0x02,0xff,0x07,0x00 + +# GFX1250: flat_atomic_cmpswap_b32 v0, v[2:3], v[4:5] th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xec,0x00,0x00,0x10,0x02,0x02,0x00,0x00,0x00] +0x7c,0x00,0x0d,0xec,0x00,0x00,0x10,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b32 v1, v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cmpswap_b32 v1, v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b64 v[0:1], v[2:5] offset:-64 ; encoding: [0x7c,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cmpswap_b64 v[0:1], v[2:5] offset:64 ; encoding: [0x7c,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cmpswap_b64 v[2:3], v[4:5], v[6:9] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00] +0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00 + +# GFX1250: flat_atomic_cmpswap_b64 v[2:3], v[4:5], v[6:9] th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00] +0x7c,0x80,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00 + +# GFX1250: flat_atomic_cond_sub_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cond_sub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: 
[0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_cond_sub_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_dec_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x10,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_dec_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_dec_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_dec_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_dec_u64 v[2:3], v[0:1], v[2:3] 
offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_inc_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0xc0,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_inc_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0xc0,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_inc_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x13,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_inc_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_inc_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x13,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0x80,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_i32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_i32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_i64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_i64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_i64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_num_f32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_num_f32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_num_f32 v[0:1], v2 offset:-64 ; encoding: 
[0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_num_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0xc0,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0xc0,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_max_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_max_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i32 v1, 
v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_i32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_i32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_i64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_i64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_i64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_num_f32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_num_f32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] 
+0x7c,0x40,0x14,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_num_f32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_num_f32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x14,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0e,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0e,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_min_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_min_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: 
[0x7c,0x80,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_or_b32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_or_b32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_or_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_or_b64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_or_b64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: 
flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:-8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85] +0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85 + +# GFX1250: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a] +0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a + +# GFX1250: flat_atomic_pk_add_bf16 v[0:1], v2 ; encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] +0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_pk_add_bf16 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:-8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85] +0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85 + +# GFX1250: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:8000000 th:TH_ATOMIC_RETURN ; encoding: 
[0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a] +0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a + +# GFX1250: flat_atomic_pk_add_f16 v[0:1], v2 ; encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] +0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00 + +# GFX1250: flat_atomic_pk_add_f16 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_pk_add_f16 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_clamp_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_sub_clamp_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_clamp_u32 v[0:1], v2 offset:64 ; encoding: [0x7c,0xc0,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_u32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_sub_u32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0d,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_u32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_sub_u32 v[0:1], v2 
offset:64 ; encoding: [0x7c,0x80,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0d,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_u64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_sub_u64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x11,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_sub_u64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_sub_u64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x11,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_swap_b32 v0, v[2:3], v3 offset:-2048 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0xf8,0xff] +0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0xf8,0xff + +# GFX1250: flat_atomic_swap_b32 v0, v[2:3], v3 offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0xff,0x07,0x00] +0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0xff,0x07,0x00 + +# GFX1250: flat_atomic_swap_b32 v0, v[2:3], v3 offset:2048 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0x08,0x00] +0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0x08,0x00 + +# GFX1250: flat_atomic_swap_b32 v0, v[2:3], v3 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0x00,0x00] +0x7c,0xc0,0x0c,0xec,0x00,0x00,0x90,0x01,0x02,0x00,0x00,0x00 + +# GFX1250: flat_atomic_swap_b32 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0xc0,0x0c,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_swap_b32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0c,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_swap_b32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0c,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_swap_b32 v[0:1], v2 offset:64 ; encoding: [0x7c,0xc0,0x0c,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0c,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_swap_b64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_swap_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x10,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_swap_b64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_swap_b64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_swap_b64 v[2:3], v[4:5], v[6:7] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00] +0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00 + +# GFX1250: flat_atomic_swap_b64 v[2:3], v[4:5], v[6:7] th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00] +0x7c,0x40,0x10,0xec,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00 + +# GFX1250: flat_atomic_xor_b32 v1, v[0:1], v2 offset:-64 
th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_xor_b32 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0f,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_xor_b32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_xor_b32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0f,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_xor_b64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_xor_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x12,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_atomic_xor_b64 v[2:3], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_atomic_xor_b64 v[2:3], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x12,0xec,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_b128 v[2:5], v[0:1] offset:-64 ; encoding: [0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_b128 v[2:5], v[0:1] offset:64 ; encoding: [0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_b128 v[2:5], 
v[6:7] ; encoding: [0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00] +0x7c,0xc0,0x05,0xec,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00 + +# GFX1250: flat_load_b32 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_b32 v1, v[0:1] offset:64 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_b32 v1, v[4:5] ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_b32 v1, v[4:5] offset:-2048 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0xf8,0xff] +0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0xf8,0xff + +# GFX1250: flat_load_b32 v1, v[4:5] offset:-4 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0xfc,0xff,0xff] +0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0xfc,0xff,0xff + +# GFX1250: flat_load_b32 v1, v[4:5] offset:2047 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0xff,0x07,0x00] +0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0xff,0x07,0x00 + +# GFX1250: flat_load_b32 v1, v[4:5] offset:2048 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x08,0x00] +0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x08,0x00 + +# GFX1250: flat_load_b32 v1, v[4:5] offset:4 ; encoding: [0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x04,0x00,0x00] +0x7c,0x00,0x05,0xec,0x01,0x00,0x00,0x00,0x04,0x04,0x00,0x00 + +# GFX1250: flat_load_b64 v[2:3], v[0:1] offset:-64 ; encoding: [0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_b64 v[2:3], v[0:1] offset:64 ; encoding: [0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# 
GFX1250: flat_load_b64 v[2:3], v[4:5] ; encoding: [0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x05,0xec,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_b96 v[2:4], v[0:1] offset:-64 ; encoding: [0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_b96 v[2:4], v[0:1] offset:64 ; encoding: [0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_b96 v[2:4], v[6:7] ; encoding: [0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00] +0x7c,0x80,0x05,0xec,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00 + +# GFX1250: flat_load_d16_b16 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_d16_b16 v1, v[0:1] offset:64 ; encoding: [0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_d16_b16 v1, v[4:5] ; encoding: [0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_d16_hi_b16 v1, v[0:1] offset:-64 ; encoding: [0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_d16_hi_b16 v1, v[0:1] offset:64 ; encoding: [0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_d16_hi_b16 v1, v[4:5] ; encoding: [0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0xc0,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_d16_hi_i8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] 
+0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_d16_hi_i8 v1, v[0:1] offset:64 ; encoding: [0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_d16_hi_i8 v1, v[4:5] ; encoding: [0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x80,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_d16_hi_u8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_d16_hi_u8 v1, v[0:1] offset:64 ; encoding: [0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_d16_hi_u8 v1, v[4:5] ; encoding: [0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x08,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_d16_i8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_d16_i8 v1, v[0:1] offset:64 ; encoding: [0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_d16_i8 v1, v[4:5] ; encoding: [0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0xc0,0x07,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_d16_u8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_d16_u8 v1, v[0:1] offset:64 ; encoding: [0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_d16_u8 v1, v[4:5] ; encoding: 
[0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x80,0x07,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_i16 v1, v[0:1] offset:-64 ; encoding: [0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_i16 v1, v[0:1] offset:64 ; encoding: [0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_i16 v1, v[4:5] ; encoding: [0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0xc0,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_i8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_i8 v1, v[0:1] offset:64 ; encoding: [0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_i8 v1, v[4:5] ; encoding: [0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_u16 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_u16 v1, v[0:1] offset:64 ; encoding: [0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_u16 v1, v[4:5] ; encoding: [0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x80,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_load_u8 v1, v[0:1] offset:-64 ; encoding: [0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: flat_load_u8 v1, v[0:1] offset:64 ; encoding: 
[0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: flat_load_u8 v1, v[4:5] ; encoding: [0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x04,0xec,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_store_b128 v[0:1], v[2:5] offset:-64 ; encoding: [0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_store_b128 v[0:1], v[2:5] offset:64 ; encoding: [0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_store_b128 v[2:3], v[4:7] ; encoding: [0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0x40,0x07,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: flat_store_b16 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_store_b16 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_store_b16 v[4:5], v1 ; encoding: [0x7c,0x40,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_store_b32 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_store_b32 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_store_b32 v[4:5], v1 offset:-16 ; encoding: [0x7c,0x80,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0xf0,0xff,0xff] +0x7c,0x80,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0xf0,0xff,0xff + +# GFX1250: flat_store_b32 v[4:5], v1 
offset:16 ; encoding: [0x7c,0x80,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x10,0x00,0x00] +0x7c,0x80,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x10,0x00,0x00 + +# GFX1250: flat_store_b64 v[0:1], v[2:3] offset:-64 ; encoding: [0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_store_b64 v[0:1], v[2:3] offset:64 ; encoding: [0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_store_b64 v[2:3], v[4:5] ; encoding: [0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0xc0,0x06,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: flat_store_b8 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_store_b8 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x06,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_store_b8 v[4:5], v1 ; encoding: [0x7c,0x00,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x06,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_store_b96 v[0:1], v[2:4] offset:-64 ; encoding: [0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_store_b96 v[0:1], v[2:4] offset:64 ; encoding: [0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_store_b96 v[2:3], v[4:6] ; encoding: [0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0x00,0x07,0xec,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: flat_store_d16_hi_b16 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + 
+# GFX1250: flat_store_d16_hi_b16 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_store_d16_hi_b16 v[4:5], v1 ; encoding: [0x7c,0x40,0x09,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x09,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: flat_store_d16_hi_b8 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x00,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: flat_store_d16_hi_b8 v[0:1], v2 offset:64 ; encoding: [0x7c,0x00,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x09,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: flat_store_d16_hi_b8 v[4:5], v1 ; encoding: [0x7c,0x00,0x09,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x09,0xec,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_atomic_add_f32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_f32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_f32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_f32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# 
GFX1250: global_atomic_add_f32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x15,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_f32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_f32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_u32 v[0:1], v2, off offset:-64 ; encoding: 
[0x7c,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_add_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_add_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] 
+0x7c,0xc0,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_and_b32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_and_b32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_and_b32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_and_b32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_and_b32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_and_b32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_and_b32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_and_b32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_and_b64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_and_b64 v0, v[2:3], s[0:1] offset:64 
; encoding: [0x00,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_and_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_and_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_and_b64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_and_b64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_and_b64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_and_b64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0d,0xee,0x00,0x00,0x10,0x01,0x01,0xff,0x07,0x00] +0x02,0x00,0x0d,0xee,0x00,0x00,0x10,0x01,0x01,0xff,0x07,0x00 + +# GFX1250: global_atomic_cmpswap_b32 v0, v1, v[2:3], s[2:3] th:TH_ATOMIC_RETURN ; encoding: [0x02,0x00,0x0d,0xee,0x00,0x00,0x10,0x01,0x01,0x00,0x00,0x00] +0x02,0x00,0x0d,0xee,0x00,0x00,0x10,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:-64 ; encoding: 
[0x00,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cmpswap_b32 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xee,0x00,0x00,0x10,0x02,0x02,0xff,0x07,0x00] +0x7c,0x00,0x0d,0xee,0x00,0x00,0x10,0x02,0x02,0xff,0x07,0x00 + +# GFX1250: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xee,0x00,0x00,0x10,0x02,0x02,0x00,0x00,0x00] +0x7c,0x00,0x0d,0xee,0x00,0x00,0x10,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cmpswap_b32 v1, v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b32 v1, v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cmpswap_b32 v1, v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b32 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cmpswap_b32 v[0:1], v[2:3], off offset:64 ; encoding: 
[0x7c,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b64 v0, v[2:5], s[0:1] offset:-64 ; encoding: [0x00,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cmpswap_b64 v0, v[2:5], s[0:1] offset:64 ; encoding: [0x00,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b64 v[0:1], v[2:5], off offset:-64 ; encoding: [0x7c,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cmpswap_b64 v[0:1], v[2:5], off offset:64 ; encoding: [0x7c,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b64 v[2:3], v0, v[2:5], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cmpswap_b64 v[2:3], v0, v[2:5], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b64 v[2:3], v3, v[6:9], s[2:3] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0xff,0x07,0x00] +0x02,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0xff,0x07,0x00 + +# GFX1250: global_atomic_cmpswap_b64 v[2:3], v3, v[6:9], s[2:3] th:TH_ATOMIC_RETURN ; encoding: [0x02,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0x00,0x00,0x00] +0x02,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0x00,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: 
[0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cmpswap_b64 v[2:3], v[4:5], v[6:9], off offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00] +0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00 + +# GFX1250: global_atomic_cmpswap_b64 v[2:3], v[4:5], v[6:9], off th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00] +0x7c,0x80,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00 + +# GFX1250: global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cond_sub_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cond_sub_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cond_sub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cond_sub_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cond_sub_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: 
[0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_cond_sub_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_cond_sub_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_dec_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_dec_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_dec_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_dec_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_dec_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_dec_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x10,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_dec_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_dec_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_dec_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_dec_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_dec_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_dec_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_dec_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_dec_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_dec_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_dec_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: 
global_atomic_inc_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_inc_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_inc_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_inc_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_inc_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_inc_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_inc_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_inc_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_inc_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_inc_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] 
+0x00,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_inc_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_inc_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x13,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_inc_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_inc_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_inc_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_inc_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x13,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_i32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_i32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_i32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# 
GFX1250: global_atomic_max_i32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_i32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_i32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_i32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_i32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_i64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_i64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_i64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_i64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_i64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: 
[0x00,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_i64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_i64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_i64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_num_f32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_num_f32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_num_f32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_num_f32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_num_f32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: 
[0x7c,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_num_f32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_num_f32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_max_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_max_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: 
global_atomic_min_i32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_i32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_i32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_i32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_i32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_i32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_i32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_i32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_i64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_i64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] 
+0x00,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_i64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_i64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_i64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_i64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_i64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_i64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_num_f32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_num_f32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_num_f32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] 
+0x00,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_num_f32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_num_f32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_num_f32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x14,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_num_f32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_num_f32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x14,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: 
global_atomic_min_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0e,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0e,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_u64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: 
[0x00,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_min_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_min_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_or_b32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_or_b32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_or_b32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_or_b32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_or_b32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_or_b32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_or_b32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_or_b32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_or_b64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_or_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_or_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_or_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_or_b64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_or_b64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_or_b64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_or_b64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: 
global_atomic_pk_add_bf16 v0, v2, s[0:1] ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] +0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00 + +# GFX1250: global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_pk_add_bf16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_pk_add_bf16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00] +0x00,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00 + +# GFX1250: global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_pk_add_bf16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_pk_add_bf16 v[0:1], v2, off offset:64 ; encoding: 
[0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_pk_add_f16 v0, v2, s[0:1] ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] +0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00 + +# GFX1250: global_atomic_pk_add_f16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_pk_add_f16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_pk_add_f16 v1, v0, v2, s[0:1] th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00] +0x00,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x00,0x00,0x00 + +# GFX1250: global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_pk_add_f16 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x16,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_pk_add_f16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_pk_add_f16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x16,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_clamp_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_sub_clamp_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_clamp_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_sub_clamp_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_clamp_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_u32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_sub_u32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: 
global_atomic_sub_u32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_sub_u32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_u32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_sub_u32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0d,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_u32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_sub_u32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0d,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_u64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_sub_u64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_u64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_sub_u64 v[0:1], v[2:3], off offset:64 ; encoding: 
[0x7c,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x11,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_u64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_sub_u64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_sub_u64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_sub_u64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x11,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v0, v1, v3, s[2:3] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x01,0xff,0x07,0x00] +0x02,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x01,0xff,0x07,0x00 + +# GFX1250: global_atomic_swap_b32 v0, v1, v3, s[2:3] th:TH_ATOMIC_RETURN ; encoding: [0x02,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x01,0x00,0x00,0x00] +0x02,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v0, v[2:3], v3, off offset:2047 th:TH_ATOMIC_RETURN ; encoding: 
[0x7c,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x02,0xff,0x07,0x00] +0x7c,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x02,0xff,0x07,0x00 + +# GFX1250: global_atomic_swap_b32 v0, v[2:3], v3, off th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x02,0x00,0x00,0x00] +0x7c,0xc0,0x0c,0xee,0x00,0x00,0x90,0x01,0x02,0x00,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0c,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x0c,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] 
+0x00,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x10,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v3, v[6:7], s[2:3] offset:2047 th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0xff,0x07,0x00] +0x02,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0xff,0x07,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v3, v[6:7], s[2:3] th:TH_ATOMIC_RETURN ; encoding: [0x02,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0x00,0x00,0x00] +0x02,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x03,0x00,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_swap_b64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v[4:5], v[6:7], off offset:2047 th:TH_ATOMIC_RETURN ; encoding: 
[0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00] +0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0xff,0x07,0x00 + +# GFX1250: global_atomic_swap_b64 v[2:3], v[4:5], v[6:7], off th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00] +0x7c,0x40,0x10,0xee,0x02,0x00,0x10,0x03,0x04,0x00,0x00,0x00 + +# GFX1250: global_atomic_xor_b32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b32 v1, v0, v2, s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b32 v1, v0, v2, s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b32 v1, v[0:1], v2, off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b32 v1, v[0:1], v2, off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x0f,0xee,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] 
+0x7c,0x80,0x0f,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x12,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b64 v[2:3], v0, v[2:3], s[0:1] offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b64 v[2:3], v0, v[2:3], s[0:1] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x00,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_atomic_xor_b64 v[2:3], v[0:1], v[2:3], off offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_atomic_xor_b64 v[2:3], v[0:1], v[2:3], off offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x12,0xee,0x02,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_inv ; encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x0a,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_inv scope:SCOPE_DEV ; encoding: 
[0x7c,0xc0,0x0a,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x0a,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_inv scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x0a,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x0a,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_load_addtid_b32 v1, off ; encoding: [0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_load_addtid_b32 v1, off offset:-64 ; encoding: [0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_addtid_b32 v1, off offset:64 ; encoding: [0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_addtid_b32 v1, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_addtid_b32 v1, s[0:1] offset:64 ; encoding: [0x00,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_addtid_b32 v1, s[2:3] ; encoding: [0x02,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x02,0x00,0x0a,0xee,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_load_b128 v[2:5], v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b128 v[2:5], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b128 v[2:5], v5, s[2:3] ; encoding: [0x02,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x05,0x00,0x00,0x00] 
+0x02,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x05,0x00,0x00,0x00 + +# GFX1250: global_load_b128 v[2:5], v[0:1], off offset:-64 ; encoding: [0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b128 v[2:5], v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b128 v[2:5], v[6:7], off ; encoding: [0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00] +0x7c,0xc0,0x05,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00 + +# GFX1250: global_load_b32 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b32 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b32 v1, v3, s[2:3] ; encoding: [0x02,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_b32 v1, v3, s[2:3] offset:2047 ; encoding: [0x02,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x03,0xff,0x07,0x00] +0x02,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x03,0xff,0x07,0x00 + +# GFX1250: global_load_b32 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b32 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b32 v1, v[4:5], off ; encoding: [0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_b32 v1, v[4:5], 
off offset:2047 ; encoding: [0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x04,0xff,0x07,0x00] +0x7c,0x00,0x05,0xee,0x01,0x00,0x00,0x00,0x04,0xff,0x07,0x00 + +# GFX1250: global_load_b64 v[2:3], v0, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b64 v[2:3], v0, s[0:1] offset:64 ; encoding: [0x00,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b64 v[2:3], v3, s[2:3] ; encoding: [0x02,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_b64 v[2:3], v[0:1], off offset:-64 ; encoding: [0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b64 v[2:3], v[0:1], off offset:64 ; encoding: [0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b64 v[2:3], v[4:5], off ; encoding: [0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x05,0xee,0x02,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_b96 v[2:4], v0, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b96 v[2:4], v0, s[0:1] offset:64 ; encoding: [0x00,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b96 v[2:4], v5, s[2:3] ; encoding: [0x02,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x05,0x00,0x00,0x00] +0x02,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x05,0x00,0x00,0x00 + +# GFX1250: global_load_b96 v[2:4], v[0:1], off offset:-64 ; encoding: 
[0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_b96 v[2:4], v[0:1], off offset:64 ; encoding: [0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_b96 v[2:4], v[6:7], off ; encoding: [0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00] +0x7c,0x80,0x05,0xee,0x02,0x00,0x00,0x00,0x06,0x00,0x00,0x00 + +# GFX1250: global_load_block v[8:39], v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_block v[8:39], v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_block v[8:39], v5, s[2:3] ; encoding: [0x02,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x05,0x00,0x00,0x00] +0x02,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x05,0x00,0x00,0x00 + +# GFX1250: global_load_block v[8:39], v[0:1], off offset:-64 ; encoding: [0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_block v[8:39], v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_block v[8:39], v[6:7], off ; encoding: [0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x06,0x00,0x00,0x00] +0x7c,0xc0,0x14,0xee,0x08,0x00,0x00,0x00,0x06,0x00,0x00,0x00 + +# GFX1250: global_load_block v[8:39], v[6:7], off th:TH_LOAD_HT scope:SCOPE_SE ; encoding: [0x7c,0xc0,0x14,0xee,0x08,0x00,0x24,0x00,0x06,0x00,0x00,0x00] +0x7c,0xc0,0x14,0xee,0x08,0x00,0x24,0x00,0x06,0x00,0x00,0x00 + +# GFX1250: global_load_d16_b16 v1, v0, s[0:1] offset:-64 ; encoding: 
[0x00,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_b16 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_b16 v1, v3, s[2:3] ; encoding: [0x02,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_b16 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_b16 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_b16 v1, v[4:5], off ; encoding: [0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_b16 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_b16 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_b16 v1, v3, s[2:3] ; encoding: [0x02,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_b16 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_b16 v1, v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] 
+0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_b16 v1, v[4:5], off ; encoding: [0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0xc0,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_i8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_i8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_i8 v1, v3, s[2:3] ; encoding: [0x02,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_i8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_i8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_i8 v1, v[4:5], off ; encoding: [0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x80,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_d16_hi_u8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_u8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_u8 v1, v3, s[2:3] ; encoding: [0x02,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# 
GFX1250: global_load_d16_hi_u8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_hi_u8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_hi_u8 v1, v[4:5], off ; encoding: [0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x08,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_d16_i8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_i8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_i8 v1, v3, s[2:3] ; encoding: [0x02,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_i8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_i8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_i8 v1, v[4:5], off ; encoding: [0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0xc0,0x07,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_d16_u8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_u8 v1, v0, s[0:1] offset:64 ; encoding: 
[0x00,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_u8 v1, v3, s[2:3] ; encoding: [0x02,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_d16_u8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_d16_u8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_d16_u8 v1, v[4:5], off ; encoding: [0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x80,0x07,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_i16 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_i16 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_i16 v1, v3, s[2:3] ; encoding: [0x02,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_i16 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_i16 v1, v[0:1], off offset:64 ; encoding: [0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_i16 v1, v[4:5], off ; encoding: [0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] 
+0x7c,0xc0,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_i8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_i8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_i8 v1, v3, s[2:3] ; encoding: [0x02,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_i8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_i8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_i8 v1, v[4:5], off ; encoding: [0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_u16 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_u16 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_u16 v1, v3, s[2:3] ; encoding: [0x02,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_u16 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_u16 v1, v[0:1], off offset:64 ; 
encoding: [0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_u16 v1, v[4:5], off ; encoding: [0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x80,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_load_u8 v1, v0, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_u8 v1, v0, s[0:1] offset:64 ; encoding: [0x00,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_u8 v1, v3, s[2:3] ; encoding: [0x02,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_load_u8 v1, v[0:1], off offset:-64 ; encoding: [0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: global_load_u8 v1, v[0:1], off offset:64 ; encoding: [0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: global_load_u8 v1, v[4:5], off ; encoding: [0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x04,0xee,0x01,0x00,0x00,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_store_addtid_b32 v2, off offset:-64 ; encoding: [0x7c,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_addtid_b32 v2, off offset:64 ; encoding: [0x7c,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_addtid_b32 v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] 
+0x00,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_addtid_b32 v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x0a,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b128 v0, v[2:5], s[0:1] offset:-64 ; encoding: [0x00,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b128 v0, v[2:5], s[0:1] offset:64 ; encoding: [0x00,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b128 v1, v[4:7], s[2:3] ; encoding: [0x02,0x40,0x07,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00] +0x02,0x40,0x07,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00 + +# GFX1250: global_store_b128 v[0:1], v[2:5], off offset:-64 ; encoding: [0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b128 v[0:1], v[2:5], off offset:64 ; encoding: [0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b128 v[2:3], v[4:7], off ; encoding: [0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0x40,0x07,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_b16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b16 v3, v1, s[2:3] ; encoding: [0x02,0x40,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00] +0x02,0x40,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: 
global_store_b16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b16 v[4:5], v1, off ; encoding: [0x7c,0x40,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_store_b32 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b32 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b32 v3, v1, s[2:3] offset:-16 ; encoding: [0x02,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0xf0,0xff,0xff] +0x02,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0xf0,0xff,0xff + +# GFX1250: global_store_b32 v3, v1, s[2:3] offset:16 ; encoding: [0x02,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x10,0x00,0x00] +0x02,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x10,0x00,0x00 + +# GFX1250: global_store_b32 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b32 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b32 v[4:5], v1, off offset:-16 ; encoding: [0x7c,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0xf0,0xff,0xff] +0x7c,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0xf0,0xff,0xff + +# GFX1250: global_store_b32 v[4:5], v1, off offset:16 ; encoding: 
[0x7c,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x10,0x00,0x00] +0x7c,0x80,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x10,0x00,0x00 + +# GFX1250: global_store_b64 v0, v[2:3], s[0:1] offset:-64 ; encoding: [0x00,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b64 v0, v[2:3], s[0:1] offset:64 ; encoding: [0x00,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b64 v1, v[2:3], s[2:3] ; encoding: [0x02,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x01,0x00,0x00,0x00] +0x02,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: global_store_b64 v[0:1], v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b64 v[0:1], v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b64 v[2:3], v[4:5], off ; encoding: [0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0xc0,0x06,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_b8 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b8 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b8 v3, v1, s[2:3] ; encoding: [0x02,0x00,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x06,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_store_b8 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b8 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x06,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b8 v[4:5], v1, off ; encoding: [0x7c,0x00,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x06,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_store_b96 v0, v[2:4], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b96 v0, v[2:4], s[0:1] offset:64 ; encoding: [0x00,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b96 v1, v[4:6], s[2:3] ; encoding: [0x02,0x00,0x07,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00] +0x02,0x00,0x07,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00 + +# GFX1250: global_store_b96 v[0:1], v[2:4], off offset:-64 ; encoding: [0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_b96 v[0:1], v[2:4], off offset:64 ; encoding: [0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_b96 v[2:3], v[4:6], off ; encoding: [0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0x00,0x07,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_block v0, v[2:33], s[0:1] offset:-64 ; encoding: [0x00,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_block v0, v[2:33], s[0:1] offset:64 ; encoding: [0x00,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: 
global_store_block v1, v[4:35], s[2:3] ; encoding: [0x02,0x00,0x15,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00] +0x02,0x00,0x15,0xee,0x00,0x00,0x00,0x02,0x01,0x00,0x00,0x00 + +# GFX1250: global_store_block v[0:1], v[2:33], off offset:-64 ; encoding: [0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_block v[0:1], v[2:33], off offset:64 ; encoding: [0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_block v[2:3], v[4:35], off ; encoding: [0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00] +0x7c,0x00,0x15,0xee,0x00,0x00,0x00,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_block v[2:3], v[4:35], off th:TH_STORE_HT scope:SCOPE_SE ; encoding: [0x7c,0x00,0x15,0xee,0x00,0x00,0x24,0x02,0x02,0x00,0x00,0x00] +0x7c,0x00,0x15,0xee,0x00,0x00,0x24,0x02,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_d16_hi_b16 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_d16_hi_b16 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_d16_hi_b16 v3, v1, s[2:3] ; encoding: [0x02,0x40,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00] +0x02,0x40,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_store_d16_hi_b16 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_d16_hi_b16 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: 
global_store_d16_hi_b16 v[4:5], v1, off ; encoding: [0x7c,0x40,0x09,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x40,0x09,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_store_d16_hi_b8 v0, v2, s[0:1] offset:-64 ; encoding: [0x00,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_d16_hi_b8 v0, v2, s[0:1] offset:64 ; encoding: [0x00,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_d16_hi_b8 v3, v1, s[2:3] ; encoding: [0x02,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00] +0x02,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x03,0x00,0x00,0x00 + +# GFX1250: global_store_d16_hi_b8 v[0:1], v2, off offset:-64 ; encoding: [0x7c,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: global_store_d16_hi_b8 v[0:1], v2, off offset:64 ; encoding: [0x7c,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x09,0xee,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: global_store_d16_hi_b8 v[4:5], v1, off ; encoding: [0x7c,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00] +0x7c,0x00,0x09,0xee,0x00,0x00,0x80,0x00,0x04,0x00,0x00,0x00 + +# GFX1250: global_wb ; encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x7c,0x00,0x0b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_wb scope:SCOPE_DEV ; encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00] +0x7c,0x00,0x0b,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_wb scope:SCOPE_SYS ; encoding: [0x7c,0x00,0x0b,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00] +0x7c,0x00,0x0b,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_wbinv ; encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] 
+0x7c,0xc0,0x13,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_wbinv scope:SCOPE_DEV ; encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x13,0xee,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_wbinv scope:SCOPE_SYS ; encoding: [0x7c,0xc0,0x13,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00] +0x7c,0xc0,0x13,0xee,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: scratch_load_b128 v[2:5], off, off offset:-64 ; encoding: [0x7c,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b128 v[2:5], off, off offset:64 ; encoding: [0x7c,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b128 v[2:5], off, s0 offset:-64 ; encoding: [0x00,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b128 v[2:5], off, s0 offset:64 ; encoding: [0x00,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b128 v[2:5], v0, off offset:-64 ; encoding: [0x7c,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b128 v[2:5], v0, off offset:64 ; encoding: [0x7c,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b128 v[2:5], v0, s0 offset:-64 ; encoding: [0x00,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b128 v[2:5], v0, s0 offset:64 ; encoding: [0x00,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b128 
v[2:5], v2, s1 ; encoding: [0x01,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0xc0,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, off, off ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, off, off offset:-64 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b32 v1, off, off offset:2047 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xff,0x07,0x00] +0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xff,0x07,0x00 + +# GFX1250: scratch_load_b32 v1, off, off offset:64 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, off, s0 offset:-64 ; encoding: [0x00,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b32 v1, off, s0 offset:64 ; encoding: [0x00,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, off, s1 offset:2047 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xff,0x07,0x00] +0x01,0x00,0x05,0xed,0x01,0x00,0x00,0x00,0x00,0xff,0x07,0x00 + +# GFX1250: scratch_load_b32 v1, v0, off offset:-64 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b32 v1, v0, off offset:64 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, v0, s0 offset:-64 ; encoding: [0x00,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] 
+0x00,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b32 v1, v0, s0 offset:64 ; encoding: [0x00,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, v2, off offset:2047 ; encoding: [0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x07,0x00] +0x7c,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x07,0x00 + +# GFX1250: scratch_load_b32 v1, v2, s1 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_b32 v1, v2, s1 offset:-4095 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x01,0xf0,0xff] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x01,0xf0,0xff + +# GFX1250: scratch_load_b32 v1, v2, s1 offset:-61440 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x10,0xff] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x10,0xff + +# GFX1250: scratch_load_b32 v1, v2, s1 offset:2047 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x07,0x00] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x07,0x00 + +# GFX1250: scratch_load_b32 v1, v2, s1 offset:4095 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x0f,0x00] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0xff,0x0f,0x00 + +# GFX1250: scratch_load_b32 v1, v2, s1 offset:61440 ; encoding: [0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0xf0,0x00] +0x01,0x00,0x05,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0xf0,0x00 + +# GFX1250: scratch_load_b64 v[2:3], off, off offset:-64 ; encoding: [0x7c,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b64 v[2:3], off, off offset:64 ; encoding: [0x7c,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b64 v[2:3], off, s0 
offset:-64 ; encoding: [0x00,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b64 v[2:3], off, s0 offset:64 ; encoding: [0x00,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b64 v[2:3], v0, off offset:-64 ; encoding: [0x7c,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b64 v[2:3], v0, off offset:64 ; encoding: [0x7c,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b64 v[2:3], v0, s0 offset:-64 ; encoding: [0x00,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b64 v[2:3], v0, s0 offset:64 ; encoding: [0x00,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b64 v[2:3], v2, s1 ; encoding: [0x01,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x40,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_b96 v[2:4], off, off offset:-64 ; encoding: [0x7c,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b96 v[2:4], off, off offset:64 ; encoding: [0x7c,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b96 v[2:4], off, s0 offset:-64 ; encoding: [0x00,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b96 v[2:4], off, s0 offset:64 ; encoding: [0x00,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00] 
+0x00,0x80,0x05,0xed,0x02,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b96 v[2:4], v0, off offset:-64 ; encoding: [0x7c,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b96 v[2:4], v0, off offset:64 ; encoding: [0x7c,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b96 v[2:4], v0, s0 offset:-64 ; encoding: [0x00,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_b96 v[2:4], v0, s0 offset:64 ; encoding: [0x00,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_b96 v[2:4], v2, s1 ; encoding: [0x01,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x80,0x05,0xed,0x02,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_block v[4:35], off, off offset:-64 ; encoding: [0x7c,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_block v[4:35], off, off offset:64 ; encoding: [0x7c,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_block v[4:35], off, s0 offset:-64 ; encoding: [0x00,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_block v[4:35], off, s0 offset:64 ; encoding: [0x00,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x14,0xed,0x04,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_block v[4:35], v0, off offset:-64 ; encoding: [0x7c,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# 
GFX1250: scratch_load_block v[4:35], v0, off offset:64 ; encoding: [0x7c,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_block v[4:35], v0, s0 offset:-64 ; encoding: [0x00,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_block v[4:35], v0, s0 offset:64 ; encoding: [0x00,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_block v[4:35], v2, s1 ; encoding: [0x01,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0xc0,0x14,0xed,0x04,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_block v[4:35], v2, s1 th:TH_LOAD_HT scope:SCOPE_SE ; encoding: [0x01,0xc0,0x14,0xed,0x04,0x00,0x26,0x00,0x02,0x00,0x00,0x00] +0x01,0xc0,0x14,0xed,0x04,0x00,0x26,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_d16_b16 v1, off, off offset:-64 ; encoding: [0x7c,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_b16 v1, off, off offset:64 ; encoding: [0x7c,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_b16 v1, off, s0 offset:-64 ; encoding: [0x00,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_b16 v1, off, s0 offset:64 ; encoding: [0x00,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_b16 v1, v0, off offset:-64 ; encoding: [0x7c,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_b16 v1, v0, off offset:64 ; 
encoding: [0x7c,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_b16 v1, v0, s0 offset:-64 ; encoding: [0x00,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_b16 v1, v0, s0 offset:64 ; encoding: [0x00,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_b16 v1, v2, s1 ; encoding: [0x01,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x00,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_b16 v1, off, off offset:-64 ; encoding: [0x7c,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_b16 v1, off, off offset:64 ; encoding: [0x7c,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_b16 v1, off, s0 offset:-64 ; encoding: [0x00,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_b16 v1, off, s0 offset:64 ; encoding: [0x00,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_b16 v1, v0, off offset:-64 ; encoding: [0x7c,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_b16 v1, v0, off offset:64 ; encoding: [0x7c,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_b16 v1, v0, s0 offset:-64 ; encoding: 
[0x00,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_b16 v1, v0, s0 offset:64 ; encoding: [0x00,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_b16 v1, v2, s1 ; encoding: [0x01,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0xc0,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_i8 v1, off, off offset:-64 ; encoding: [0x7c,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_i8 v1, off, off offset:64 ; encoding: [0x7c,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_i8 v1, off, s0 offset:-64 ; encoding: [0x00,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_i8 v1, off, s0 offset:64 ; encoding: [0x00,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_i8 v1, v0, off offset:-64 ; encoding: [0x7c,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_i8 v1, v0, off offset:64 ; encoding: [0x7c,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_i8 v1, v0, s0 offset:-64 ; encoding: [0x00,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_i8 v1, v0, s0 offset:64 ; encoding: [0x00,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] 
+0x00,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_i8 v1, v2, s1 ; encoding: [0x01,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x80,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_u8 v1, off, off offset:-64 ; encoding: [0x7c,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_u8 v1, off, off offset:64 ; encoding: [0x7c,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_u8 v1, off, s0 offset:-64 ; encoding: [0x00,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_u8 v1, off, s0 offset:64 ; encoding: [0x00,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x08,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_u8 v1, v0, off offset:-64 ; encoding: [0x7c,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_u8 v1, v0, off offset:64 ; encoding: [0x7c,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_u8 v1, v0, s0 offset:-64 ; encoding: [0x00,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_hi_u8 v1, v0, s0 offset:64 ; encoding: [0x00,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_hi_u8 v1, v2, s1 ; encoding: [0x01,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x40,0x08,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: 
scratch_load_d16_i8 v1, off, off offset:-64 ; encoding: [0x7c,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_i8 v1, off, off offset:64 ; encoding: [0x7c,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_i8 v1, off, s0 offset:-64 ; encoding: [0x00,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_i8 v1, off, s0 offset:64 ; encoding: [0x00,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_i8 v1, v0, off offset:-64 ; encoding: [0x7c,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_i8 v1, v0, off offset:64 ; encoding: [0x7c,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_i8 v1, v0, s0 offset:-64 ; encoding: [0x00,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_i8 v1, v0, s0 offset:64 ; encoding: [0x00,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_i8 v1, v2, s1 ; encoding: [0x01,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0xc0,0x07,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_d16_u8 v1, off, off offset:-64 ; encoding: [0x7c,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_u8 v1, off, off offset:64 ; encoding: 
[0x7c,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_u8 v1, off, s0 offset:-64 ; encoding: [0x00,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_u8 v1, off, s0 offset:64 ; encoding: [0x00,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x07,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_u8 v1, v0, off offset:-64 ; encoding: [0x7c,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_u8 v1, v0, off offset:64 ; encoding: [0x7c,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_u8 v1, v0, s0 offset:-64 ; encoding: [0x00,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_d16_u8 v1, v0, s0 offset:64 ; encoding: [0x00,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_d16_u8 v1, v2, s1 ; encoding: [0x01,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x80,0x07,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_i16 v1, off, off offset:-64 ; encoding: [0x7c,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_i16 v1, off, off offset:64 ; encoding: [0x7c,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_i16 v1, off, s0 offset:-64 ; encoding: [0x00,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] 
+0x00,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_i16 v1, off, s0 offset:64 ; encoding: [0x00,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_i16 v1, v0, off offset:-64 ; encoding: [0x7c,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_i16 v1, v0, off offset:64 ; encoding: [0x7c,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_i16 v1, v0, s0 offset:-64 ; encoding: [0x00,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_i16 v1, v0, s0 offset:64 ; encoding: [0x00,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_i16 v1, v2, s1 ; encoding: [0x01,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0xc0,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_i8 v1, off, off offset:-64 ; encoding: [0x7c,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_i8 v1, off, off offset:64 ; encoding: [0x7c,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_i8 v1, off, s0 offset:-64 ; encoding: [0x00,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_i8 v1, off, s0 offset:64 ; encoding: [0x00,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_i8 v1, v0, off offset:-64 ; encoding: 
[0x7c,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_i8 v1, v0, off offset:64 ; encoding: [0x7c,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_i8 v1, v0, s0 offset:-64 ; encoding: [0x00,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_i8 v1, v0, s0 offset:64 ; encoding: [0x00,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_i8 v1, v2, s1 ; encoding: [0x01,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x40,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_u16 v1, off, off offset:-64 ; encoding: [0x7c,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_u16 v1, off, off offset:64 ; encoding: [0x7c,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_u16 v1, off, s0 offset:-64 ; encoding: [0x00,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_u16 v1, off, s0 offset:64 ; encoding: [0x00,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_u16 v1, v0, off offset:-64 ; encoding: [0x7c,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_u16 v1, v0, off offset:64 ; encoding: [0x7c,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# 
GFX1250: scratch_load_u16 v1, v0, s0 offset:-64 ; encoding: [0x00,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_u16 v1, v0, s0 offset:64 ; encoding: [0x00,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x00,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_u16 v1, v2, s1 ; encoding: [0x01,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x80,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_u8 v1, off, off offset:-64 ; encoding: [0x7c,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_u8 v1, off, off offset:64 ; encoding: [0x7c,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_u8 v1, off, s0 offset:-64 ; encoding: [0x00,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_u8 v1, off, s0 offset:64 ; encoding: [0x00,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] +0x00,0x00,0x04,0xed,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_u8 v1, v0, off offset:-64 ; encoding: [0x7c,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_u8 v1, v0, off offset:64 ; encoding: [0x7c,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] +0x7c,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_u8 v1, v0, s0 offset:-64 ; encoding: [0x00,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff] +0x00,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_load_u8 v1, v0, s0 offset:64 ; encoding: [0x00,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00] 
+0x00,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_load_u8 v1, v2, s1 ; encoding: [0x01,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0x01,0x00,0x04,0xed,0x01,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_store_b128 off, v[2:5], off offset:-64 ; encoding: [0x7c,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b128 off, v[2:5], off offset:64 ; encoding: [0x7c,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b128 off, v[2:5], s0 offset:-64 ; encoding: [0x00,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b128 off, v[2:5], s0 offset:64 ; encoding: [0x00,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b128 v0, v[2:5], off offset:-64 ; encoding: [0x7c,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b128 v0, v[2:5], off offset:64 ; encoding: [0x7c,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b128 v0, v[2:5], s0 offset:-64 ; encoding: [0x00,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b128 v0, v[2:5], s0 offset:64 ; encoding: [0x00,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b128 v1, v[2:5], s3 ; encoding: [0x03,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00] +0x03,0x40,0x07,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: 
scratch_store_b16 off, v2, off offset:-64 ; encoding: [0x7c,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b16 off, v2, off offset:64 ; encoding: [0x7c,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b16 off, v2, s0 offset:-64 ; encoding: [0x00,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b16 off, v2, s0 offset:64 ; encoding: [0x00,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b16 v0, v2, off offset:-64 ; encoding: [0x7c,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b16 v0, v2, off offset:64 ; encoding: [0x7c,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b16 v0, v2, s0 offset:-64 ; encoding: [0x00,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b16 v0, v2, s0 offset:64 ; encoding: [0x00,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b16 v1, v2, s3 ; encoding: [0x03,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00] +0x03,0x40,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: scratch_store_b32 off, v2, off ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] +0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00 + +# GFX1250: scratch_store_b32 off, v2, off offset:-64 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] 
+0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b32 off, v2, off offset:2047 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xff,0x07,0x00] +0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xff,0x07,0x00 + +# GFX1250: scratch_store_b32 off, v2, off offset:64 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b32 off, v2, s0 offset:-64 ; encoding: [0x00,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b32 off, v2, s0 offset:64 ; encoding: [0x00,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b32 off, v2, s3 offset:2047 ; encoding: [0x03,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xff,0x07,0x00] +0x03,0x80,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xff,0x07,0x00 + +# GFX1250: scratch_store_b32 v0, v2, off offset:-64 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b32 v0, v2, off offset:64 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b32 v0, v2, s0 offset:-64 ; encoding: [0x00,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x00,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b32 v0, v2, s0 offset:64 ; encoding: [0x00,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x00,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b32 v1, v2, off offset:2047 ; encoding: [0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x07,0x00] +0x7c,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x07,0x00 + +# GFX1250: scratch_store_b32 v1, v2, 
s1 offset:-4095 ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x01,0xf0,0xff] +0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x01,0xf0,0xff + +# GFX1250: scratch_store_b32 v1, v2, s1 offset:-61440 ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x10,0xff] +0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x10,0xff + +# GFX1250: scratch_store_b32 v1, v2, s1 offset:4095 ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x0f,0x00] +0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x0f,0x00 + +# GFX1250: scratch_store_b32 v1, v2, s1 offset:61440 ; encoding: [0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0xf0,0x00] +0x01,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0xf0,0x00 + +# GFX1250: scratch_store_b32 v1, v2, s3 ; encoding: [0x03,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00] +0x03,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: scratch_store_b32 v1, v2, s3 offset:2047 ; encoding: [0x03,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x07,0x00] +0x03,0x80,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0xff,0x07,0x00 + +# GFX1250: scratch_store_b64 off, v[2:3], off offset:-64 ; encoding: [0x7c,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b64 off, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b64 off, v[2:3], s0 offset:-64 ; encoding: [0x00,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b64 off, v[2:3], s0 offset:64 ; encoding: [0x00,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b64 v0, v[2:3], off offset:-64 ; encoding: 
[0x7c,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x7c,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b64 v0, v[2:3], off offset:64 ; encoding: [0x7c,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x7c,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b64 v0, v[2:3], s0 offset:-64 ; encoding: [0x00,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x00,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b64 v0, v[2:3], s0 offset:64 ; encoding: [0x00,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x00,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b64 v1, v[2:3], s3 ; encoding: [0x03,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00] +0x03,0xc0,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: scratch_store_b8 off, v2, off offset:-64 ; encoding: [0x7c,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b8 off, v2, off offset:64 ; encoding: [0x7c,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b8 off, v2, s0 offset:-64 ; encoding: [0x00,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b8 off, v2, s0 offset:64 ; encoding: [0x00,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x06,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b8 v0, v2, off offset:-64 ; encoding: [0x7c,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b8 v0, v2, off offset:64 ; encoding: [0x7c,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] 
+0x7c,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b8 v0, v2, s0 offset:-64 ; encoding: [0x00,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b8 v0, v2, s0 offset:64 ; encoding: [0x00,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b8 v1, v2, s3 ; encoding: [0x03,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00] +0x03,0x00,0x06,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: scratch_store_b96 off, v[2:4], off offset:-64 ; encoding: [0x7c,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b96 off, v[2:4], off offset:64 ; encoding: [0x7c,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b96 off, v[2:4], s0 offset:-64 ; encoding: [0x00,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b96 off, v[2:4], s0 offset:64 ; encoding: [0x00,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x07,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b96 v0, v[2:4], off offset:-64 ; encoding: [0x7c,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_b96 v0, v[2:4], off offset:64 ; encoding: [0x7c,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b96 v0, v[2:4], s0 offset:-64 ; encoding: [0x00,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: 
scratch_store_b96 v0, v[2:4], s0 offset:64 ; encoding: [0x00,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_b96 v1, v[2:4], s3 ; encoding: [0x03,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00] +0x03,0x00,0x07,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: scratch_store_block off, v[2:33], off offset:-64 ; encoding: [0x7c,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_block off, v[2:33], off offset:64 ; encoding: [0x7c,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_block off, v[2:33], s0 offset:-64 ; encoding: [0x00,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_block off, v[2:33], s0 offset:64 ; encoding: [0x00,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x15,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_block v0, v[2:33], off offset:-64 ; encoding: [0x7c,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_block v0, v[2:33], off offset:64 ; encoding: [0x7c,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_block v0, v[2:33], s0 offset:-64 ; encoding: [0x00,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_block v0, v[2:33], s0 offset:64 ; encoding: [0x00,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_block v1, v[2:33], s3 ; encoding: 
[0x03,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00] +0x03,0x00,0x15,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: scratch_store_block v1, v[2:33], s3 th:TH_STORE_HT scope:SCOPE_SE ; encoding: [0x03,0x00,0x15,0xed,0x00,0x00,0x26,0x01,0x01,0x00,0x00,0x00] +0x03,0x00,0x15,0xed,0x00,0x00,0x26,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: scratch_store_d16_hi_b16 off, v2, off offset:-64 ; encoding: [0x7c,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_d16_hi_b16 off, v2, off offset:64 ; encoding: [0x7c,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_d16_hi_b16 off, v2, s0 offset:-64 ; encoding: [0x00,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_d16_hi_b16 off, v2, s0 offset:64 ; encoding: [0x00,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_d16_hi_b16 v0, v2, off offset:-64 ; encoding: [0x7c,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_d16_hi_b16 v0, v2, off offset:64 ; encoding: [0x7c,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_d16_hi_b16 v0, v2, s0 offset:-64 ; encoding: [0x00,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x00,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_d16_hi_b16 v0, v2, s0 offset:64 ; encoding: [0x00,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x00,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_d16_hi_b16 v1, v2, s3 ; encoding: 
[0x03,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00] +0x03,0x40,0x09,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00 + +# GFX1250: scratch_store_d16_hi_b8 off, v2, off offset:-64 ; encoding: [0x7c,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_d16_hi_b8 off, v2, off offset:64 ; encoding: [0x7c,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_d16_hi_b8 off, v2, s0 offset:-64 ; encoding: [0x00,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_d16_hi_b8 off, v2, s0 offset:64 ; encoding: [0x00,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x09,0xed,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_d16_hi_b8 v0, v2, off offset:-64 ; encoding: [0x7c,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_d16_hi_b8 v0, v2, off offset:64 ; encoding: [0x7c,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x7c,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_d16_hi_b8 v0, v2, s0 offset:-64 ; encoding: [0x00,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff] +0x00,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0xc0,0xff,0xff + +# GFX1250: scratch_store_d16_hi_b8 v0, v2, s0 offset:64 ; encoding: [0x00,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00] +0x00,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x00,0x40,0x00,0x00 + +# GFX1250: scratch_store_d16_hi_b8 v1, v2, s3 ; encoding: [0x03,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00] +0x03,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00 + # GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] 
0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 From 703501e66198c6d4be48773b617c784370e23d4a Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 16 Jul 2025 15:06:37 -0700 Subject: [PATCH 109/813] [AMDGPU] Select flat GVS loads on gfx1250 (#149183) --- llvm/lib/Target/AMDGPU/FLATInstructions.td | 110 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 +- llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 2405 +++++++++++++++++++ 3 files changed, 2481 insertions(+), 38 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 06e23dbb92450..3965b5dd8c5c3 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1250,6 +1250,16 @@ class GlobalLoadSaddrPat_D16 ; +class FlatLoadSaddrPat_D16 : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), + (inst $saddr, $voffset, $offset, (i32 0), $in) +>; + +class FlatLoadSaddrPat_D16_t16 : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), + (inst $saddr, $voffset, $offset, (i32 0)) +>; + class GlobalLoadSaddrPat_D16_t16 : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, (i32 0)) @@ -1260,7 +1270,7 @@ class FlatLoadSignedPat (inst $vaddr, $offset) >; -class GlobalLoadSaddrPat : GCNPat < +class FlatLoadSaddrPat : GCNPat < (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), (inst $saddr, $voffset, $offset, 0) >; @@ -1444,7 +1454,7 @@ multiclass GlobalFLATLoadPats(!cast(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1454,7 +1464,7 @@ multiclass GlobalFLATLoadPats_D16(!cast(inst)#"_SADDR"), node, vt> { + def : FlatLoadSaddrPat_D16(!cast(inst)#"_SADDR"), node, vt> { 
let AddedComplexity = 11; } } @@ -1618,32 +1628,60 @@ multiclass ScratchFLATLoadPats_D16_t16 { + def : FlatLoadPat ; + + def : FlatLoadSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16 { + def : FlatLoadPat_D16 ; + + def : FlatLoadSaddrPat_D16(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatLoadPats_D16_t16 { + def : FlatLoadPat_D16_t16 ; + + def : FlatLoadSaddrPat_D16_t16(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + let OtherPredicates = [HasFlatAddressSpace] in { -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; -def : FlatLoadPat ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in let True16Predicate = p in { - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; - def : FlatLoadPat ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; + defm : FlatLoadPats ; def : FlatStorePat ; def : FlatStorePat ; def : FlatStorePat ; @@ -1651,28 +1689,28 @@ let True16Predicate = p in { } let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in 
{ - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; - def : FlatLoadPat_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; + defm : FlatLoadPats_D16_t16; def : FlatStorePat ; def : FlatStorePat ; def : FlatStorePat ; def : FlatStorePat ; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts -def : FlatLoadPat ; -def : FlatLoadPat ; +defm : FlatLoadPats ; +defm : FlatLoadPats ; def : FlatStorePat ; def : FlatStorePat ; foreach vt = Reg32Types.types in { -def : FlatLoadPat ; +defm : FlatLoadPats ; def : FlatStorePat ; } @@ -1684,7 +1722,7 @@ def : FlatLoadPat ; def : FlatStorePat ; foreach vt = VReg_128.RegTypes in { -def : FlatLoadPat ; +defm : FlatLoadPats ; def : FlatStorePat ; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index a1e14d90ebcab..6109a2c4dfc7f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6460,7 +6460,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldSAddrIdx < 0) return false; - assert(isSegmentSpecificFLAT(Inst)); + assert(isSegmentSpecificFLAT(Inst) || (isFLAT(Inst) && ST.hasFlatGVSMode())); int NewOpc = AMDGPU::getGlobalVaddrOp(Opc); if (NewOpc < 0) @@ -6537,7 +6537,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { // FIXME: Remove this when SelectionDAG is obsoleted. void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const { - if (!isSegmentSpecificFLAT(MI)) + if (!isSegmentSpecificFLAT(MI) && !ST.hasFlatGVSMode()) return; // Fixup SGPR operands in VGPRs. 
We only select these when the DAG divergence diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll new file mode 100644 index 0000000000000..f0988a17b35f0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -0,0 +1,2405 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +; Test using saddr addressing mode of flat_*load_* instructions. + +; -------------------------------------------------------------------------------- +; No vgpr offset, constants +; -------------------------------------------------------------------------------- + +; SGPR base only +define amdgpu_ps float @flat_load_saddr_i8_offset_0(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %load = load i8, ptr %sbase + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx1250 immediate offset +define amdgpu_ps float @flat_load_saddr_i8_offset_8388607(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_8388607: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388607 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum gfx1250 immediate offset + 1 +define 
amdgpu_ps float @flat_load_saddr_i8_offset_8388608(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_8388608: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0x800000 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388608 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx1250 immediate offset +define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388608(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_neg8388608: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388608 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -8388608 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; SGPR base with maximum negative gfx1250 immediate offset -1 +define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg8388609: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0xff800000, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg8388609: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0xff7fffff +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: 
flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -8388609 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, 0xff800000 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967295 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000000: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, 1 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000000: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967296 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase) { +; 
GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000001: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000001: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 1 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294967297 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000FFF: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4095 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000FFF: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0xfff +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = 
getelementptr inbounds i8, ptr %sbase, i64 4294971391 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100001000: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:4096 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100001000: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0x1000 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, 1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 4294971392 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0x800000, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-8388607 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 1 +; GFX1250-GISEL-NEXT: 
s_add_co_ci_u32 s1, s3, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -4294967295 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0x100000000: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_add_co_i32 s3, s3, -1 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0x100000000: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, 0 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -4294967296 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbase) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0x100000001: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, s0, 0, s2 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, -1, s3, s0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:-1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: 
; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0x100000001: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s2, -1 +; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s3, -2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 -4294967297 + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Basic addressing patterns +; -------------------------------------------------------------------------------- + +; Basic pattern, no immediate offset. +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum positive offset on gfx1250 +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388607: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 8388607 + %load = load i8, 
ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum positive offset on gfx1250 + 1 +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800000, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 8388608 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum negative offset on gfx1250 +define 
amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388608(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_neg8388608: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388608 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -8388608 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum negative offset on gfx1250 - 1 +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388607(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_neg8388607: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -8388607 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:8388607 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 8388607 + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 %zext.offset + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; pointer addressing done in 
integers +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add = add i64 %sbase.as.int, %zext.offset + %dirty.gep = inttoptr i64 %add to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; zext forced to LHS of addressing expression +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %dirty.gep = inttoptr i64 %add to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; zext forced to LHS of addressing expression, with immediate offset +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %add.immoffset = add i64 %add, 128 + %dirty.gep = inttoptr i64 %add.immoffset to ptr + %load = load i8, ptr %dirty.gep + 
%zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; zext forced to LHS of addressing expression, with immediate offset in non-canonical position +define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add.immoffset = add i64 %sbase.as.int, 128 + %add = add i64 %zext.offset, %add.immoffset + %dirty.gep = inttoptr i64 %add to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Uniformity edge cases +; -------------------------------------------------------------------------------- + +@ptr.in.lds = internal addrspace(3) global ptr undef + +; Base pointer is uniform, but also in VGPRs +define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_uniform_ptr_in_vgprs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v1 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_uniform_ptr_in_vgprs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v1 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: 
v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Base pointer is uniform, but also in VGPRs, with imm offset +define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v1 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[0:1] offset:42 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v1 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:42 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 42 
+ %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both 64-bit base and 32-bit offset are scalar +define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, i32 inreg %soffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both 64-bit base and 32-bit offset are scalar, with immediate offset. +define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inreg %sbase, i32 inreg %soffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-24 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -24 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both components uniform, zext forced to LHS of addressing expression +define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 inreg %soffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset 
to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %dirty.gep = inttoptr i64 %add to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both components uniform, zext forced to LHS of addressing expression, with immediate offset +define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 inreg %soffset) { +; GFX1250-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v0, s4 +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %sbase.as.int = ptrtoint ptr %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %add.immoffset = add i64 %add, 128 + %dirty.gep = inttoptr i64 %add.immoffset to ptr + %load = load i8, ptr %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; divergent 64-bit base, 32-bit scalar offset. 
+define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffset) { +; GFX1250-SDAG-LABEL: flat_load_i8_vgpr64_sgpr32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_i8_vgpr64_sgpr32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_mov_b32 s3, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, ptr %vbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; divergent 64-bit base, 32-bit scalar offset, with imm offset +define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i32 inreg %soffset) { +; GFX1250-SDAG-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_mov_b32 s3, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[2:3] +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] offset:8388607 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: 
s_mov_b32 s3, 0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] offset:8388607 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, ptr %vbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 8388607 + %load = load i8, ptr %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Natural addressing shifts with restricted range +; -------------------------------------------------------------------------------- + +; Cannot push the shift into 32-bits, and cannot match. 
+define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, ptr %voffset.ptr) { +; GFX1250-SDAG-LABEL: flat_load_saddr_f32_natural_addressing: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3] +; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_f32_natural_addressing: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset + %load = load float, ptr %gep + ret float %load +} + +; Cannot push the shift into 32-bits, with an immediate offset. 
+define amdgpu_ps float @flat_load_saddr_f32_natural_addressing_immoffset(ptr inreg %sbase, ptr %voffset.ptr) { +; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing_immoffset: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 128 + %load = load float, ptr %gep1 + ret float %load +} + +; Range is sufficiently restricted to push the shift into 32-bits. +define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, ptr %voffset.ptr) { +; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %voffset = load i32, ptr %voffset.ptr, !range !0, !noundef !{} + %zext.offset = zext i32 %voffset to i64 + %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset + %load = load float, ptr %gep + ret float %load +} + +; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset +define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg %sbase, ptr %voffset.ptr) { +; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_imm_offset: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:400 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %voffset 
= load i32, ptr %voffset.ptr, !range !0, !noundef !{} + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds float, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds float, ptr %gep0, i64 100 + %load = load float, ptr %gep1 + ret float %load +} + +; Range is 1 beyond the limit where we can move the shift into 32-bits. +define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg %sbase, ptr %voffset.ptr) { +; GFX1250-SDAG-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 2, s[2:3] +; GFX1250-SDAG-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b64_e32 v[0:1], 2, v[0:1] +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v3, v1, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %voffset = load i32, ptr %voffset.ptr, !range !1, !noundef !{} + %zext.offset = zext i32 %voffset to i64 + %gep = getelementptr inbounds float, ptr %sbase, i64 %zext.offset + %load = load 
float, ptr %gep + ret float %load +} + +; -------------------------------------------------------------------------------- +; Stress various type loads +; -------------------------------------------------------------------------------- + +define amdgpu_ps half @flat_load_saddr_i16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %cast.load = bitcast i16 %load to half + ret half %cast.load +} + +define amdgpu_ps half @flat_load_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %cast.load = bitcast i16 %load to half + ret half %cast.load +} + +define amdgpu_ps half @flat_load_saddr_f16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load half, ptr %gep0 + ret half %load +} + +define amdgpu_ps half @flat_load_saddr_f16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_f16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; 
return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load half, ptr %gep1 + ret half %load +} + +define amdgpu_ps float @flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i32, ptr %gep0 + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps float @flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i32, ptr %gep1 + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps float @flat_load_saddr_f32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load float, ptr %gep0 + ret float %load +} + +define amdgpu_ps float @flat_load_saddr_f32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_f32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 
+; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load float, ptr %gep1 + ret float %load +} + +define amdgpu_ps <2 x half> @flat_load_saddr_v2i16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x i16>, ptr %gep0 + %cast.load = bitcast <2 x i16> %load to <2 x half> + ret <2 x half> %cast.load +} + +define amdgpu_ps <2 x half> @flat_load_saddr_v2i16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2i16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x i16>, ptr %gep1 + %cast.load = bitcast <2 x i16> %load to <2 x half> + ret <2 x half> %cast.load +} + +define amdgpu_ps <2 x half> @flat_load_saddr_v2f16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x half>, ptr %gep0 + ret <2 x half> %load +} + +define amdgpu_ps <2 x half> @flat_load_saddr_v2f16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2f16_immneg128: +; 
GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x half>, ptr %gep1 + ret <2 x half> %load +} + +define amdgpu_ps <2 x half> @flat_load_saddr_p3(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_p3: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load ptr addrspace(3), ptr %gep0 + %cast.load0 = ptrtoint ptr addrspace(3) %load to i32 + %cast.load1 = bitcast i32 %cast.load0 to <2 x half> + ret <2 x half> %cast.load1 +} + +define amdgpu_ps <2 x half> @flat_load_saddr_p3_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_p3_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load ptr addrspace(3), ptr %gep1 + %cast.load0 = ptrtoint ptr addrspace(3) %load to i32 + %cast.load1 = bitcast i32 %cast.load0 to <2 x half> + ret <2 x half> %cast.load1 +} + +define amdgpu_ps <2 x float> @flat_load_saddr_f64(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] +; 
GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load double, ptr %gep0 + %cast.load = bitcast double %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_f64_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_f64_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_f64_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 
-128 + %load = load double, ptr %gep1 + %cast.load = bitcast double %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i64, ptr %gep0 + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i64_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i64_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i64, ptr %gep1 + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v2f32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_v2f32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_v2f32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x float>, ptr %gep0 + ret <2 x float> %load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v2f32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_v2f32_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_v2f32_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x float>, ptr %gep1 + ret <2 x float> %load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v2i32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_v2i32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_v2i32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 
%voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x i32>, ptr %gep0 + %cast.load = bitcast <2 x i32> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v2i32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_v2i32_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_v2i32_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x i32>, ptr %gep1 + %cast.load = bitcast <2 x i32> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v4i16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_v4i16: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; 
GFX1250-GISEL-LABEL: flat_load_saddr_v4i16: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <4 x i16>, ptr %gep0 + %cast.load = bitcast <4 x i16> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v4i16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_v4i16_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_v4i16_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <4 x i16>, ptr %gep1 + %cast.load = bitcast <4 x i16> %load to <2 x 
float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v4f16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_v4f16: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_v4f16: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <4 x half>, ptr %gep0 + %cast.load = bitcast <4 x half> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_v4f16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_v4f16_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_v4f16_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: 
v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <4 x half>, ptr %gep1 + %cast.load = bitcast <4 x half> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @flat_load_saddr_p1(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_p1: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_p1: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load ptr, ptr %gep0 + %cast.load0 = ptrtoint ptr %load to i64 + %cast.load1 = bitcast i64 %cast.load0 to <2 x float> + ret <2 x float> %cast.load1 +} + +define amdgpu_ps <2 x float> @flat_load_saddr_p1_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_p1_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_p1_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load ptr, ptr %gep1 + %cast.load0 = ptrtoint ptr %load to i64 + %cast.load1 = bitcast i64 %cast.load0 to <2 x float> + ret <2 x float> %cast.load1 +} + +define amdgpu_ps <3 x float> @flat_load_saddr_v3f32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v3f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <3 x float>, ptr %gep0 + ret <3 x float> %load +} + +define amdgpu_ps <3 x float> @flat_load_saddr_v3f32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v3f32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr 
%sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <3 x float>, ptr %gep1 + ret <3 x float> %load +} + +define amdgpu_ps <3 x float> @flat_load_saddr_v3i32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v3i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <3 x i32>, ptr %gep0 + %cast.load = bitcast <3 x i32> %load to <3 x float> + ret <3 x float> %cast.load +} + +define amdgpu_ps <3 x float> @flat_load_saddr_v3i32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v3i32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <3 x i32>, ptr %gep1 + %cast.load = bitcast <3 x i32> %load to <3 x float> + ret <3 x float> %cast.load +} + +define amdgpu_ps <6 x half> @flat_load_saddr_v6f16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v6f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <6 x half>, ptr %gep0 + ret <6 x half> %load +} + +define amdgpu_ps <6 x half> @flat_load_saddr_v6f16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v6f16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b96 v[0:2], v0, s[2:3] offset:-128 +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <6 x half>, ptr %gep1 + ret <6 x half> %load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4f32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <4 x float>, ptr %gep0 + ret <4 x float> %load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4f32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4f32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <4 x float>, ptr %gep1 + ret <4 x float> %load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4i32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <4 x i32>, ptr %gep0 + %cast.load = bitcast <4 x i32> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4i32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4i32_immneg128: +; 
GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <4 x i32>, ptr %gep1 + %cast.load = bitcast <4 x i32> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v2i64(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x i64>, ptr %gep0 + %cast.load = bitcast <2 x i64> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v2i64_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2i64_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x i64>, ptr %gep1 + %cast.load = bitcast <2 x i64> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_i128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = 
load i128, ptr %gep0 + %cast.load = bitcast i128 %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_i128_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i128_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i128, ptr %gep1 + %cast.load = bitcast i128 %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v2p1(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2p1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <2 x ptr>, ptr %gep0 + %cast.load0 = ptrtoint <2 x ptr> %load to <2 x i64> + %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v2p1_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v2p1_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <2 x ptr>, ptr %gep1 + %cast.load0 = ptrtoint <2 x ptr> %load to <2 x i64> + %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +define amdgpu_ps <4 x float> 
@flat_load_saddr_v4p3(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4p3: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load <4 x ptr addrspace(3)>, ptr %gep0 + %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32> + %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +define amdgpu_ps <4 x float> @flat_load_saddr_v4p3_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_v4p3_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b128 v[0:3], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load <4 x ptr addrspace(3)>, ptr %gep1 + %cast.load0 = ptrtoint <4 x ptr addrspace(3)> %load to <4 x i32> + %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +; -------------------------------------------------------------------------------- +; Extending loads +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_sextload_saddr_i8(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_sextload_saddr_i8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %sextload = sext i8 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define 
amdgpu_ps float @flat_sextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_sextload_saddr_i8_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %sextload = sext i8 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_sextload_saddr_i16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_sextload_saddr_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_i16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %sextload = sext i16 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_sextload_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_sextload_saddr_i16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_i16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %sextload = sext i16 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_zextload_saddr_i8(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_zextload_saddr_i8: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zextload = zext i8 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_zextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_zextload_saddr_i8_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %zextload = zext i8 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_zextload_saddr_i16(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_zextload_saddr_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %zextload = zext i16 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + +define amdgpu_ps float @flat_zextload_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_zextload_saddr_i16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %zextload = zext i16 %load to i32 + %cast.load = 
bitcast i32 %zextload to float + ret float %cast.load +} + +; -------------------------------------------------------------------------------- +; Atomic load +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @atomic_flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: atomic_flat_load_saddr_i32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load atomic i32, ptr %gep0 seq_cst, align 4 + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps float @atomic_flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: atomic_flat_load_saddr_i32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v0, s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load atomic i32, ptr %gep1 seq_cst, align 4 + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: atomic_flat_load_saddr_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to 
i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load atomic i64, ptr %gep0 seq_cst, align 8 + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: atomic_flat_load_saddr_i64_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load atomic i64, ptr %gep1 seq_cst, align 8 + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +; -------------------------------------------------------------------------------- +; D16 load (low 16) +; -------------------------------------------------------------------------------- + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16lo_undef_hi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> undef, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16lo_undef_hi_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> undef, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16lo_zero_hi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] +; 
GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> %reg, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> %reg, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: +; 
GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define 
amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader 
part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +; -------------------------------------------------------------------------------- +; D16 hi load (hi16) +; -------------------------------------------------------------------------------- + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16hi_undef_hi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> undef, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> undef, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg 
%sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, 0, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> 
@flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i16, ptr %gep0 + %build = insertelement <2 x i16> %reg, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u16 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr 
%sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i16, ptr %gep1 + %build = insertelement <2 x i16> %reg, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v0, s[2:3] offset:-128 
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %load = load i8, ptr %gep0 + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; 
GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: flat_load_i8 v0, v0, s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1250-GISEL-NEXT: v_and_or_b32 v0, 0xffff, v1, v0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %load = load i8, ptr %gep1 + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +; -------------------------------------------------------------------------------- +; or-with-constant as add +; -------------------------------------------------------------------------------- + +; Check add-as-or with split 64-bit or. 
+define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrspace(6) inreg %sbase, i32 %idx) { +; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_bitop2_b32 v0, 16, v0 bitop3:0x54 +; GFX1250-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.idx = zext i32 %idx to i64 + %or = or i64 %zext.idx, 16 + %addr = inttoptr i64 %or to ptr + %load = load i8, ptr %addr + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) { +; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, 0x1040, v0 +; GFX1250-GISEL-NEXT: flat_load_u8 v0, v[0:1] +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %zext.idx = zext i32 %idx to i64 + %or = or i64 %zext.idx, 4160 + %addr = inttoptr i64 %or to ptr + %load = load i8, ptr %addr + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Full 64-bit scalar add. 
+; -------------------------------------------------------------------------------- + +define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) { +; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: .LBB116_1: ; %bb3 +; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB116_1 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: .LBB116_1: ; %bb3 +; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4 +; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 +; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB116_1 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2 +; GFX1250-GISEL-NEXT: s_endpgm +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i8, 
%bb3 ] + %i4 = zext i32 %i to i64 + %i5 = getelementptr inbounds float, ptr %arg, i64 %i4 + %i6 = load volatile float, ptr %i5, align 4 + %i8 = add nuw nsw i32 %i, 1 + %i9 = icmp eq i32 %i8, 256 + br i1 %i9, label %bb2, label %bb3 +} + +; Make sure we only have a single zero vaddr initialization. + +define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inreg %arg.1) { +; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv_multiload: +; GFX1250-SDAG: ; %bb.0: ; %bb +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-SDAG-NEXT: .LBB117_1: ; %bb3 +; GFX1250-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[4:5], s[2:3], s[0:1] +; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: flat_load_b32 v1, v0, s[4:5] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s0, 0x400 +; GFX1250-SDAG-NEXT: s_cbranch_scc0 .LBB117_1 +; GFX1250-SDAG-NEXT: ; %bb.2: ; %bb2 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload: +; GFX1250-GISEL: ; %bb.0: ; %bb +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 0 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX1250-GISEL-NEXT: .LBB117_1: ; %bb3 +; GFX1250-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 4 +; GFX1250-GISEL-NEXT: 
flat_load_b32 v6, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: flat_load_b32 v4, v[4:5] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x400, v2 +; GFX1250-GISEL-NEXT: s_cbranch_vccz .LBB117_1 +; GFX1250-GISEL-NEXT: ; %bb.2: ; %bb2 +; GFX1250-GISEL-NEXT: s_endpgm +bb: + br label %bb3 + +bb2: ; preds = %bb3 + ret void + +bb3: ; preds = %bb3, %bb + %i = phi i32 [ 0, %bb ], [ %i8, %bb3 ] + %i4 = zext i32 %i to i64 + %i5 = getelementptr inbounds float, ptr %arg, i64 %i4 + %i6 = load volatile float, ptr %i5, align 4 + %i5.1 = getelementptr inbounds float, ptr %arg.1, i64 %i4 + %i6.1 = load volatile float, ptr %i5, align 4 + %i8 = add nuw nsw i32 %i, 1 + %i9 = icmp eq i32 %i8, 256 + br i1 %i9, label %bb2, label %bb3 +} + +!0 = !{i32 0, i32 1073741824} ; (1 << 30) +!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1 From 038e80cfd37e948d78c70d5a258ffe424a438d51 Mon Sep 17 00:00:00 2001 From: Ryan Prichard Date: Wed, 16 Jul 2025 15:57:13 -0700 Subject: [PATCH 110/813] [libc++][Android] Update compiler and sysroot (#148998) * Upgrade from r536225 to r563880. * Upgrade from ab/12644632 to f8b85cc5262c6e5cbc9a92c1bab2b18b32a4c63f, the current HEAD commit of https://android.googlesource.com/platform/prebuilts/ndk/+/refs/heads/mirror-goog-main-ndk The previous source of sysroots (ci.android.com), deleted its artifacts after a short period of time, and is currently out-of-date because of the aosp-main turndown. Updating the Docker image also fixes two tests. 
--- .../meta.unary.comp/is_bounded_array.pass.cpp | 2 +- ...ue_object_representations.compile.pass.cpp | 2 +- libcxx/utils/ci/Dockerfile | 21 +++++++++---------- libcxx/utils/ci/docker-compose.yml | 6 +++--- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp index 09086a4c046d6..97e3afed1c036 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp @@ -8,7 +8,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // The Clang version that Android currently uses in the CI is too old. -// XFAIL: LIBCXX-ANDROID-FIXME +// UNSUPPORTED: LIBCXX-ANDROID-FIXME // type_traits diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp index 9aac871f2633f..ac63fec691377 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14 // The Clang version that Android currently uses in the CI is too old. 
-// XFAIL: LIBCXX-ANDROID-FIXME +// UNSUPPORTED: LIBCXX-ANDROID-FIXME // type_traits diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile index 0a1985b02807b..63ceceaa67635 100644 --- a/libcxx/utils/ci/Dockerfile +++ b/libcxx/utils/ci/Dockerfile @@ -184,7 +184,7 @@ FROM ubuntu:jammy AS android-builder-base ARG ANDROID_CLANG_VERSION ARG ANDROID_CLANG_PREBUILTS_COMMIT -ARG ANDROID_SYSROOT_BID +ARG ANDROID_SYSROOT_COMMIT RUN apt-get update && apt-get install -y curl bzip2 git unzip @@ -217,19 +217,18 @@ RUN < Date: Wed, 16 Jul 2025 19:06:03 -0400 Subject: [PATCH 111/813] [AMDGPU] Add support for `v_rsq_bf16` on gfx1250 (#149194) Co-authored-by: Mekhanoshin, Stanislav --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19 ++++ llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 + .../CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll | 95 +++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 45 +++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 48 ++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 56 +++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 60 ++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 12 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 16 ++++ .../gfx1250_asm_vop3_from_vop1-fake16.s | 45 +++++++++ .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 48 ++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 56 +++++++++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 60 ++++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 16 ++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 20 ++++ .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 63 ++++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 59 ++++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 15 +++ .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 64 +++++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 60 ++++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 
20 ++++ 23 files changed, 881 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 313c0e640d240..a80f571140666 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -671,6 +671,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index dcfdea648e93c..8d227a5f957c8 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -421,6 +421,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_rsq: case AMDGPU::BI__builtin_amdgcn_rsqf: case AMDGPU::BI__builtin_amdgcn_rsqh: + case AMDGPU::BI__builtin_amdgcn_rsq_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_rsq); case AMDGPU::BI__builtin_amdgcn_rsq_clamp: case AMDGPU::BI__builtin_amdgcn_rsq_clampf: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e50f02ad27357..8b7ec143a2e00 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -80,6 +80,25 @@ void test_rcp_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_rcp_bf16(a); } +// CHECK-LABEL: @test_rsq_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.rsq.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_rsq_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_rsq_bf16(a); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index e2f371079179d..6f8437e82700e 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -531,6 +531,7 @@ let SubtargetPredicate = HasBF16TransInsts in { defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; +defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; @@ -1141,6 +1142,7 @@ defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>; defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>; defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>; defm V_SQRT_BF16 
: VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; +defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll new file mode 100644 index 0000000000000..0a8a90422d1f2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; xUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=SDAG-FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GISEL-REAL16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GISEL-FAKE16 %s + +; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select. 
+; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.amdgcn.rsq.bf16(bfloat) #0 + +define amdgpu_kernel void @rsq_bf16(ptr addrspace(1) %out, bfloat %src) #1 { +; SDAG-REAL16-LABEL: rsq_bf16: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, s2 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: rsq_bf16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, s2 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat %src) #0 + store bfloat %rsq, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @rsq_bf16_constant_4(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: rsq_bf16_constant_4: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, 4.0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: rsq_bf16_constant_4: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, 4.0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat 4.0) #0 + store bfloat %rsq, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @rsq_bf16_constant_100(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: rsq_bf16_constant_100: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; 
SDAG-REAL16-NEXT: v_rsq_bf16_e32 v0.l, 0x42c8 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: rsq_bf16_constant_100: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_rsq_bf16_e32 v0, 0x42c8 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat 100.0) #0 + store bfloat %rsq, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @rsq_undef_bf16(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: rsq_undef_bf16: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: rsq_undef_bf16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_endpgm + %rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat undef) + store bfloat %rsq, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index c587b66e65011..467418874592a 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -163,6 +163,51 @@ v_sqrt_bf16 v5, src_scc v_sqrt_bf16 v127, 0x8000 // GFX1250: v_sqrt_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf4,0xfe,0x7e,0x00,0x80,0x00,0x00] +v_rsq_bf16 v5, v1 +// GFX1250: v_rsq_bf16_e32 v5, v1 ; encoding: [0x01,0xf7,0x0a,0x7e] + +v_rsq_bf16 v5, v127 +// GFX1250: v_rsq_bf16_e32 v5, v127 ; encoding: [0x7f,0xf7,0x0a,0x7e] + +v_rsq_bf16 v5, s1 +// GFX1250: v_rsq_bf16_e32 v5, s1 ; encoding: [0x01,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, s105 +// GFX1250: v_rsq_bf16_e32 v5, s105 ; encoding: [0x69,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, vcc_lo +// GFX1250: v_rsq_bf16_e32 v5, vcc_lo ; encoding: 
[0x6a,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, vcc_hi +// GFX1250: v_rsq_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, ttmp15 +// GFX1250: v_rsq_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, m0 +// GFX1250: v_rsq_bf16_e32 v5, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, exec_lo +// GFX1250: v_rsq_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, exec_hi +// GFX1250: v_rsq_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, null +// GFX1250: v_rsq_bf16_e32 v5, null ; encoding: [0x7c,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, -1 +// GFX1250: v_rsq_bf16_e32 v5, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, 0.5 +// GFX1250: v_rsq_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, src_scc +// GFX1250: v_rsq_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e] + +v_rsq_bf16 v127, 0x8000 +// GFX1250: v_rsq_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00] + v_cvt_f32_bf16 v5, v1 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index 719eb3abc02a3..1d90f3fe345a5 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -172,6 +172,54 @@ v_sqrt_bf16 v127, 0x8000 v_sqrt_bf16 v5.h, v1.h // GFX1250: v_sqrt_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf5,0x0a,0x7f] +v_rsq_bf16 v5, v1 +// GFX1250: v_rsq_bf16_e32 v5, v1 ; encoding: [0x01,0xf7,0x0a,0x7e] + +v_rsq_bf16 v5, v127 +// GFX1250: v_rsq_bf16_e32 v5, v127 ; encoding: [0x7f,0xf7,0x0a,0x7e] + +v_rsq_bf16 v5, s1 +// GFX1250: v_rsq_bf16_e32 v5, s1 ; encoding: [0x01,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, s105 +// GFX1250: v_rsq_bf16_e32 v5, s105 ; encoding: [0x69,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, vcc_lo +// GFX1250: v_rsq_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, vcc_hi +// GFX1250: v_rsq_bf16_e32 v5, vcc_hi ; 
encoding: [0x6b,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, ttmp15 +// GFX1250: v_rsq_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, m0 +// GFX1250: v_rsq_bf16_e32 v5, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, exec_lo +// GFX1250: v_rsq_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, exec_hi +// GFX1250: v_rsq_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, null +// GFX1250: v_rsq_bf16_e32 v5, null ; encoding: [0x7c,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, -1 +// GFX1250: v_rsq_bf16_e32 v5, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, 0.5 +// GFX1250: v_rsq_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e] + +v_rsq_bf16 v5, src_scc +// GFX1250: v_rsq_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e] + +v_rsq_bf16 v127, 0x8000 +// GFX1250: v_rsq_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00] + +v_rsq_bf16 v5.h, v1.h +// GFX1250: v_rsq_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf7,0x0a,0x7f] + v_cvt_f32_bf16 v5, v1 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s index 44859fcffe223..dd49e49e4b20b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s @@ -170,6 +170,62 @@ v_sqrt_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 f // GFX1250: v_sqrt_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rsq_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 quad_perm:[0,1,2,3] 
+// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_mirror +// GFX1250: v_rsq_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_half_mirror +// GFX1250: v_rsq_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shl:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shl:15 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shr:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shr:15 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_ror:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_ror:15 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_rsq_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s index 8fef387700972..3415e76188e78 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s @@ -182,6 +182,66 @@ v_sqrt_bf16 v5.h, v1.h quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7f,0x81,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU +v_rsq_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_rsq_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_mirror +// GFX1250: v_rsq_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_half_mirror +// GFX1250: v_rsq_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shl:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shl:15 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shr:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_shr:15 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_ror:1 +// 
GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_ror:15 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_rsq_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_rsq_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s index 28368456a35df..5cce927831f12 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -38,6 +38,18 @@ v_sqrt_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_sqrt_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf4,0xfe,0x7e,0x7f,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rsq_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rsq_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s index 1ed8f5faff3fc..5ba421bf014ac 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -50,6 +50,22 @@ v_sqrt_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xf4,0x0a,0x7f,0x81,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rsq_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rsq_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 4f7be4833681d..31daff336fd48 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -217,6 +217,51 @@ v_sqrt_bf16_e64 v5, src_scc mul:4 v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 // GFX1250: v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfa,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_rsq_bf16_e64 v5, v1 +// GFX1250: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] + +v_rsq_bf16_e64 v5, v255 +// GFX1250: v_rsq_bf16_e64 
v5, v255 ; encoding: [0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00] + +v_rsq_bf16_e64 v5, s1 +// GFX1250: v_rsq_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, s105 +// GFX1250: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, vcc_lo +// GFX1250: v_rsq_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, vcc_hi +// GFX1250: v_rsq_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, ttmp15 +// GFX1250: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, m0 +// GFX1250: v_rsq_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, exec_lo +// GFX1250: v_rsq_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, exec_hi +// GFX1250: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, null +// GFX1250: v_rsq_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, -1 +// GFX1250: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] + +v_rsq_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] + +v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 8b16e42566fde..d270a34a30275 100644 --- 
a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -226,6 +226,54 @@ v_sqrt_bf16_e64 v255, -|0x8000| clamp div:2 v_sqrt_bf16 v5.h, v128.h // GFX1250: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] +v_rsq_bf16_e64 v5, v1 +// GFX1250: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] + +v_rsq_bf16_e64 v5, v255 +// GFX1250: v_rsq_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00] + +v_rsq_bf16_e64 v5, s1 +// GFX1250: v_rsq_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, s105 +// GFX1250: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, vcc_lo +// GFX1250: v_rsq_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, vcc_hi +// GFX1250: v_rsq_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, ttmp15 +// GFX1250: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, m0 +// GFX1250: v_rsq_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, exec_lo +// GFX1250: v_rsq_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, exec_hi +// GFX1250: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, null +// GFX1250: v_rsq_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, -1 +// GFX1250: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] + +v_rsq_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] + +v_rsq_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] + 
+v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +v_rsq_bf16 v5.h, v128.h +// GFX1250: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] + v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index 2c2aef4940b57..d5b12002cb5ba 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -170,6 +170,62 @@ v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mas // GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: 
v_rsq_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index 1588b6b391198..70961a901ffb7 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -182,6 +182,66 @@ v_sqrt_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 
row_shr:15 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] 
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 0402565695975..c8469f4188738 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -50,6 +50,22 @@ v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 71cda1b36dd3c..f33ee9be04f82 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -62,6 +62,26 @@ v_sqrt_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_rsq_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index 739a2034a079e..35b4ebc6f87f8 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -215,6 +215,69 @@ 0x81,0xf5,0x0a,0x7f # GFX1250-REAL16: v_sqrt_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf5,0x0a,0x7f] +0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00] + +0xc1,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, -1 ; encoding: [0xc1,0xf6,0x0a,0x7e] + +0xf0,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, 0.5 ; encoding: 
[0xf0,0xf6,0x0a,0x7e] + +0x7f,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf6,0x0a,0x7e] + +0x7e,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf6,0x0a,0x7e] + +0x7d,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, m0 ; encoding: [0x7d,0xf6,0x0a,0x7e] + +0x7c,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, null ; encoding: [0x7c,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, null ; encoding: [0x7c,0xf6,0x0a,0x7e] + +0x01,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, s1 ; encoding: [0x01,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, s1 ; encoding: [0x01,0xf6,0x0a,0x7e] + +0x69,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, s105 ; encoding: [0x69,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, s105 ; encoding: [0x69,0xf6,0x0a,0x7e] + +0xfd,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf6,0x0a,0x7e] + +0x7b,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf6,0x0a,0x7e] + +0x01,0xf7,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xf7,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, v1 ; encoding: [0x01,0xf7,0x0a,0x7e] + +0x7f,0xf7,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xf7,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, v127 ; encoding: [0x7f,0xf7,0x0a,0x7e] + +0x6b,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, vcc_hi ; encoding: 
[0x6b,0xf6,0x0a,0x7e] + +0x6a,0xf6,0x0a,0x7e +# GFX1250-REAL16: v_rsq_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e] +# GFX1250-FAKE16: v_rsq_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf6,0x0a,0x7e] + +0x81,0xf7,0x0a,0x7f +# GFX1250-REAL16: v_rsq_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf7,0x0a,0x7f] + 0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00 # GFX1250: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt index 04b38093d30f6..0f98bced09d36 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -179,6 +179,65 @@ 0xfa,0xf4,0x0a,0x7f,0x81,0x1b,0x00,0xff # GFX1250-REAL16: v_sqrt_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf4,0x0a,0x7f,0x81,0x1b,0x00,0xff] +0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_rsq_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_rsq_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1b,0x00,0xff] + 
+0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf6,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff] + 0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30 # GFX1250: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index 58994519a5234..1be1451c0f3ed 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -50,6 +50,21 @@ # GFX1250-REAL16: v_sqrt_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf4,0x0a,0x7f,0x81,0x77,0x39,0x05] # GFX1250-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] +0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rsq_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rsq_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf6,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xf6,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_rsq_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7f,0x81,0x77,0x39,0x05] + 0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 89d9b02cdbd52..5285033fb34b9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt 
@@ -194,6 +194,70 @@ # GFX1250-REAL16: v_sqrt_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfa,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_sqrt_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfa,0xd5,0x80,0x01,0x00,0x00] +0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfb,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfb,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfb,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, null ; encoding: 
[0x05,0x00,0xfb,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfb,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfb,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfb,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfb,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfb,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfb,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfb,0xd5,0x6a,0x00,0x00,0x00] + 
+0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfb,0xd5,0x80,0x01,0x00,0x00] + 0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index 9e45aca0168d6..d546ddff9f28c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -2,6 +2,66 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s +0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 
row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# 
GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + 0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index 18959f8dec20a..ae5d331b2eea0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -2,6 +2,26 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck 
-check-prefixes=GFX1250-FAKE16 %s +0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfb,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + 0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp 
div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] From ad6d5d28215adb3def221517b1490b8df3fd1190 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Wed, 16 Jul 2025 19:09:34 -0400 Subject: [PATCH 112/813] [AMDGPU] Add support for `v_log_bf16` on gfx1250 (#149201) Co-authored-by: Mekhanoshin, Stanislav --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19 ++ llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 + llvm/test/CodeGen/AMDGPU/bf16-math.ll | 28 ++ .../CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll | 33 +++ llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll | 240 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 45 ++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 48 ++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 56 ++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 60 +++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 12 + llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 16 ++ .../gfx1250_asm_vop3_from_vop1-fake16.s | 45 ++++ .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 48 ++++ .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 56 ++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 60 +++++ .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 16 ++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 20 ++ .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 66 +++++ .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 59 +++++ .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 15 ++ .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 64 +++++ .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 60 +++++ .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 20 ++ 25 files changed, 1090 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/bf16-math.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll 
create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index a80f571140666..eee0a94f6fc64 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -672,6 +672,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 8d227a5f957c8..0312205d4ff8d 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -436,6 +436,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_dispatch_ptr: return EmitAMDGPUDispatchPtr(*this, E); case AMDGPU::BI__builtin_amdgcn_logf: + case AMDGPU::BI__builtin_amdgcn_log_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log); case AMDGPU::BI__builtin_amdgcn_exp2f: return emitBuiltinWithOneOverloadedType<1>(*this, E, diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 8b7ec143a2e00..bdf169a1a97da 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -99,6 +99,25 @@ void test_rsq_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_rsq_bf16(a); } +// CHECK-LABEL: @test_log_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.log.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_log_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_log_bf16(a); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 6f8437e82700e..e1bc39302e126 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -532,6 +532,7 @@ defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; +defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; @@ -1143,6 +1144,7 @@ defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>; defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>; 
defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; +defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll new file mode 100644 index 0000000000000..05eee2d4d549d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s + +; TODO: Add global-isel when it can support bf16 + +define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) { +; GCN-LABEL: llvm_log2_bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_log_bf16_e32 v2, v2 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %log = call bfloat @llvm.log2.bf16(bfloat %src) + store bfloat %log, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) { +; GCN-LABEL: llvm_log2_bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: v_log_bf16_e32 v2, s0 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %log = call bfloat @llvm.log2.bf16(bfloat %src) + store bfloat %log, ptr addrspace(1) %out, align 2 + ret void +} + +declare bfloat @llvm.log2.bf16(bfloat) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll new file mode 100644 index 0000000000000..a8b2077f5a35b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll @@ -0,0 +1,33 @@ +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat 
@llvm.amdgcn.log.bf16(bfloat) #0 + +; GCN-LABEL: {{^}}log_bf16: +; GCN: v_log_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} +define amdgpu_kernel void @log_bf16(ptr addrspace(1) %out, bfloat %src) #1 { + %log = call bfloat @llvm.amdgcn.log.bf16(bfloat %src) #0 + store bfloat %log, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}log_bf16_constant_4 +; GCN: v_log_bf16_e32 v0, 4.0 +define amdgpu_kernel void @log_bf16_constant_4(ptr addrspace(1) %out) #1 { + %log = call bfloat @llvm.amdgcn.log.bf16(bfloat 4.0) #0 + store bfloat %log, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}log_bf16_constant_100 +; GCN: v_log_bf16_e32 {{v[0-9]+}}, 0x42c8 +define amdgpu_kernel void @log_bf16_constant_100(ptr addrspace(1) %out) #1 { + %log = call bfloat @llvm.amdgcn.log.bf16(bfloat 100.0) #0 + store bfloat %log, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll new file mode 100644 index 0000000000000..5bd9fa6f23aa0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GFX-SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GFX-SDAG-FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GFX-GISEL-TRUE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GFX-GISEL-FAKE16 %s + +define bfloat @v_log2_bf16(bfloat %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call bfloat @llvm.log2.bf16(bfloat %in) + ret bfloat %result +} + +define bfloat @v_log2_fabs_bf16(bfloat %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fabs_bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, |v0.l| +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fabs_bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, |v0| +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %in) + %result = call bfloat @llvm.log2.bf16(bfloat %fabs) + ret bfloat %result +} + +define bfloat @v_log2_fneg_fabs_bf16(bfloat %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -|v0.l| +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -|v0| +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %in) + %fneg.fabs = fneg bfloat %fabs + %result = call bfloat @llvm.log2.bf16(bfloat %fneg.fabs) + ret bfloat %result +} + +define bfloat 
@v_log2_fneg_bf16(bfloat %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fneg = fneg bfloat %in + %result = call bfloat @llvm.log2.bf16(bfloat %fneg) + ret bfloat %result +} + +define bfloat @v_log2_bf16_fast(bfloat %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_bf16_fast: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_bf16_fast: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call fast bfloat @llvm.log2.bf16(bfloat %in) + ret bfloat %result +} + +define <2 x bfloat> @v_log2_v2bf16(<2 x bfloat> %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_v2bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v0.h +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_v2bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1 +; GFX-SDAG-FAKE16-NEXT: v_nop +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_log2_fabs_v2bf16(<2 x bfloat> %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fabs_v2bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 +; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v1.l +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v2.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fabs_v2bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: v_nop +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) + %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fabs) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_log2_fneg_fabs_v2bf16(<2 x bfloat> %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; 
GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 +; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v1.l +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -v2.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_v2bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -v1 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0 +; GFX-SDAG-FAKE16-NEXT: v_nop +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) + %fneg.fabs = fneg <2 x bfloat> %fabs + %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fneg.fabs) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_log2_fneg_v2bf16(<2 x bfloat> %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_v2bf16: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -v0.h +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_v2bf16: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0 +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; 
GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -v1 +; GFX-SDAG-FAKE16-NEXT: v_nop +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fneg = fneg <2 x bfloat> %in + %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fneg) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_log2_v2bf16_fast(<2 x bfloat> %in) { +; GFX-SDAG-TRUE16-LABEL: v_log2_v2bf16_fast: +; GFX-SDAG-TRUE16: ; %bb.0: +; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v0.h +; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l +; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX-SDAG-FAKE16-LABEL: v_log2_v2bf16_fast: +; GFX-SDAG-FAKE16: ; %bb.0: +; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0 +; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1 +; GFX-SDAG-FAKE16-NEXT: v_nop +; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call fast <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %result +} + +declare bfloat @llvm.log2.bf16(bfloat) #0 +declare <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat>) #0 +declare bfloat @llvm.fabs.bf16(bfloat) #0 +declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index 467418874592a..0f5ce56f1a2cf 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -208,6 +208,51 @@ v_rsq_bf16 v5, src_scc v_rsq_bf16 v127, 0x8000 // GFX1250: v_rsq_bf16_e32 v127, 
0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00] +v_log_bf16 v5, v1 +// GFX1250: v_log_bf16_e32 v5, v1 ; encoding: [0x01,0xf9,0x0a,0x7e] + +v_log_bf16 v5, v127 +// GFX1250: v_log_bf16_e32 v5, v127 ; encoding: [0x7f,0xf9,0x0a,0x7e] + +v_log_bf16 v5, s1 +// GFX1250: v_log_bf16_e32 v5, s1 ; encoding: [0x01,0xf8,0x0a,0x7e] + +v_log_bf16 v5, s105 +// GFX1250: v_log_bf16_e32 v5, s105 ; encoding: [0x69,0xf8,0x0a,0x7e] + +v_log_bf16 v5, vcc_lo +// GFX1250: v_log_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e] + +v_log_bf16 v5, vcc_hi +// GFX1250: v_log_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e] + +v_log_bf16 v5, ttmp15 +// GFX1250: v_log_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e] + +v_log_bf16 v5, m0 +// GFX1250: v_log_bf16_e32 v5, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e] + +v_log_bf16 v5, exec_lo +// GFX1250: v_log_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e] + +v_log_bf16 v5, exec_hi +// GFX1250: v_log_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e] + +v_log_bf16 v5, null +// GFX1250: v_log_bf16_e32 v5, null ; encoding: [0x7c,0xf8,0x0a,0x7e] + +v_log_bf16 v5, -1 +// GFX1250: v_log_bf16_e32 v5, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e] + +v_log_bf16 v5, 0.5 +// GFX1250: v_log_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e] + +v_log_bf16 v5, src_scc +// GFX1250: v_log_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e] + +v_log_bf16 v127, 0x8000 +// GFX1250: v_log_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00] + v_cvt_f32_bf16 v5, v1 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index 1d90f3fe345a5..9dd11e6249b27 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -220,6 +220,54 @@ v_rsq_bf16 v127, 0x8000 v_rsq_bf16 v5.h, v1.h // GFX1250: v_rsq_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf7,0x0a,0x7f] +v_log_bf16 v5, v1 +// 
GFX1250: v_log_bf16_e32 v5, v1 ; encoding: [0x01,0xf9,0x0a,0x7e] + +v_log_bf16 v5, v127 +// GFX1250: v_log_bf16_e32 v5, v127 ; encoding: [0x7f,0xf9,0x0a,0x7e] + +v_log_bf16 v5, s1 +// GFX1250: v_log_bf16_e32 v5, s1 ; encoding: [0x01,0xf8,0x0a,0x7e] + +v_log_bf16 v5, s105 +// GFX1250: v_log_bf16_e32 v5, s105 ; encoding: [0x69,0xf8,0x0a,0x7e] + +v_log_bf16 v5, vcc_lo +// GFX1250: v_log_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e] + +v_log_bf16 v5, vcc_hi +// GFX1250: v_log_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e] + +v_log_bf16 v5, ttmp15 +// GFX1250: v_log_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e] + +v_log_bf16 v5, m0 +// GFX1250: v_log_bf16_e32 v5, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e] + +v_log_bf16 v5, exec_lo +// GFX1250: v_log_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e] + +v_log_bf16 v5, exec_hi +// GFX1250: v_log_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e] + +v_log_bf16 v5, null +// GFX1250: v_log_bf16_e32 v5, null ; encoding: [0x7c,0xf8,0x0a,0x7e] + +v_log_bf16 v5, -1 +// GFX1250: v_log_bf16_e32 v5, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e] + +v_log_bf16 v5, 0.5 +// GFX1250: v_log_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e] + +v_log_bf16 v5, src_scc +// GFX1250: v_log_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e] + +v_log_bf16 v127, 0x8000 +// GFX1250: v_log_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00] + +v_log_bf16 v5.h, v1.h +// GFX1250: v_log_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf9,0x0a,0x7f] + v_cvt_f32_bf16 v5, v1 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s index dd49e49e4b20b..3882e43b5daf4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s @@ -226,6 +226,62 @@ v_rsq_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi 
// GFX1250: v_rsq_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf6,0xfe,0x7e,0x7f,0x6f,0x35,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_log_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_log_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_mirror +// GFX1250: v_log_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_half_mirror +// GFX1250: v_log_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shl:1 +// GFX1250: v_log_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shl:15 +// GFX1250: v_log_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shr:1 +// GFX1250: v_log_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shr:15 +// GFX1250: v_log_bf16_dpp v5, v1 row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_ror:1 +// GFX1250: v_log_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_ror:15 +// GFX1250: v_log_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_log_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_log_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_log_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_log_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU 
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s index 3415e76188e78..2f849b15edee9 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s @@ -242,6 +242,66 @@ v_rsq_bf16 v5.h, v1.h quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_log_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_log_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_mirror +// GFX1250: v_log_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_half_mirror +// GFX1250: v_log_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shl:1 +// GFX1250: v_log_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shl:15 +// GFX1250: v_log_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 
row_shr:1 +// GFX1250: v_log_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_shr:15 +// GFX1250: v_log_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_ror:1 +// GFX1250: v_log_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_ror:15 +// GFX1250: v_log_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_log_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_log_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_log_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_log_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: 
error: instruction not supported on this GPU + +v_log_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s index 5cce927831f12..85cf08bdb3a31 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -50,6 +50,18 @@ v_rsq_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_rsq_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf6,0xfe,0x7e,0x7f,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_log_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_log_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s index 5ba421bf014ac..d9b320ac6c094 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -66,6 +66,22 @@ v_rsq_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7f,0x81,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_log_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_log_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 31daff336fd48..0d4de4c8c877a 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ 
b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -262,6 +262,51 @@ v_rsq_bf16_e64 v5, src_scc mul:4 v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 // GFX1250: v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfb,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_log_bf16_e64 v5, v1 +// GFX1250: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] + +v_log_bf16_e64 v5, v255 +// GFX1250: v_log_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00] + +v_log_bf16_e64 v5, s1 +// GFX1250: v_log_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00] + +v_log_bf16_e64 v5, s105 +// GFX1250: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] + +v_log_bf16_e64 v5, vcc_lo +// GFX1250: v_log_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00] + +v_log_bf16_e64 v5, vcc_hi +// GFX1250: v_log_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00] + +v_log_bf16_e64 v5, ttmp15 +// GFX1250: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] + +v_log_bf16_e64 v5, m0 +// GFX1250: v_log_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00] + +v_log_bf16_e64 v5, exec_lo +// GFX1250: v_log_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00] + +v_log_bf16_e64 v5, exec_hi +// GFX1250: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] + +v_log_bf16_e64 v5, null +// GFX1250: v_log_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00] + +v_log_bf16_e64 v5, -1 +// GFX1250: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] + +v_log_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] + +v_log_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] + +v_log_bf16_e64 v255, 
-|0x8000| clamp div:2 +// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index d270a34a30275..8bf5d242660b6 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -274,6 +274,54 @@ v_rsq_bf16_e64 v255, -|0x8000| clamp div:2 v_rsq_bf16 v5.h, v128.h // GFX1250: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] +v_log_bf16_e64 v5, v1 +// GFX1250: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] + +v_log_bf16_e64 v5, v255 +// GFX1250: v_log_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00] + +v_log_bf16_e64 v5, s1 +// GFX1250: v_log_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00] + +v_log_bf16_e64 v5, s105 +// GFX1250: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] + +v_log_bf16_e64 v5, vcc_lo +// GFX1250: v_log_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00] + +v_log_bf16_e64 v5, vcc_hi +// GFX1250: v_log_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00] + +v_log_bf16_e64 v5, ttmp15 +// GFX1250: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] + +v_log_bf16_e64 v5, m0 +// GFX1250: v_log_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00] + +v_log_bf16_e64 v5, exec_lo +// GFX1250: v_log_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00] + +v_log_bf16_e64 v5, exec_hi +// GFX1250: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] + +v_log_bf16_e64 v5, null +// GFX1250: v_log_bf16_e64 v5, null ; encoding: 
[0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00] + +v_log_bf16_e64 v5, -1 +// GFX1250: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] + +v_log_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] + +v_log_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] + +v_log_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +v_log_bf16 v5.h, v128.h +// GFX1250: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] + v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index d5b12002cb5ba..4231fcf7c5e92 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -226,6 +226,62 @@ v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask // GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported 
on this GPU + +v_log_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s 
index 70961a901ffb7..1a094e285e730 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -242,6 +242,66 @@ v_rsq_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; 
encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index c8469f4188738..f6a2103ed9077 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -66,6 +66,22 @@ v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this 
GPU + +v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index f33ee9be04f82..5a1b1414dda37 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -82,6 +82,26 @@ v_rsq_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_log_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index 35b4ebc6f87f8..c318dd7fc4ee0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -2,6 +2,9 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck 
-check-prefixes=GFX1250,GFX1250-FAKE16 %s +0xff,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf +# GFX1250: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] + 0xc1,0x3a,0x08,0x7e # GFX1250: v_mov_b64_e32 v[4:5], -1 ; encoding: [0xc1,0x3a,0x08,0x7e] @@ -278,6 +281,69 @@ 0x81,0xf7,0x0a,0x7f # GFX1250-REAL16: v_rsq_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf7,0x0a,0x7f] +0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00] + +0xc1,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e] + +0xf0,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e] + +0x7f,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e] + +0x7e,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e] + +0x7d,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e] + +0x7c,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, null ; encoding: [0x7c,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, null ; encoding: [0x7c,0xf8,0x0a,0x7e] + +0x01,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, s1 ; encoding: [0x01,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, s1 ; encoding: [0x01,0xf8,0x0a,0x7e] + +0x69,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, s105 ; encoding: 
[0x69,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, s105 ; encoding: [0x69,0xf8,0x0a,0x7e] + +0xfd,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e] + +0x7b,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e] + +0x01,0xf9,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xf9,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, v1 ; encoding: [0x01,0xf9,0x0a,0x7e] + +0x7f,0xf9,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xf9,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, v127 ; encoding: [0x7f,0xf9,0x0a,0x7e] + +0x6b,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e] + +0x6a,0xf8,0x0a,0x7e +# GFX1250-REAL16: v_log_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e] +# GFX1250-FAKE16: v_log_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e] + +0x81,0xf9,0x0a,0x7f +# GFX1250-REAL16: v_log_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf9,0x0a,0x7f] + 0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00 # GFX1250: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt index 0f98bced09d36..22ed09e957de7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -238,6 +238,65 @@ 0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff # GFX1250-REAL16: v_rsq_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf6,0x0a,0x7f,0x81,0x1b,0x00,0xff] +0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# 
GFX1250-REAL16: v_log_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_log_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: 
v_log_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xf8,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xf8,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_log_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7f,0x81,0x1b,0x00,0xff] + 0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30 # GFX1250: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index 1be1451c0f3ed..d8458e8808b39 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -65,6 +65,21 @@ 0xe9,0xf6,0x0a,0x7f,0x81,0x77,0x39,0x05 # GFX1250-REAL16: v_rsq_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf6,0x0a,0x7f,0x81,0x77,0x39,0x05] +0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_log_bf16_dpp v5.l, v1.l 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_log_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xf8,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xf8,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_log_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7f,0x81,0x77,0x39,0x05] + 0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 5285033fb34b9..d1a7158ce582e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -258,6 +258,70 @@ # GFX1250-REAL16: v_rsq_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfb,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfb,0xd5,0x80,0x01,0x00,0x00] +0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfc,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_log_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfc,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, 
exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfc,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfc,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfc,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_log_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfc,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfc,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] +# 
GFX1250-FAKE16: v_log_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfc,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfc,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfc,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfc,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfc,0xd5,0x80,0x01,0x00,0x00] + 0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index d546ddff9f28c..56f65d0711664 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -62,6 +62,66 @@ # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfb,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# 
GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + 
+0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: 
v_log_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + 0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 
row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index ae5d331b2eea0..9ff9e54c1b40c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -22,6 +22,26 @@ # GFX1250-REAL16: v_rsq_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfb,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfc,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + 0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] From 0110168f6aa5c8a8d02ffd9e62c7929ce6d24d26 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 17 Jul 2025 08:29:09 +0900 Subject: [PATCH 113/813] RuntimeLibcalls: Fix calling conv of win32 div libcalls (#149098) There's probably an existing test this should be added to, but our test coverage is really bad that this wasn't caught by one. 
--- llvm/include/llvm/IR/RuntimeLibcalls.td | 2 +- .../CodeGen/X86/win32-int-runtime-libcalls.ll | 113 ++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index 11926d4128fcf..f0297cd1a0873 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -2129,7 +2129,7 @@ defvar X86CommonLibcalls = ); defvar Windows32DivRemMulCalls = - LibcallImpls<(add WindowsDivRemMulLibcalls), + LibcallsWithCC<(add WindowsDivRemMulLibcalls), X86_STDCALL, RuntimeLibcallPredicate<"TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment()">>; def X86_32SystemLibrary diff --git a/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll new file mode 100644 index 0000000000000..5ac90a0af2e57 --- /dev/null +++ b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll @@ -0,0 +1,113 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck -check-prefix=CHECK32 %s +; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck -check-prefix=CHECK64 %s + +define i64 @test_sdiv_i64(i64 %a, i64 %b) { +; CHECK32-LABEL: test_sdiv_i64: +; CHECK32: # %bb.0: +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: calll __alldiv +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: test_sdiv_i64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: movq %rdx, %r8 +; CHECK64-NEXT: movq %rcx, %rax +; CHECK64-NEXT: cqto +; CHECK64-NEXT: idivq %r8 +; CHECK64-NEXT: retq + %ret = sdiv i64 %a, %b + ret i64 %ret +} + +define i64 @test_srem_i64(i64 %a, i64 %b) { +; CHECK32-LABEL: test_srem_i64: +; CHECK32: # %bb.0: +; CHECK32-NEXT: pushl 
{{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: calll __allrem +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: test_srem_i64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: movq %rdx, %r8 +; CHECK64-NEXT: movq %rcx, %rax +; CHECK64-NEXT: cqto +; CHECK64-NEXT: idivq %r8 +; CHECK64-NEXT: movq %rdx, %rax +; CHECK64-NEXT: retq + %ret = srem i64 %a, %b + ret i64 %ret +} + +define i64 @test_udiv_i64(i64 %a, i64 %b) { +; CHECK32-LABEL: test_udiv_i64: +; CHECK32: # %bb.0: +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: calll __aulldiv +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: test_udiv_i64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: movq %rdx, %r8 +; CHECK64-NEXT: movq %rcx, %rax +; CHECK64-NEXT: xorl %edx, %edx +; CHECK64-NEXT: divq %r8 +; CHECK64-NEXT: retq + %ret = udiv i64 %a, %b + ret i64 %ret +} + +define i64 @test_urem_i64(i64 %a, i64 %b) { +; CHECK32-LABEL: test_urem_i64: +; CHECK32: # %bb.0: +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: pushl {{[0-9]+}}(%esp) +; CHECK32-NEXT: calll __aullrem +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: test_urem_i64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: movq %rdx, %r8 +; CHECK64-NEXT: movq %rcx, %rax +; CHECK64-NEXT: xorl %edx, %edx +; CHECK64-NEXT: divq %r8 +; CHECK64-NEXT: movq %rdx, %rax +; CHECK64-NEXT: retq + %ret = urem i64 %a, %b + ret i64 %ret +} + +define i64 @test_mul_i64(i64 %a, i64 %b) { +; CHECK32-LABEL: test_mul_i64: +; CHECK32: # %bb.0: +; CHECK32-NEXT: pushl %esi +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK32-NEXT: movl %ecx, %eax +; CHECK32-NEXT: mull %esi +; CHECK32-NEXT: imull {{[0-9]+}}(%esp), %ecx +; CHECK32-NEXT: addl %ecx, %edx +; CHECK32-NEXT: imull 
{{[0-9]+}}(%esp), %esi +; CHECK32-NEXT: addl %esi, %edx +; CHECK32-NEXT: popl %esi +; CHECK32-NEXT: retl +; +; CHECK64-LABEL: test_mul_i64: +; CHECK64: # %bb.0: +; CHECK64-NEXT: movq %rcx, %rax +; CHECK64-NEXT: imulq %rdx, %rax +; CHECK64-NEXT: retq + %ret = mul i64 %a, %b + ret i64 %ret +} From 7e0fde0c2f6b0b9d727ce9196956b36e91961ac4 Mon Sep 17 00:00:00 2001 From: Sirraide Date: Thu, 17 Jul 2025 01:30:50 +0200 Subject: [PATCH 114/813] [Clang] Reintroduce obsolete symbols in libclang.map (#149190) This is a follow-up to #149079. Seems like we forgot about the fact that the symbols also need to be in `libclang.map`. --- clang/tools/libclang/libclang.map | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map index 49c472e3833fd..3d9d2e268a611 100644 --- a/clang/tools/libclang/libclang.map +++ b/clang/tools/libclang/libclang.map @@ -327,6 +327,8 @@ LLVM_13 { clang_getRange; clang_getRangeEnd; clang_getRangeStart; + clang_getRemappings; + clang_getRemappingsFromFileList; clang_getResultType; clang_getSkippedRanges; clang_getSpecializedCursorTemplate; @@ -387,6 +389,9 @@ LLVM_13 { clang_parseTranslationUnit; clang_parseTranslationUnit2; clang_parseTranslationUnit2FullArgv; + clang_remap_dispose; + clang_remap_getFilenames; + clang_remap_getNumFiles; clang_reparseTranslationUnit; clang_saveTranslationUnit; clang_sortCodeCompletionResults; From 86c63e6bd66f9db9c7320155da7a2042407b5a1a Mon Sep 17 00:00:00 2001 From: royitaqi Date: Wed, 16 Jul 2025 16:32:40 -0700 Subject: [PATCH 115/813] [lldb] [cosmetic] Update help message of `(lldb) b` (#149114) `(lldb) b` can be used in two different ways: 1. Running `b` without arguments, it lists all existing breakpoints. 2. Running `b` with arguments, it adds breakpoints. However, the help message doesn't mention the first use case. This patch adds help message to mention it. 
**Without patch**: ``` (lldb) help b Set a breakpoint using one of several shorthand formats. Expects 'raw' input (see 'help raw-input'.) Syntax: _regexp-break :: main.c:12:21 // Break at line 12 and column 21 of main.c ... ``` **With patch**: ``` (lldb) help b Set a breakpoint using one of several shorthand formats, or list the existing breakpoints if no arguments are provided. Expects 'raw' input (see 'help raw-input'.) Syntax: _regexp-break :: main.c:12:21 // Break at line 12 and column 21 of main.c ... _regexp-break // List the existing breakpoints ``` --- lldb/source/Interpreter/CommandInterpreter.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index 00c3472444d2e..da545f18d9b15 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -616,7 +616,8 @@ void CommandInterpreter::LoadCommandDictionary() { std::unique_ptr break_regex_cmd_up( new CommandObjectRegexCommand( *this, "_regexp-break", - "Set a breakpoint using one of several shorthand formats.", + "Set a breakpoint using one of several shorthand formats, or list " + "the existing breakpoints if no arguments are provided.", "\n" "_regexp-break ::\n" " main.c:12:21 // Break at line 12 and column " @@ -643,7 +644,10 @@ void CommandInterpreter::LoadCommandDictionary() { " /break here/ // Break on source lines in " "current file\n" " // containing text 'break " - "here'.\n", + "here'.\n" + "_regexp-break\n" + " // List the existing " + "breakpoints\n", lldb::eSymbolCompletion | lldb::eSourceFileCompletion, false)); if (break_regex_cmd_up) { From 26b0b279deca7cd660efcae5c17bd27a15ead36d Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 16 Jul 2025 16:36:41 -0700 Subject: [PATCH 116/813] [AMDGPU] Select flat GVS stores on gfx1250 (#149203) --- llvm/lib/Target/AMDGPU/FLATInstructions.td | 63 +- 
llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll | 1118 ++++++++++++++++++ 2 files changed, 1159 insertions(+), 22 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 3965b5dd8c5c3..74632c71f0f95 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1275,8 +1275,8 @@ class FlatLoadSaddrPat (inst $saddr, $voffset, $offset, 0) >; -class GlobalStoreSaddrPat : GCNPat < +class FlatStoreSaddrPat : GCNPat < (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)), (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset) >; @@ -1485,7 +1485,7 @@ multiclass GlobalFLATStorePats(!cast(inst)#"_SADDR"), node, vt> { + def : FlatStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { let AddedComplexity = 11; } } @@ -1495,7 +1495,7 @@ multiclass GlobalFLATStorePats_D16_t16(inst#"_SADDR_t16"), node, vt> { + def : FlatStoreSaddrPat(inst#"_SADDR_t16"), node, vt> { let AddedComplexity = 11; } } @@ -1655,6 +1655,24 @@ multiclass FlatLoadPats_D16_t16 { + def : FlatStorePat ; + + def : FlatStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + +multiclass FlatStorePats_t16 { + def : FlatStorePat (!cast(inst)#"_t16"), node, vt>; + + def : FlatStoreSaddrPat(!cast(inst)#"_SADDR_t16"), node, vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } +} + let OtherPredicates = [HasFlatAddressSpace] in { defm : FlatLoadPats ; @@ -1682,10 +1700,10 @@ let True16Predicate = p in { defm : FlatLoadPats ; defm : FlatLoadPats ; defm : FlatLoadPats ; - def : FlatStorePat ; - def : FlatStorePat ; - def : FlatStorePat ; - def : FlatStorePat ; + defm : FlatStorePats ; + defm : FlatStorePats ; + defm : FlatStorePats ; + defm : FlatStorePats ; } let OtherPredicates = [D16PreservesUnusedBits, 
HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts in { @@ -1697,8 +1715,8 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi defm : FlatLoadPats_D16_t16; defm : FlatLoadPats_D16_t16; defm : FlatLoadPats_D16_t16; - def : FlatStorePat ; - def : FlatStorePat ; + defm : FlatStorePats_t16 ; + defm : FlatStorePats_t16 ; def : FlatStorePat ; def : FlatStorePat ; } // End let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predicate = UseRealTrue16Insts @@ -1706,30 +1724,31 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi defm : FlatLoadPats ; defm : FlatLoadPats ; -def : FlatStorePat ; -def : FlatStorePat ; +defm : FlatStorePats ; +defm : FlatStorePats ; foreach vt = Reg32Types.types in { defm : FlatLoadPats ; -def : FlatStorePat ; +defm : FlatStorePats ; } foreach vt = VReg_64.RegTypes in { -def : FlatStorePat ; +defm : FlatStorePats ; def : FlatLoadPat ; } -def : FlatStorePat ; +defm : FlatStorePats ; foreach vt = VReg_128.RegTypes in { defm : FlatLoadPats ; -def : FlatStorePat ; +defm : FlatStorePats ; } -def : FlatStorePat ; -def : FlatStorePat ; -def : FlatStorePat ; -def : FlatStorePat ; +defm : FlatStorePats ; +defm : FlatStorePats ; +defm : FlatStorePats ; +defm : FlatStorePats ; + foreach as = [ "flat", "global" ] in { defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>; @@ -1780,8 +1799,8 @@ let SubtargetPredicate = isGFX12Plus in { } let OtherPredicates = [HasD16LoadStore] in { -def : FlatStorePat ; -def : FlatStorePat ; +defm : FlatStorePats ; +defm : FlatStorePats ; } let OtherPredicates = [D16PreservesUnusedBits] in { diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll new file mode 100644 index 0000000000000..32888d2acf1cd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll @@ -0,0 +1,1118 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: 
llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +; Test using saddr addressing mode of flat_*store_* instructions. + +define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { +; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] +; GFX1250-NEXT: s_endpgm + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i8 %data, ptr %gep0 + ret void +} + +; Maximum positive offset on gfx10 +define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_2047(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { +; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr_offset_2047: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:2047 +; GFX1250-NEXT: s_endpgm + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2047 + store i8 %data, ptr %gep1 + ret void +} + +; Maximum negative offset on gfx10 +define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) { +; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr_offset_neg2048: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b32 v0, v[0:1] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: flat_store_b8 v0, v2, s[2:3] offset:-2048 +; GFX1250-NEXT: s_endpgm + %voffset = load i32, ptr %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr 
%sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 + store i8 %data, ptr %gep1 + ret void +} + +; -------------------------------------------------------------------------------- +; Uniformity edge cases +; -------------------------------------------------------------------------------- + +@ptr.in.lds = internal addrspace(3) global ptr undef + +; Base pointer is uniform, but also in VGPRs +define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_uniform_ptr_in_vgprs: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_store_b8 v0, v1, s[0:1] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_store_b8 v[2:3], v1 +; GFX1250-GISEL-NEXT: s_endpgm + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i8 %data, ptr %gep0 + ret void +} + +; Base pointer is uniform, but also in VGPRs, with imm offset +define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset, i8 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: 
v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_store_b8 v0, v1, s[0:1] offset:-120 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_store_b8 v[2:3], v1 offset:-120 +; GFX1250-GISEL-NEXT: s_endpgm + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -120 + store i8 %data, ptr %gep1 + ret void +} + +; -------------------------------------------------------------------------------- +; Stress various type stores +; -------------------------------------------------------------------------------- + +define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, i16 %data) { +; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i16 %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i16 %data) { +; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store i16 %data, ptr 
%gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, half %data) { +; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store half %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, half %data) { +; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b16 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store half %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i32 %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store i32 %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, float %data) { +; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: 
flat_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store float %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, float %data) { +; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store float %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) { +; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store ptr addrspace(3) %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) { +; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store ptr addrspace(3) %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; 
GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store i64 %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store i64 %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr(ptr inreg %sbase, i32 %voffset, double %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store double %data, ptr %gep0 + ret void +} + +define 
amdgpu_ps void @flat_store_saddr_f64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, double %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store double %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x i32> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; 
+; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x i32> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x float> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 
%zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x float> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x i16> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x i16> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: 
v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x half> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x half> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 
%voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store ptr %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store ptr %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <3 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3i32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <3 x i32> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <3 x i32> %data) { +; GFX1250-SDAG-LABEL: 
flat_store_saddr_v3i32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <3 x i32> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <3 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3f32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <3 x float> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <3 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v3f32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <3 x float> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <6 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6i16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <6 x i16> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <6 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; 
%bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <6 x i16> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <6 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6f16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <6 x half> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <6 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b96 v0, v[2:4], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v6, v3 +; GFX1250-GISEL-NEXT: flat_store_b96 v0, v[4:6], s[2:3] offset:-128 +; 
GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <6 x half> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4i32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x i32> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x i32> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = 
getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x i32> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4f32_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x float> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x float> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr 
inbounds i8, ptr %gep0, i64 -128 + store <4 x float> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i64> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2i64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x i64> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i64> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x i64> %data, ptr %gep1 + ret void +} + 
+define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x double> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2f64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x double> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x double> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x double> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr(ptr inreg %sbase, 
i32 %voffset, <8 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v8i16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <8 x i16> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <8 x i16> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <8 x i16> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <8 x half> %data) { +; GFX1250-SDAG-LABEL: 
flat_store_saddr_v8f16_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <8 x half> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <8 x half> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <8 x half> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x ptr> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2p1_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: 
v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <2 x ptr> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x ptr> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <2 x ptr> %data, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4p3_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: 
v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store <4 x ptr addrspace(3)> %data, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) { +; GFX1250-SDAG-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: flat_store_b128 v0, v[2:5], s[2:3] offset:-128 +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4 +; GFX1250-GISEL-NEXT: flat_store_b128 v0, v[6:9], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store <4 x ptr addrspace(3)> %data, ptr %gep1 + ret void +} + +; -------------------------------------------------------------------------------- +; Atomic store +; -------------------------------------------------------------------------------- + +define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: 
atomic_flat_store_saddr_i32_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store atomic i32 %data, ptr %gep0 seq_cst, align 4 + ret void +} + +define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store atomic i32 %data, ptr %gep1 seq_cst, align 4 + ret void +} + +define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: atomic_flat_store_saddr_i64_zext_vgpr: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: atomic_flat_store_saddr_i64_zext_vgpr: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + store atomic i64 %data, ptr %gep0 
seq_cst, align 8 + ret void +} + +define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: flat_store_b64 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: flat_store_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + store atomic i64 %data, ptr %gep1 seq_cst, align 8 + ret void +} + +; -------------------------------------------------------------------------------- +; D16 HI store (hi 16) +; -------------------------------------------------------------------------------- + +define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { +; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_d16_hi_b16 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %data.hi = extractelement <2 x i16> %data, i32 1 + store i16 %data.hi, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { +; GFX1250-LABEL: 
flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_d16_hi_b16 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %data.hi = extractelement <2 x i16> %data, i32 1 + store i16 %data.hi, ptr %gep1 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { +; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_d16_hi_b8 v0, v1, s[2:3] +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %data.hi = extractelement <2 x i16> %data, i32 1 + %data.hi.trunc = trunc i16 %data.hi to i8 + store i8 %data.hi.trunc, ptr %gep0 + ret void +} + +define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) { +; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_store_d16_hi_b8 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %data.hi = extractelement <2 x i16> %data, i32 1 + %data.hi.trunc = trunc i16 %data.hi to i8 + store i8 %data.hi.trunc, ptr %gep1 + ret void +} From 81eb7defa23dcf48a8e51391543eb210df232440 Mon Sep 17 00:00:00 2001 From: Cristian Assaiante Date: Thu, 17 Jul 2025 01:51:58 +0200 Subject: [PATCH 117/813] [OptBisect][IR] Adding a new OptPassGate for disabling passes via name (#145059) This commit adds a new pass gate that allows selective disabling of one or more passes via the clang command line using the `-opt-disable` option. 
Passes to be disabled should be specified as a comma-separated list of their names. The implementation resides in the same file as the bisection tool. The `getGlobalPassGate()` function returns the currently enabled gate. Example: `-opt-disable="PassA,PassB"` Pass names are matched using case-insensitive comparisons. However, note that special characters, including spaces, must be included exactly as they appear in the pass names. Additionally, a `-opt-disable-enable-verbosity` flag has been introduced to enable verbose output when this functionality is in use. When enabled, it prints the status of all passes (either running or NOT running), similar to the default behavior of `-opt-bisect-limit`. This flag is disabled by default, which is the opposite of the `-opt-bisect-verbose` flag (which defaults to enabled). To validate this functionality, a test file has also been provided. It reuses the same infrastructure as the opt-bisect test, but disables three specific passes and checks the output to ensure the expected behavior. 
--------- Co-authored-by: Nikita Popov --- .../CodeGen/new-pass-manager-opt-bisect.c | 2 +- llvm/include/llvm/IR/OptBisect.h | 35 ++++++- llvm/include/llvm/Pass.h | 4 + llvm/lib/IR/OptBisect.cpp | 42 ++++++++- llvm/lib/IR/Pass.cpp | 26 +++++- llvm/lib/Passes/StandardInstrumentations.cpp | 10 +- .../test/Other/opt-bisect-new-pass-manager.ll | 76 ++++++++-------- llvm/test/Other/opt-disable.ll | 91 +++++++++++++++++++ 8 files changed, 237 insertions(+), 49 deletions(-) create mode 100644 llvm/test/Other/opt-disable.ll diff --git a/clang/test/CodeGen/new-pass-manager-opt-bisect.c b/clang/test/CodeGen/new-pass-manager-opt-bisect.c index 91a0adf252bb5..5d5fdd473422a 100644 --- a/clang/test/CodeGen/new-pass-manager-opt-bisect.c +++ b/clang/test/CodeGen/new-pass-manager-opt-bisect.c @@ -7,6 +7,6 @@ // CHECK: BISECT: running pass (1) // CHECK-NOT: BISECT: running pass (1) // Make sure that legacy pass manager is running -// CHECK: Instruction Selection +// CHECK: -isel int func(int a) { return a; } diff --git a/llvm/include/llvm/IR/OptBisect.h b/llvm/include/llvm/IR/OptBisect.h index ea3c1defeb100..d813ae933d65e 100644 --- a/llvm/include/llvm/IR/OptBisect.h +++ b/llvm/include/llvm/IR/OptBisect.h @@ -15,6 +15,7 @@ #define LLVM_IR_OPTBISECT_H #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Support/Compiler.h" #include @@ -82,8 +83,38 @@ class LLVM_ABI OptBisect : public OptPassGate { mutable int LastBisectNum = 0; }; -/// Singleton instance of the OptBisect class, so multiple pass managers don't -/// need to coordinate their uses of OptBisect. +/// This class implements a mechanism to disable passes and individual +/// optimizations at compile time based on a command line option +/// (-opt-disable) in order to study how single transformations, or +/// combinations thereof, affect the IR. +class LLVM_ABI OptDisable : public OptPassGate { +public: + /// Checks the pass name to determine if the specified pass should run. 
+  ///
+  /// It returns true if the pass should run, i.e. if its name was
+  /// not provided via command line.
+  /// If -opt-disable-enable-verbosity is given, the method prints the
+  /// name of the pass, and whether or not the pass will be executed.
+  ///
+  /// Most passes should not call this routine directly. Instead, it is called
+  /// through helper routines provided by the base classes of the pass. For
+  /// instance, function passes should call FunctionPass::skipFunction().
+  bool shouldRunPass(StringRef PassName,
+                     StringRef IRDescription) const override;
+
+  /// Parses the command line argument to extract the names of the passes
+  /// to be disabled. Multiple pass names can be provided with comma separation.
+  void setDisabled(StringRef Pass);
+
+  /// isEnabled() should return true before calling shouldRunPass().
+  bool isEnabled() const override { return !DisabledPasses.empty(); }
+
+private:
+  StringSet<> DisabledPasses = {};
+};
+
+/// Singleton instance of the OptPassGate class, so multiple pass managers don't
+/// need to coordinate their uses of OptBisect and OptDisable.
 LLVM_ABI OptPassGate &getGlobalPassGate();
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/Pass.h b/llvm/include/llvm/Pass.h
index 2ecd47dd10bde..f3962c3556c95 100644
--- a/llvm/include/llvm/Pass.h
+++ b/llvm/include/llvm/Pass.h
@@ -114,6 +114,10 @@ class LLVM_ABI Pass {
   /// Registration templates, but can be overloaded directly.
   virtual StringRef getPassName() const;
 
+  /// Return a nice clean name for a pass
+  /// corresponding to that used to enable the pass in opt.
+  StringRef getPassArgument() const;
+
   /// getPassID - Return the PassID number that corresponds to this pass.
AnalysisID getPassID() const { return PassID; diff --git a/llvm/lib/IR/OptBisect.cpp b/llvm/lib/IR/OptBisect.cpp index 427e8b78fd03f..29ca268408265 100644 --- a/llvm/lib/IR/OptBisect.cpp +++ b/llvm/lib/IR/OptBisect.cpp @@ -25,6 +25,11 @@ static OptBisect &getOptBisector() { return OptBisector; } +static OptDisable &getOptDisabler() { + static OptDisable OptDisabler; + return OptDisabler; +} + static cl::opt OptBisectLimit("opt-bisect-limit", cl::Hidden, cl::init(OptBisect::Disabled), cl::Optional, cl::cb([](int Limit) { @@ -37,6 +42,18 @@ static cl::opt OptBisectVerbose( cl::desc("Show verbose output when opt-bisect-limit is set"), cl::Hidden, cl::init(true), cl::Optional); +static cl::list OptDisablePasses( + "opt-disable", cl::Hidden, cl::CommaSeparated, cl::Optional, + cl::cb([](const std::string &Pass) { + getOptDisabler().setDisabled(Pass); + }), + cl::desc("Optimization pass(es) to disable (comma-separated list)")); + +static cl::opt + OptDisableVerbose("opt-disable-enable-verbosity", + cl::desc("Show verbose output when opt-disable is set"), + cl::Hidden, cl::init(false), cl::Optional); + static void printPassMessage(StringRef Name, int PassNum, StringRef TargetDesc, bool Running) { StringRef Status = Running ? "" : "NOT "; @@ -55,4 +72,27 @@ bool OptBisect::shouldRunPass(StringRef PassName, return ShouldRun; } -OptPassGate &llvm::getGlobalPassGate() { return getOptBisector(); } +static void printDisablePassMessage(const StringRef &Name, StringRef TargetDesc, + bool Running) { + StringRef Status = Running ? 
"" : "NOT "; + dbgs() << "OptDisable: " << Status << "running pass " << Name << " on " + << TargetDesc << "\n"; +} + +void OptDisable::setDisabled(StringRef Pass) { DisabledPasses.insert(Pass); } + +bool OptDisable::shouldRunPass(StringRef PassName, + StringRef IRDescription) const { + assert(isEnabled()); + + const bool ShouldRun = !DisabledPasses.contains(PassName); + if (OptDisableVerbose) + printDisablePassMessage(PassName, IRDescription, ShouldRun); + return ShouldRun; +} + +OptPassGate &llvm::getGlobalPassGate() { + if (getOptDisabler().isEnabled()) + return getOptDisabler(); + return getOptBisector(); +} diff --git a/llvm/lib/IR/Pass.cpp b/llvm/lib/IR/Pass.cpp index 2c5ef7193b463..dec7c9a9ab18c 100644 --- a/llvm/lib/IR/Pass.cpp +++ b/llvm/lib/IR/Pass.cpp @@ -62,8 +62,12 @@ static std::string getDescription(const Module &M) { bool ModulePass::skipModule(const Module &M) const { const OptPassGate &Gate = M.getContext().getOptPassGate(); - return Gate.isEnabled() && - !Gate.shouldRunPass(this->getPassName(), getDescription(M)); + + StringRef PassName = getPassArgument(); + if (PassName.empty()) + PassName = this->getPassName(); + + return Gate.isEnabled() && !Gate.shouldRunPass(PassName, getDescription(M)); } bool Pass::mustPreserveAnalysisID(char &AID) const { @@ -86,6 +90,16 @@ StringRef Pass::getPassName() const { return "Unnamed pass: implement Pass::getPassName()"; } +/// getPassArgument - Return a nice clean name for a pass +/// corresponding to that used to enable the pass in opt +StringRef Pass::getPassArgument() const { + AnalysisID AID = getPassID(); + const PassInfo *PI = Pass::lookupPassInfo(AID); + if (PI) + return PI->getPassArgument(); + return ""; +} + void Pass::preparePassManager(PMStack &) { // By default, don't do anything. 
} @@ -173,8 +187,12 @@ static std::string getDescription(const Function &F) { bool FunctionPass::skipFunction(const Function &F) const { OptPassGate &Gate = F.getContext().getOptPassGate(); - if (Gate.isEnabled() && - !Gate.shouldRunPass(this->getPassName(), getDescription(F))) + + StringRef PassName = getPassArgument(); + if (PassName.empty()) + PassName = this->getPassName(); + + if (Gate.isEnabled() && !Gate.shouldRunPass(PassName, getDescription(F))) return true; if (F.hasOptNone()) { diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index 0623e66772047..f165e85baf611 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -1078,9 +1078,13 @@ void OptPassGateInstrumentation::registerCallbacks( if (!PassGate.isEnabled()) return; - PIC.registerShouldRunOptionalPassCallback([this](StringRef PassName, Any IR) { - return this->shouldRun(PassName, IR); - }); + PIC.registerShouldRunOptionalPassCallback( + [this, &PIC](StringRef ClassName, Any IR) { + StringRef PassName = PIC.getPassNameForClassName(ClassName); + if (PassName.empty()) + return this->shouldRun(ClassName, IR); + return this->shouldRun(PassName, IR); + }); } raw_ostream &PrintPassInstrumentation::print() { diff --git a/llvm/test/Other/opt-bisect-new-pass-manager.ll b/llvm/test/Other/opt-bisect-new-pass-manager.ll index 01dad705ec362..8f8078d4d8409 100644 --- a/llvm/test/Other/opt-bisect-new-pass-manager.ll +++ b/llvm/test/Other/opt-bisect-new-pass-manager.ll @@ -11,84 +11,84 @@ ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=inferattrs -opt-bisect-limit=-1 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-MODULE-PASS -; CHECK-MODULE-PASS: BISECT: running pass (1) InferFunctionAttrsPass on [module] +; CHECK-MODULE-PASS: BISECT: running pass (1) inferattrs on [module] ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=inferattrs -opt-bisect-limit=0 %s 2>&1 \ ; RUN: | 
FileCheck %s --check-prefix=CHECK-LIMIT-MODULE-PASS -; CHECK-LIMIT-MODULE-PASS: BISECT: NOT running pass (1) InferFunctionAttrsPass on [module] +; CHECK-LIMIT-MODULE-PASS: BISECT: NOT running pass (1) inferattrs on [module] ; RUN: opt -disable-output -debug-pass-manager \ ; RUN: -passes=inferattrs -opt-bisect-limit=-1 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-REQUIRED-PASS -; CHECK-REQUIRED-PASS: BISECT: running pass (1) InferFunctionAttrsPass on [module] +; CHECK-REQUIRED-PASS: BISECT: running pass (1) inferattrs on [module] ; CHECK-REQUIRED-PASS-NOT: BISECT: {{.*}}VerifierPass ; CHECK-REQUIRED-PASS: Running pass: VerifierPass ; RUN: opt -disable-output -debug-pass-manager \ ; RUN: -passes=inferattrs -opt-bisect-limit=0 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-LIMIT-REQUIRED-PASS -; CHECK-LIMIT-REQUIRED-PASS: BISECT: NOT running pass (1) InferFunctionAttrsPass on [module] +; CHECK-LIMIT-REQUIRED-PASS: BISECT: NOT running pass (1) inferattrs on [module] ; CHECK-LIMIT-REQUIRED-PASS-NOT: BISECT: {{.*}}VerifierPass ; CHECK-LIMIT-REQUIRED-PASS: Running pass: VerifierPass ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=early-cse -opt-bisect-limit=-1 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-FUNCTION-PASS -; CHECK-FUNCTION-PASS: BISECT: running pass (1) EarlyCSEPass on f1 -; CHECK-FUNCTION-PASS: BISECT: running pass (2) EarlyCSEPass on f2 -; CHECK-FUNCTION-PASS: BISECT: running pass (3) EarlyCSEPass on f3 -; CHECK-FUNCTION-PASS: BISECT: running pass (4) EarlyCSEPass on f4 +; CHECK-FUNCTION-PASS: BISECT: running pass (1) early-cse on f1 +; CHECK-FUNCTION-PASS: BISECT: running pass (2) early-cse on f2 +; CHECK-FUNCTION-PASS: BISECT: running pass (3) early-cse on f3 +; CHECK-FUNCTION-PASS: BISECT: running pass (4) early-cse on f4 ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=early-cse -opt-bisect-limit=2 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-LIMIT-FUNCTION-PASS -; CHECK-LIMIT-FUNCTION-PASS: BISECT: 
running pass (1) EarlyCSEPass on f1 -; CHECK-LIMIT-FUNCTION-PASS: BISECT: running pass (2) EarlyCSEPass on f2 -; CHECK-LIMIT-FUNCTION-PASS: BISECT: NOT running pass (3) EarlyCSEPass on f3 -; CHECK-LIMIT-FUNCTION-PASS: BISECT: NOT running pass (4) EarlyCSEPass on f4 +; CHECK-LIMIT-FUNCTION-PASS: BISECT: running pass (1) early-cse on f1 +; CHECK-LIMIT-FUNCTION-PASS: BISECT: running pass (2) early-cse on f2 +; CHECK-LIMIT-FUNCTION-PASS: BISECT: NOT running pass (3) early-cse on f3 +; CHECK-LIMIT-FUNCTION-PASS: BISECT: NOT running pass (4) early-cse on f4 ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=function-attrs -opt-bisect-limit=-1 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-CGSCC-PASS -; CHECK-CGSCC-PASS: BISECT: running pass (1) PostOrderFunctionAttrsPass on (f1) -; CHECK-CGSCC-PASS: BISECT: running pass (2) PostOrderFunctionAttrsPass on (f2) -; CHECK-CGSCC-PASS: BISECT: running pass (3) PostOrderFunctionAttrsPass on (f3) -; CHECK-CGSCC-PASS: BISECT: running pass (4) PostOrderFunctionAttrsPass on (f4) +; CHECK-CGSCC-PASS: BISECT: running pass (1) function-attrs on (f1) +; CHECK-CGSCC-PASS: BISECT: running pass (2) function-attrs on (f2) +; CHECK-CGSCC-PASS: BISECT: running pass (3) function-attrs on (f3) +; CHECK-CGSCC-PASS: BISECT: running pass (4) function-attrs on (f4) ; RUN: opt -disable-output -disable-verify \ ; RUN: -passes=function-attrs -opt-bisect-limit=3 %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-LIMIT-CGSCC-PASS -; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (1) PostOrderFunctionAttrsPass on (f1) -; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (2) PostOrderFunctionAttrsPass on (f2) -; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (3) PostOrderFunctionAttrsPass on (f3) -; CHECK-LIMIT-CGSCC-PASS: BISECT: NOT running pass (4) PostOrderFunctionAttrsPass on (f4) +; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (1) function-attrs on (f1) +; CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (2) function-attrs on (f2) +; 
CHECK-LIMIT-CGSCC-PASS: BISECT: running pass (3) function-attrs on (f3) +; CHECK-LIMIT-CGSCC-PASS: BISECT: NOT running pass (4) function-attrs on (f4) ; RUN: opt -disable-output -disable-verify -opt-bisect-limit=-1 \ ; RUN: -passes='inferattrs,cgscc(function-attrs,function(early-cse))' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-MULTI-PASS -; CHECK-MULTI-PASS: BISECT: running pass (1) InferFunctionAttrsPass on [module] -; CHECK-MULTI-PASS: BISECT: running pass (2) PostOrderFunctionAttrsPass on (f1) -; CHECK-MULTI-PASS: BISECT: running pass (3) EarlyCSEPass on f1 -; CHECK-MULTI-PASS: BISECT: running pass (4) PostOrderFunctionAttrsPass on (f2) -; CHECK-MULTI-PASS: BISECT: running pass (5) EarlyCSEPass on f2 -; CHECK-MULTI-PASS: BISECT: running pass (6) PostOrderFunctionAttrsPass on (f3) -; CHECK-MULTI-PASS: BISECT: running pass (7) EarlyCSEPass on f3 -; CHECK-MULTI-PASS: BISECT: running pass (8) PostOrderFunctionAttrsPass on (f4) -; CHECK-MULTI-PASS: BISECT: running pass (9) EarlyCSEPass on f4 +; CHECK-MULTI-PASS: BISECT: running pass (1) inferattrs on [module] +; CHECK-MULTI-PASS: BISECT: running pass (2) function-attrs on (f1) +; CHECK-MULTI-PASS: BISECT: running pass (3) early-cse on f1 +; CHECK-MULTI-PASS: BISECT: running pass (4) function-attrs on (f2) +; CHECK-MULTI-PASS: BISECT: running pass (5) early-cse on f2 +; CHECK-MULTI-PASS: BISECT: running pass (6) function-attrs on (f3) +; CHECK-MULTI-PASS: BISECT: running pass (7) early-cse on f3 +; CHECK-MULTI-PASS: BISECT: running pass (8) function-attrs on (f4) +; CHECK-MULTI-PASS: BISECT: running pass (9) early-cse on f4 ; RUN: opt -disable-output -disable-verify -opt-bisect-limit=7 \ ; RUN: -passes='inferattrs,cgscc(function-attrs,function(early-cse))' %s 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-LIMIT-MULTI-PASS -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (1) InferFunctionAttrsPass on [module] -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (2) PostOrderFunctionAttrsPass on (f1) -; 
CHECK-LIMIT-MULTI-PASS: BISECT: running pass (3) EarlyCSEPass on f1 -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (4) PostOrderFunctionAttrsPass on (f2) -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (5) EarlyCSEPass on f2 -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (6) PostOrderFunctionAttrsPass on (f3) -; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (7) EarlyCSEPass on f3 -; CHECK-LIMIT-MULTI-PASS: BISECT: NOT running pass (8) PostOrderFunctionAttrsPass on (f4) -; CHECK-LIMIT-MULTI-PASS: BISECT: NOT running pass (9) EarlyCSEPass on f4 +; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (1) inferattrs on [module] +; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (2) function-attrs on (f1) +; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (3) early-cse on f1 +; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (4) function-attrs on (f2) +; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (5) early-cse on f2 +; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (6) function-attrs on (f3) +; CHECK-LIMIT-MULTI-PASS: BISECT: running pass (7) early-cse on f3 +; CHECK-LIMIT-MULTI-PASS: BISECT: NOT running pass (8) function-attrs on (f4) +; CHECK-LIMIT-MULTI-PASS: BISECT: NOT running pass (9) early-cse on f4 ; Make sure we don't skip writing the output to stdout. ; RUN: opt %s -opt-bisect-limit=0 -passes=early-cse | opt -S | FileCheck %s -check-prefix=CHECK-OUTPUT diff --git a/llvm/test/Other/opt-disable.ll b/llvm/test/Other/opt-disable.ll new file mode 100644 index 0000000000000..4506042215cbf --- /dev/null +++ b/llvm/test/Other/opt-disable.ll @@ -0,0 +1,91 @@ +; This test uses the same IR functions of the opt-bisect test +; but it checks the correctness of the -opt-disable flag. +; -opt-disable-enable-verbosity is required to have output. 
+ +; RUN: opt -disable-output -disable-verify \ +; RUN: -opt-disable-enable-verbosity \ +; RUN: -passes=inferattrs -opt-disable=inferattrs %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-MODULE-PASS +; CHECK-MODULE-PASS: OptDisable: NOT running pass inferattrs on [module] + +; RUN: opt -disable-output -disable-verify \ +; RUN: -opt-disable-enable-verbosity \ +; RUN: -passes=sroa -opt-disable=sroa %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-FUNCTION-PASS +; CHECK-FUNCTION-PASS: OptDisable: NOT running pass sroa on f1 +; CHECK-FUNCTION-PASS: OptDisable: NOT running pass sroa on f2 +; CHECK-FUNCTION-PASS: OptDisable: NOT running pass sroa on f3 +; CHECK-FUNCTION-PASS: OptDisable: NOT running pass sroa on f4 + +; RUN: opt -disable-output -disable-verify \ +; RUN: -opt-disable=inferattrs,function-attrs \ +; RUN: -opt-disable-enable-verbosity \ +; RUN: -passes='inferattrs,cgscc(function-attrs,function(early-cse))' %s 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-MULTI-PASS +; CHECK-MULTI-PASS: OptDisable: NOT running pass inferattrs on [module] +; CHECK-MULTI-PASS: OptDisable: NOT running pass function-attrs on (f1) +; CHECK-MULTI-PASS: OptDisable: running pass early-cse on f1 +; CHECK-MULTI-PASS: OptDisable: NOT running pass function-attrs on (f2) +; CHECK-MULTI-PASS: OptDisable: running pass early-cse on f2 +; CHECK-MULTI-PASS: OptDisable: NOT running pass function-attrs on (f3) +; CHECK-MULTI-PASS: OptDisable: running pass early-cse on f3 +; CHECK-MULTI-PASS: OptDisable: NOT running pass function-attrs on (f4) +; CHECK-MULTI-PASS: OptDisable: running pass early-cse on f4 + +declare i32 @g() + +define void @f1(i1 %arg) { +entry: + br label %loop.0 +loop.0: + br i1 %arg, label %loop.0.0, label %loop.1 +loop.0.0: + br i1 %arg, label %loop.0.0, label %loop.0.1 +loop.0.1: + br i1 %arg, label %loop.0.1, label %loop.0 +loop.1: + br i1 %arg, label %loop.1, label %loop.1.bb1 +loop.1.bb1: + br i1 %arg, label %loop.1, label %loop.1.bb2 +loop.1.bb2: + br i1 
%arg, label %end, label %loop.1.0 +loop.1.0: + br i1 %arg, label %loop.1.0, label %loop.1 +end: + ret void +} + +define i32 @f2() { +entry: + ret i32 0 +} + +define i32 @f3() { +entry: + %temp = call i32 @g() + %icmp = icmp ugt i32 %temp, 2 + br i1 %icmp, label %bb.true, label %bb.false +bb.true: + %temp2 = call i32 @f2() + ret i32 %temp2 +bb.false: + ret i32 0 +} + +define void @f4(i1 %arg) { +entry: + %i = alloca i32, align 4 + call void @llvm.lifetime.start(i64 4, ptr %i) + br label %for.cond + +for.cond: + br i1 %arg, label %for.body, label %for.end + +for.body: + br label %for.cond + +for.end: + ret void +} + +declare void @llvm.lifetime.start(i64, ptr nocapture) From 210cf010c3362e5648d037ea5e4b27c2673837ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 16 Jul 2025 17:02:43 -0700 Subject: [PATCH 118/813] [flang][cuda] Lower globaltimer to NVVM op (#149217) --- flang/include/flang/Optimizer/Builder/IntrinsicCall.h | 1 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 8 ++++++++ flang/module/cudadevice.f90 | 5 +++++ flang/test/Lower/CUDA/cuda-device-proc.cuf | 3 +++ 4 files changed, 17 insertions(+) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index d38c5b6d09a82..363b1d5844d1b 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -282,6 +282,7 @@ struct IntrinsicLibrary { llvm::ArrayRef args); mlir::Value genGetUID(mlir::Type resultType, llvm::ArrayRef args); + mlir::Value genGlobalTimer(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genHostnm(std::optional resultType, llvm::ArrayRef args); fir::ExtendedValue genIall(mlir::Type, llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 
8d0a511744e25..ddfa27475fa7a 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -503,6 +503,7 @@ static constexpr IntrinsicHandler handlers[]{ {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, {"getuid", &I::genGetUID}, + {"globaltimer", &I::genGlobalTimer, {}, /*isElemental=*/false}, {"hostnm", &I::genHostnm, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -4319,6 +4320,13 @@ mlir::Value IntrinsicLibrary::genGetUID(mlir::Type resultType, fir::runtime::genGetUID(builder, loc)); } +// GLOBALTIMER +mlir::Value IntrinsicLibrary::genGlobalTimer(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 0 && "globalTimer takes no args"); + return builder.create(loc, resultType).getResult(); +} + // GET_COMMAND_ARGUMENT void IntrinsicLibrary::genGetCommandArgument( llvm::ArrayRef args) { diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index f8a30da8b9615..52a619e07165c 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1613,6 +1613,11 @@ attributes(device,host) logical function on_device() bind(c) end function end interface + interface + attributes(device) integer(8) function globalTimer() + end function + end interface + contains attributes(device) subroutine syncthreads() diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 42ee7657966e2..888c7961ee2b4 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -46,6 +46,8 @@ attributes(global) subroutine devsub() ai = atomicdec(ai, 1_4) time = clock64() + + time = globalTimer() end ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc} @@ -83,6 +85,7 @@ end ! CHECK: %{{.*}} = llvm.atomicrmw udec_wrap %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: fir.call @llvm.nvvm.read.ptx.sreg.clock64() +! 
CHECK: %{{.*}} = nvvm.read.ptx.sreg.globaltimer : i64 subroutine host1() integer, device :: a(32) From 9d78eb5cc51820bd7076861a9ad175e5666b90d3 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Thu, 17 Jul 2025 08:04:33 +0800 Subject: [PATCH 119/813] [libclc] Enable -fdiscard-value-names build flag to reduce bitcode size (#149016) The flag reduces nvptx64--nvidiacl.bc size from 10.6MB to 5.2MB. --- libclc/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 2570d1a106d21..e4e9a74639b17 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -405,6 +405,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) -I${CMAKE_CURRENT_SOURCE_DIR}/clc/include # Error on undefined macros -Werror=undef + -fdiscard-value-names ) if( NOT "${cpu}" STREQUAL "" ) From b52cf756ced2aefd05b7e2f01026c941f9a04c47 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Wed, 16 Jul 2025 17:07:48 -0700 Subject: [PATCH 120/813] AMDGPU: Treat WMMA XDL ops as TRANS in S_DELAY_ALU insertion for gfx1250 (#149208) WMMA XDL instructions are tracked as TRANs ops and the compiler should consider them the same as TRANS in S_DELAY_ALU insertion. We use a searchable table for the InsertDelayAlu pass to recognize these WMMA XDL instructions. 
Co-authored-by: Stefan Stipanovic --- .../Target/AMDGPU/AMDGPUInsertDelayAlu.cpp | 24 +++--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 15 +++- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 + .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6 ++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 9 ++ .../AMDGPU/insert-delay-alu-wmma-xdl.mir | 84 +++++++++++++++++++ 6 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index 44eaebffb70dc..9a90787963d7b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -25,6 +25,7 @@ namespace { class AMDGPUInsertDelayAlu { public: + const GCNSubtarget *ST; const SIInstrInfo *SII; const TargetRegisterInfo *TRI; @@ -65,13 +66,16 @@ class AMDGPUInsertDelayAlu { // Types of delay that can be encoded in an s_delay_alu instruction. enum DelayType { VALU, TRANS, SALU, OTHER }; - // Get the delay type for an instruction with the specified TSFlags. - static DelayType getDelayType(uint64_t TSFlags) { - if (TSFlags & SIInstrFlags::TRANS) + // Get the delay type for a MachineInstr. + DelayType getDelayType(const MachineInstr &MI) { + if (SIInstrInfo::isTRANS(MI)) return TRANS; - if (TSFlags & SIInstrFlags::VALU) + // WMMA XDL ops are treated the same as TRANS. 
+ if (AMDGPU::isGFX1250(*ST) && SII->isXDLWMMA(MI)) + return TRANS; + if (SIInstrInfo::isVALU(MI)) return VALU; - if (TSFlags & SIInstrFlags::SALU) + if (SIInstrInfo::isSALU(MI)) return SALU; return OTHER; } @@ -368,7 +372,7 @@ class AMDGPUInsertDelayAlu { continue; } - DelayType Type = getDelayType(MI.getDesc().TSFlags); + DelayType Type = getDelayType(MI); if (instructionWaitsForSGPRWrites(MI)) { auto It = State.find(LastSGPRFromVALU); @@ -456,12 +460,12 @@ class AMDGPUInsertDelayAlu { LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName() << "\n"); - const GCNSubtarget &ST = MF.getSubtarget(); - if (!ST.hasDelayAlu()) + ST = &MF.getSubtarget(); + if (!ST->hasDelayAlu()) return false; - SII = ST.getInstrInfo(); - TRI = ST.getRegisterInfo(); + SII = ST->getInstrInfo(); + TRI = ST->getRegisterInfo(); SchedModel = &SII->getSchedModel(); // Calculate the delay state for each basic block, iterating until we reach diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6109a2c4dfc7f..88d30fb555513 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10466,10 +10466,23 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const { return TargetInstrInfo::isGlobalMemoryObject(MI); } +bool SIInstrInfo::isXDLWMMA(const MachineInstr &MI) const { + if (!isWMMA(MI) && !isSWMMAC(MI)) + return false; + + if (AMDGPU::isGFX1250(ST)) + return AMDGPU::getWMMAIsXDL(MI.getOpcode()); + + return true; +} + bool SIInstrInfo::isXDL(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); - if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) || + if (AMDGPU::isGFX12Plus(ST)) + return isDOT(MI) || isXDLWMMA(MI); + + if (!isMAI(MI) || isDGEMM(Opcode) || Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64) return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 
a380199977616..3a48e6579238e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -867,6 +867,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return get(Opcode).TSFlags & SIInstrFlags::IsDOT; } + bool isXDLWMMA(const MachineInstr &MI) const; + bool isXDL(const MachineInstr &MI) const; static bool isDGEMM(unsigned Opcode) { return AMDGPU::getMAIIsDGEMM(Opcode); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 9df2bdededa13..77258810dd68c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -296,6 +296,7 @@ unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) { #define GET_MIMGOffsetMappingTable_IMPL #define GET_MIMGG16MappingTable_IMPL #define GET_MAIInstInfoTable_IMPL +#define GET_WMMAInstInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, @@ -568,6 +569,11 @@ bool getMAIIsGFX940XDL(unsigned Opc) { return Info && Info->is_gfx940_xdl; } +bool getWMMAIsXDL(unsigned Opc) { + const WMMAInstInfo *Info = getWMMAInstInfoHelper(Opc); + return Info ? 
Info->is_wmma_xdl : false; +} + uint8_t mfmaScaleF8F6F4FormatToNumRegs(unsigned EncodingVal) { switch (EncodingVal) { case MFMAScaleFormats::FP6_E2M3: diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 6708e0a3f4549..c9d2c286bf237 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -119,6 +119,11 @@ struct True16D16Info { unsigned LoOp; }; +struct WMMAInstInfo { + uint16_t Opcode; + bool is_wmma_xdl; +}; + #define GET_MIMGBaseOpcode_DECL #define GET_MIMGDim_DECL #define GET_MIMGEncoding_DECL @@ -129,6 +134,7 @@ struct True16D16Info { #define GET_isMFMA_F8F6F4Table_DECL #define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL #define GET_True16D16Table_DECL +#define GET_WMMAInstInfoTable_DECL #include "AMDGPUGenSearchableTables.inc" namespace IsaInfo { @@ -593,6 +599,9 @@ bool getMAIIsDGEMM(unsigned Opc); LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +LLVM_READONLY +bool getWMMAIsXDL(unsigned Opc); + // Get an equivalent BitOp3 for a binary logical \p Opc. // \returns BitOp3 modifier for the logical operation or zero. // Used in VOPD3 conversion. 
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir new file mode 100644 index 0000000000000..7c3170d8d1e9f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir @@ -0,0 +1,84 @@ +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -start-before=amdgpu-insert-delay-alu %s -o - | FileCheck %s + +--- +name: wmma_xdl_twoaddr_trans +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: {{^}}wmma_xdl_twoaddr_trans: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[8:15], v[0:7], v[0:7], v[8:15] + ; CHECK-NEXT: v_exp_f32_e32 v16, v16 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v17, v17, v8 + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16, $vgpr17 + $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_WMMA_F32_16X16X64_FP8_FP8_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 0, 0, 0, 0, implicit $exec + $vgpr16 = V_EXP_F32_e32 $vgpr16, implicit $exec, implicit $mode + $vgpr17 = V_ADD_U32_e32 $vgpr17, $vgpr8, implicit $exec +... 
+ +--- +name: wmma_xdl_threeaddr_trans +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: {{^}}wmma_xdl_threeaddr_trans: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[8:15], v[0:7], v[0:7], v[16:23] + ; CHECK-NEXT: v_exp_f32_e32 v24, v24 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v25, v25, v8 + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24, $vgpr25 + $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = V_WMMA_F32_16X16X64_FP8_FP8_w32_threeaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, 0, 0, implicit $exec + $vgpr24 = V_EXP_F32_e32 $vgpr24, implicit $exec, implicit $mode + $vgpr25 = V_ADD_U32_e32 $vgpr25, $vgpr8, implicit $exec +... + +name: swmmac_xdl_twoaddr_trans +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: {{^}}swmmac_xdl_twoaddr_trans: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] + ; CHECK-NEXT: v_exp_f32_e32 v30, v30 + ; CHECK-NEXT: s_delay_alu instid0(TRANS32_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v31, v31, v24 + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31 + $vgpr24_vgpr25_vgpr26_vgpr27 = V_SWMMAC_F16_16X16X128_BF8_BF8_w32_twoaddr $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, $vgpr24_vgpr25_vgpr26_vgpr27, $vgpr28_vgpr29, 0, 0, 0, implicit $exec + $vgpr30 = V_EXP_F32_e32 $vgpr30, implicit $exec, implicit $mode + $vgpr31 = V_ADD_U32_e32 $vgpr31, $vgpr24, implicit 
$exec +... + +name: wmma_non_xdl_large_data_valu +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: {{^}}wmma_non_xdl_large_data_valu: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse + ; CHECK-NEXT: v_exp_f32_e32 v12, v12 + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) + ; CHECK-NEXT: v_add_nc_u32_e32 v13, v13, v8 + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7, $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, $vgpr12, $vgpr13 + $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 = V_WMMA_F32_16X16X4_F32_w32_twoaddr 8, $vgpr0_vgpr1, 8, $vgpr2_vgpr3, 8, $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, 0, -1, 0, 0, implicit $exec + $vgpr12 = V_EXP_F32_e32 $vgpr12, implicit $exec, implicit $mode + $vgpr13 = V_ADD_U32_e32 $vgpr13, $vgpr8, implicit $exec +... + +--- +name: dot_xdl_dep_2 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: {{^}}dot_xdl_dep_2: + ; CHECK: %bb.0: + ; CHECK-NEXT: v_dot4_i32_iu8 v0, s2, s3, v0 neg_lo:[1,1,0] + ; CHECK-NEXT: v_dot4_i32_iu8 v1, s2, s3, v2 neg_lo:[1,1,0] + ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) + ; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v0 + liveins: $vgpr0, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 + $vgpr0 = V_DOT4_I32_IU8 9, $sgpr2, 9, $sgpr3, 8, $vgpr0, 0, 0, 0, implicit $exec + $vgpr1 = V_DOT4_I32_IU8 9, $sgpr2, 9, $sgpr3, 8, $vgpr2, 0, 0, 0, implicit $exec + $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec +... From 4cf7670b01fb5b01995cf89fe4304bfb0c69a4c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 16 Jul 2025 17:24:17 -0700 Subject: [PATCH 121/813] [flang][cuda] Lower clock() to NNVM op (#149228) Also use a same gen function for all NVVM time ops. 
--- .../flang/Optimizer/Builder/IntrinsicCall.h | 3 ++- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 21 ++++++++++++------- flang/module/cudadevice.f90 | 15 ++++++++----- flang/test/Lower/CUDA/cuda-device-proc.cuf | 3 +++ 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 363b1d5844d1b..b15dd29d68f65 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -282,7 +282,6 @@ struct IntrinsicLibrary { llvm::ArrayRef args); mlir::Value genGetUID(mlir::Type resultType, llvm::ArrayRef args); - mlir::Value genGlobalTimer(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genHostnm(std::optional resultType, llvm::ArrayRef args); fir::ExtendedValue genIall(mlir::Type, llvm::ArrayRef); @@ -377,6 +376,8 @@ struct IntrinsicLibrary { fir::ExtendedValue genNorm2(mlir::Type, llvm::ArrayRef); mlir::Value genNot(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef); + template + mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef); void genPerror(llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index ddfa27475fa7a..bfbc26e5e6c19 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -385,6 +385,7 @@ static constexpr IntrinsicHandler handlers[]{ &I::genChdir, {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, + {"clock", &I::genNVVMTime, {}, /*isElemental=*/false}, {"clock64", &I::genClock64, {}, /*isElemental=*/false}, {"cmplx", &I::genCmplx, @@ -503,7 +504,10 @@ static constexpr IntrinsicHandler handlers[]{ {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, {"getuid", 
&I::genGetUID}, - {"globaltimer", &I::genGlobalTimer, {}, /*isElemental=*/false}, + {"globaltimer", + &I::genNVVMTime, + {}, + /*isElemental=*/false}, {"hostnm", &I::genHostnm, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -4320,13 +4324,6 @@ mlir::Value IntrinsicLibrary::genGetUID(mlir::Type resultType, fir::runtime::genGetUID(builder, loc)); } -// GLOBALTIMER -mlir::Value IntrinsicLibrary::genGlobalTimer(mlir::Type resultType, - llvm::ArrayRef args) { - assert(args.size() == 0 && "globalTimer takes no args"); - return builder.create(loc, resultType).getResult(); -} - // GET_COMMAND_ARGUMENT void IntrinsicLibrary::genGetCommandArgument( llvm::ArrayRef args) { @@ -7207,6 +7204,14 @@ IntrinsicLibrary::genNull(mlir::Type, llvm::ArrayRef args) { return fir::MutableBoxValue(boxStorage, mold->nonDeferredLenParams(), {}); } +// CLOCK, GLOBALTIMER +template +mlir::Value IntrinsicLibrary::genNVVMTime(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 0 && "expect no arguments"); + return builder.create(loc, resultType).getResult(); +} + // PACK fir::ExtendedValue IntrinsicLibrary::genPack(mlir::Type resultType, diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 52a619e07165c..d0c312c09353f 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -957,11 +957,21 @@ attributes(device) pure integer function atomicxori(address, val) ! Time function + interface + attributes(device) integer function clock() + end function + end interface + interface attributes(device) integer(8) function clock64() end function end interface + interface + attributes(device) integer(8) function globalTimer() + end function + end interface + ! 
Warp Match Functions interface match_all_sync @@ -1613,11 +1623,6 @@ attributes(device,host) logical function on_device() bind(c) end function end interface - interface - attributes(device) integer(8) function globalTimer() - end function - end interface - contains attributes(device) subroutine syncthreads() diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 888c7961ee2b4..2d6f734670740 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -10,6 +10,7 @@ attributes(global) subroutine devsub() integer(4) :: ai integer(8) :: al integer(8) :: time + integer :: smalltime call syncthreads() call syncwarp(1) @@ -45,6 +46,7 @@ attributes(global) subroutine devsub() ai = atomicinc(ai, 1_4) ai = atomicdec(ai, 1_4) + smalltime = clock() time = clock64() time = globalTimer() @@ -84,6 +86,7 @@ end ! CHECK: %{{.*}} = llvm.atomicrmw uinc_wrap %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: %{{.*}} = llvm.atomicrmw udec_wrap %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 +! CHECK: %{{.*}} = nvvm.read.ptx.sreg.clock : i32 ! CHECK: fir.call @llvm.nvvm.read.ptx.sreg.clock64() ! CHECK: %{{.*}} = nvvm.read.ptx.sreg.globaltimer : i64 From f56211ebfa7f9ca71b9eeb119012b0f6e1a2b2c9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 16 Jul 2025 17:31:14 -0700 Subject: [PATCH 122/813] [RISCV] Add/remove/simplify tests in xqcisls.ll. NFC -Don't use i64 for GEP indices. -Remove tests that zero extended GEP indices from i32 to i64. -Add i64 load/store tests taken from xtheadmemidx that get split into two i32 load/store. -Don't extend load results to i64. I'm working on improvements to SelectAddrRegRegScale. 
--- llvm/test/CodeGen/RISCV/xqcisls.ll | 282 ++++++++++++++--------------- 1 file changed, 140 insertions(+), 142 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/xqcisls.ll b/llvm/test/CodeGen/RISCV/xqcisls.ll index 2bc4834ad3559..828a0760044aa 100644 --- a/llvm/test/CodeGen/RISCV/xqcisls.ll +++ b/llvm/test/CodeGen/RISCV/xqcisls.ll @@ -206,7 +206,7 @@ define void @sw_ri(i32* %a, i32 %b, i32 %c) { ret void } -define i8 @lrb_anyext(ptr %a, i64 %b) { +define i8 @lrb_anyext(ptr %a, i32 %b) { ; RV32I-LABEL: lrb_anyext: ; RV32I: # %bb.0: ; RV32I-NEXT: add a0, a0, a1 @@ -223,175 +223,106 @@ define i8 @lrb_anyext(ptr %a, i64 %b) { ; RV32IZBAXQCISLS: # %bb.0: ; RV32IZBAXQCISLS-NEXT: qc.lrbu a0, a0, a1, 0 ; RV32IZBAXQCISLS-NEXT: ret - %1 = getelementptr i8, ptr %a, i64 %b + %1 = getelementptr i8, ptr %a, i32 %b %2 = load i8, ptr %1, align 1 ret i8 %2 } -define i64 @lrb(ptr %a, i64 %b) { +define i32 @lrb(ptr %a, i32 %b) { ; RV32I-LABEL: lrb: ; RV32I: # %bb.0: ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lb a1, 0(a0) -; RV32I-NEXT: srai a2, a1, 31 -; RV32I-NEXT: add a0, a1, a1 -; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: add a2, a2, a2 -; RV32I-NEXT: add a1, a2, a1 +; RV32I-NEXT: lb a0, 0(a0) +; RV32I-NEXT: add a0, a0, a0 ; RV32I-NEXT: ret ; ; RV32IZBA-LABEL: lrb: ; RV32IZBA: # %bb.0: ; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: lb a1, 0(a0) -; RV32IZBA-NEXT: srai a2, a1, 31 -; RV32IZBA-NEXT: add a0, a1, a1 -; RV32IZBA-NEXT: sltu a1, a0, a1 -; RV32IZBA-NEXT: add a2, a2, a2 -; RV32IZBA-NEXT: add a1, a2, a1 +; RV32IZBA-NEXT: lb a0, 0(a0) +; RV32IZBA-NEXT: add a0, a0, a0 ; RV32IZBA-NEXT: ret ; ; RV32IZBAXQCISLS-LABEL: lrb: ; RV32IZBAXQCISLS: # %bb.0: -; RV32IZBAXQCISLS-NEXT: qc.lrb a1, a0, a1, 0 -; RV32IZBAXQCISLS-NEXT: srai a2, a1, 31 -; RV32IZBAXQCISLS-NEXT: add a0, a1, a1 -; RV32IZBAXQCISLS-NEXT: sltu a1, a0, a1 -; RV32IZBAXQCISLS-NEXT: add a2, a2, a2 -; RV32IZBAXQCISLS-NEXT: add a1, a2, a1 +; RV32IZBAXQCISLS-NEXT: qc.lrb a0, a0, a1, 0 +; RV32IZBAXQCISLS-NEXT: add a0, 
a0, a0 ; RV32IZBAXQCISLS-NEXT: ret - %1 = getelementptr i8, ptr %a, i64 %b + %1 = getelementptr i8, ptr %a, i32 %b %2 = load i8, ptr %1, align 1 - %3 = sext i8 %2 to i64 - %4 = add i64 %3, %3 - ret i64 %4 + %3 = sext i8 %2 to i32 + %4 = add i32 %3, %3 + ret i32 %4 } -define i8 @lurb_anyext(ptr %a, i32 %b) { -; RV32I-LABEL: lurb_anyext: +define i32 @lrbu(ptr %a, i32 %b) { +; RV32I-LABEL: lrbu: ; RV32I: # %bb.0: ; RV32I-NEXT: add a0, a0, a1 ; RV32I-NEXT: lbu a0, 0(a0) +; RV32I-NEXT: add a0, a0, a0 ; RV32I-NEXT: ret ; -; RV32IZBA-LABEL: lurb_anyext: +; RV32IZBA-LABEL: lrbu: ; RV32IZBA: # %bb.0: ; RV32IZBA-NEXT: add a0, a0, a1 ; RV32IZBA-NEXT: lbu a0, 0(a0) +; RV32IZBA-NEXT: add a0, a0, a0 ; RV32IZBA-NEXT: ret ; -; RV32IZBAXQCISLS-LABEL: lurb_anyext: +; RV32IZBAXQCISLS-LABEL: lrbu: ; RV32IZBAXQCISLS: # %bb.0: ; RV32IZBAXQCISLS-NEXT: qc.lrbu a0, a0, a1, 0 +; RV32IZBAXQCISLS-NEXT: add a0, a0, a0 ; RV32IZBAXQCISLS-NEXT: ret - %1 = zext i32 %b to i64 - %2 = getelementptr i8, ptr %a, i64 %1 - %3 = load i8, ptr %2, align 1 - ret i8 %3 + %1 = getelementptr i8, ptr %a, i32 %b + %2 = load i8, ptr %1, align 1 + %3 = zext i8 %2 to i32 + %4 = add i32 %3, %3 + ret i32 %4 } -define i64 @lurb(ptr %a, i32 %b) { -; RV32I-LABEL: lurb: +define i64 @lrd(ptr %a, i32 %b) { +; RV32I-LABEL: lrd: ; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a1, 3 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lb a1, 0(a0) -; RV32I-NEXT: srai a2, a1, 31 +; RV32I-NEXT: lw a1, 0(a0) +; RV32I-NEXT: lw a2, 4(a0) ; RV32I-NEXT: add a0, a1, a1 ; RV32I-NEXT: sltu a1, a0, a1 ; RV32I-NEXT: add a2, a2, a2 ; RV32I-NEXT: add a1, a2, a1 ; RV32I-NEXT: ret ; -; RV32IZBA-LABEL: lurb: +; RV32IZBA-LABEL: lrd: ; RV32IZBA: # %bb.0: -; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: lb a1, 0(a0) -; RV32IZBA-NEXT: srai a2, a1, 31 +; RV32IZBA-NEXT: sh3add a0, a1, a0 +; RV32IZBA-NEXT: lw a1, 0(a0) +; RV32IZBA-NEXT: lw a2, 4(a0) ; RV32IZBA-NEXT: add a0, a1, a1 ; RV32IZBA-NEXT: sltu a1, a0, a1 ; RV32IZBA-NEXT: add a2, a2, a2 ; RV32IZBA-NEXT: 
add a1, a2, a1 ; RV32IZBA-NEXT: ret ; -; RV32IZBAXQCISLS-LABEL: lurb: -; RV32IZBAXQCISLS: # %bb.0: -; RV32IZBAXQCISLS-NEXT: qc.lrb a1, a0, a1, 0 -; RV32IZBAXQCISLS-NEXT: srai a2, a1, 31 -; RV32IZBAXQCISLS-NEXT: add a0, a1, a1 -; RV32IZBAXQCISLS-NEXT: sltu a1, a0, a1 -; RV32IZBAXQCISLS-NEXT: add a2, a2, a2 -; RV32IZBAXQCISLS-NEXT: add a1, a2, a1 -; RV32IZBAXQCISLS-NEXT: ret - %1 = zext i32 %b to i64 - %2 = getelementptr i8, ptr %a, i64 %1 - %3 = load i8, ptr %2, align 1 - %4 = sext i8 %3 to i64 - %5 = add i64 %4, %4 - ret i64 %5 -} - -define i64 @lrbu(ptr %a, i64 %b) { -; RV32I-LABEL: lrbu: -; RV32I: # %bb.0: -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: add a0, a1, a1 -; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: ret -; -; RV32IZBA-LABEL: lrbu: -; RV32IZBA: # %bb.0: -; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: lbu a1, 0(a0) -; RV32IZBA-NEXT: add a0, a1, a1 -; RV32IZBA-NEXT: sltu a1, a0, a1 -; RV32IZBA-NEXT: ret -; -; RV32IZBAXQCISLS-LABEL: lrbu: -; RV32IZBAXQCISLS: # %bb.0: -; RV32IZBAXQCISLS-NEXT: qc.lrbu a1, a0, a1, 0 -; RV32IZBAXQCISLS-NEXT: add a0, a1, a1 -; RV32IZBAXQCISLS-NEXT: sltu a1, a0, a1 -; RV32IZBAXQCISLS-NEXT: ret - %1 = getelementptr i8, ptr %a, i64 %b - %2 = load i8, ptr %1, align 1 - %3 = zext i8 %2 to i64 - %4 = add i64 %3, %3 - ret i64 %4 -} - -define i64 @lurbu(ptr %a, i32 %b) { -; RV32I-LABEL: lurbu: -; RV32I: # %bb.0: -; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: lbu a1, 0(a0) -; RV32I-NEXT: add a0, a1, a1 -; RV32I-NEXT: sltu a1, a0, a1 -; RV32I-NEXT: ret -; -; RV32IZBA-LABEL: lurbu: -; RV32IZBA: # %bb.0: -; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: lbu a1, 0(a0) -; RV32IZBA-NEXT: add a0, a1, a1 -; RV32IZBA-NEXT: sltu a1, a0, a1 -; RV32IZBA-NEXT: ret -; -; RV32IZBAXQCISLS-LABEL: lurbu: +; RV32IZBAXQCISLS-LABEL: lrd: ; RV32IZBAXQCISLS: # %bb.0: -; RV32IZBAXQCISLS-NEXT: qc.lrbu a1, a0, a1, 0 -; RV32IZBAXQCISLS-NEXT: add a0, a1, a1 -; RV32IZBAXQCISLS-NEXT: sltu a1, a0, a1 +; RV32IZBAXQCISLS-NEXT: qc.lrw a2, 
a0, a1, 3 +; RV32IZBAXQCISLS-NEXT: sh3add a0, a1, a0 +; RV32IZBAXQCISLS-NEXT: lw a1, 4(a0) +; RV32IZBAXQCISLS-NEXT: add a0, a2, a2 +; RV32IZBAXQCISLS-NEXT: sltu a2, a0, a2 +; RV32IZBAXQCISLS-NEXT: add a1, a1, a1 +; RV32IZBAXQCISLS-NEXT: add a1, a1, a2 ; RV32IZBAXQCISLS-NEXT: ret - %1 = zext i32 %b to i64 - %2 = getelementptr i8, ptr %a, i64 %1 - %3 = load i8, ptr %2, align 1 - %4 = zext i8 %3 to i64 - %5 = add i64 %4, %4 - ret i64 %5 + %1 = getelementptr i64, ptr %a, i32 %b + %2 = load i64, ptr %1, align 8 + %3 = add i64 %2, %2 + ret i64 %3 } -define i64 @lrd_2(ptr %a, i64 %b) { +define i64 @lrd_2(ptr %a, i32 %b) { ; RV32I-LABEL: lrd_2: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 3 @@ -426,67 +357,134 @@ define i64 @lrd_2(ptr %a, i64 %b) { ; RV32IZBAXQCISLS-NEXT: add a1, a1, a1 ; RV32IZBAXQCISLS-NEXT: add a1, a1, a2 ; RV32IZBAXQCISLS-NEXT: ret - %1 = add i64 %b, 12 - %2 = getelementptr i64, ptr %a, i64 %1 + %1 = add i32 %b, 12 + %2 = getelementptr i64, ptr %a, i32 %1 %3 = load i64, ptr %2, align 8 %4 = add i64 %3, %3 ret i64 %4 } -define void @srb(ptr %a, i64 %b, i8 %c) { +define void @srb(ptr %a, i32 %b, i8 %c) { ; RV32I-LABEL: srb: ; RV32I: # %bb.0: -; RV32I-NEXT: add a3, a3, a3 +; RV32I-NEXT: add a2, a2, a2 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: sb a3, 0(a0) +; RV32I-NEXT: sb a2, 0(a0) ; RV32I-NEXT: ret ; ; RV32IZBA-LABEL: srb: ; RV32IZBA: # %bb.0: -; RV32IZBA-NEXT: add a3, a3, a3 +; RV32IZBA-NEXT: add a2, a2, a2 ; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: sb a3, 0(a0) +; RV32IZBA-NEXT: sb a2, 0(a0) ; RV32IZBA-NEXT: ret ; ; RV32IZBAXQCISLS-LABEL: srb: ; RV32IZBAXQCISLS: # %bb.0: -; RV32IZBAXQCISLS-NEXT: add a3, a3, a3 -; RV32IZBAXQCISLS-NEXT: qc.srb a3, a0, a1, 0 +; RV32IZBAXQCISLS-NEXT: add a2, a2, a2 +; RV32IZBAXQCISLS-NEXT: qc.srb a2, a0, a1, 0 ; RV32IZBAXQCISLS-NEXT: ret %1 = add i8 %c, %c - %2 = getelementptr i8, ptr %a, i64 %b + %2 = getelementptr i8, ptr %a, i32 %b store i8 %1, ptr %2, align 1 ret void } -define void @surb(ptr %a, i32 %b, i8 
%c) { -; RV32I-LABEL: surb: +define void @srh(ptr %a, i32 %b, i16 %c) { +; RV32I-LABEL: srh: ; RV32I: # %bb.0: ; RV32I-NEXT: add a2, a2, a2 +; RV32I-NEXT: slli a1, a1, 1 ; RV32I-NEXT: add a0, a0, a1 -; RV32I-NEXT: sb a2, 0(a0) +; RV32I-NEXT: sh a2, 0(a0) ; RV32I-NEXT: ret ; -; RV32IZBA-LABEL: surb: +; RV32IZBA-LABEL: srh: ; RV32IZBA: # %bb.0: ; RV32IZBA-NEXT: add a2, a2, a2 -; RV32IZBA-NEXT: add a0, a0, a1 -; RV32IZBA-NEXT: sb a2, 0(a0) +; RV32IZBA-NEXT: sh1add a0, a1, a0 +; RV32IZBA-NEXT: sh a2, 0(a0) ; RV32IZBA-NEXT: ret ; -; RV32IZBAXQCISLS-LABEL: surb: +; RV32IZBAXQCISLS-LABEL: srh: ; RV32IZBAXQCISLS: # %bb.0: ; RV32IZBAXQCISLS-NEXT: add a2, a2, a2 -; RV32IZBAXQCISLS-NEXT: qc.srb a2, a0, a1, 0 +; RV32IZBAXQCISLS-NEXT: qc.srh a2, a0, a1, 1 +; RV32IZBAXQCISLS-NEXT: ret + %1 = add i16 %c, %c + %2 = getelementptr i16, ptr %a, i32 %b + store i16 %1, ptr %2, align 2 + ret void +} + +define void @srw(ptr %a, i32 %b, i32 %c) { +; RV32I-LABEL: srw: +; RV32I: # %bb.0: +; RV32I-NEXT: add a2, a2, a2 +; RV32I-NEXT: slli a1, a1, 2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: sw a2, 0(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: srw: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: add a2, a2, a2 +; RV32IZBA-NEXT: sh2add a0, a1, a0 +; RV32IZBA-NEXT: sw a2, 0(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: srw: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: add a2, a2, a2 +; RV32IZBAXQCISLS-NEXT: qc.srw a2, a0, a1, 2 +; RV32IZBAXQCISLS-NEXT: ret + %1 = add i32 %c, %c + %2 = getelementptr i32, ptr %a, i32 %b + store i32 %1, ptr %2, align 4 + ret void +} + +define void @srd(ptr %a, i32 %b, i64 %c) { +; RV32I-LABEL: srd: +; RV32I: # %bb.0: +; RV32I-NEXT: add a4, a2, a2 +; RV32I-NEXT: add a3, a3, a3 +; RV32I-NEXT: slli a1, a1, 3 +; RV32I-NEXT: sltu a2, a4, a2 +; RV32I-NEXT: add a0, a0, a1 +; RV32I-NEXT: add a2, a3, a2 +; RV32I-NEXT: sw a4, 0(a0) +; RV32I-NEXT: sw a2, 4(a0) +; RV32I-NEXT: ret +; +; RV32IZBA-LABEL: srd: +; RV32IZBA: # %bb.0: +; RV32IZBA-NEXT: add a4, a2, a2 +; 
RV32IZBA-NEXT: add a3, a3, a3 +; RV32IZBA-NEXT: sh3add a0, a1, a0 +; RV32IZBA-NEXT: sltu a1, a4, a2 +; RV32IZBA-NEXT: add a1, a3, a1 +; RV32IZBA-NEXT: sw a4, 0(a0) +; RV32IZBA-NEXT: sw a1, 4(a0) +; RV32IZBA-NEXT: ret +; +; RV32IZBAXQCISLS-LABEL: srd: +; RV32IZBAXQCISLS: # %bb.0: +; RV32IZBAXQCISLS-NEXT: add a4, a2, a2 +; RV32IZBAXQCISLS-NEXT: add a3, a3, a3 +; RV32IZBAXQCISLS-NEXT: sltu a2, a4, a2 +; RV32IZBAXQCISLS-NEXT: add a2, a3, a2 +; RV32IZBAXQCISLS-NEXT: sh3add a3, a1, a0 +; RV32IZBAXQCISLS-NEXT: qc.srw a4, a0, a1, 3 +; RV32IZBAXQCISLS-NEXT: sw a2, 4(a3) ; RV32IZBAXQCISLS-NEXT: ret - %1 = zext i32 %b to i64 - %2 = add i8 %c, %c - %3 = getelementptr i8, ptr %a, i64 %1 - store i8 %2, ptr %3, align 1 + %1 = add i64 %c, %c + %2 = getelementptr i64, ptr %a, i32 %b + store i64 %1, ptr %2, align 8 ret void } -define i64 @lrd_large_shift(ptr %a, i64 %b) { +define i64 @lrd_large_shift(ptr %a, i32 %b) { ; RV32I-LABEL: lrd_large_shift: ; RV32I: # %bb.0: ; RV32I-NEXT: slli a1, a1, 5 @@ -510,9 +508,9 @@ define i64 @lrd_large_shift(ptr %a, i64 %b) { ; RV32IZBAXQCISLS-NEXT: qc.lrw a0, a2, a1, 5 ; RV32IZBAXQCISLS-NEXT: qc.lrw a1, a3, a1, 5 ; RV32IZBAXQCISLS-NEXT: ret - %1 = add i64 %b, 12 - %2 = shl i64 %1, 2 - %3 = getelementptr i64, ptr %a, i64 %2 + %1 = add i32 %b, 12 + %2 = shl i32 %1, 2 + %3 = getelementptr i64, ptr %a, i32 %2 %4 = load i64, ptr %3, align 8 ret i64 %4 } From 0692572e040979b2de0dceb8f0537aa16caf351f Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Thu, 17 Jul 2025 10:01:10 +0900 Subject: [PATCH 123/813] [clang][CodeGen] Fix metadata when vectorization is disabled by pragma (#135163) Currently, when specifying `vectorize(disable) unroll_count(8)`, the generated metadata appears as follows: ``` !loop0 = !{!"loop0", !vectorize_width, !followup} !vectorize_width = !{!"llvm.loop.vectorize.width", i32 1} !followup = !{!"llvm.loop.vectorize.followup_all", !unroll} !unroll = !{!"llvm.loop.unroll_count", i32 8} ``` Since the metadata `!vectorize_width` 
implies that the vectorization is disabled, the vectorization process is skipped, and the `!followup` metadata is not processed correctly. This patch addresses the issue by directly appending properties to the metadata node when vectorization is disabled, instead of creating a new follow-up MDNode. In the above case, the generated metadata will now look like this: ``` !loop0 = !{!"loop0", !vectorize_width, !vectorize_width, !unroll} !vectorize_width = !{!"llvm.loop.vectorize.width", i32 1} !unroll = !{!"llvm.loop.unroll_count", i32 8} ``` --- clang/lib/CodeGen/CGLoopInfo.cpp | 46 +++++++++++++++++---------- clang/test/CodeGenCXX/pragma-loop.cpp | 41 ++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 17 deletions(-) diff --git a/clang/lib/CodeGen/CGLoopInfo.cpp b/clang/lib/CodeGen/CGLoopInfo.cpp index 4a9092842858b..b2b569a43038c 100644 --- a/clang/lib/CodeGen/CGLoopInfo.cpp +++ b/clang/lib/CodeGen/CGLoopInfo.cpp @@ -221,18 +221,6 @@ LoopInfo::createLoopVectorizeMetadata(const LoopAttributes &Attrs, return createUnrollAndJamMetadata(Attrs, LoopProperties, HasUserTransforms); } - // Apply all loop properties to the vectorized loop. - SmallVector FollowupLoopProperties; - FollowupLoopProperties.append(LoopProperties.begin(), LoopProperties.end()); - - // Don't vectorize an already vectorized loop. - FollowupLoopProperties.push_back( - MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.isvectorized"))); - - bool FollowupHasTransforms = false; - SmallVector Followup = createUnrollAndJamMetadata( - Attrs, FollowupLoopProperties, FollowupHasTransforms); - SmallVector Args; Args.append(LoopProperties.begin(), LoopProperties.end()); @@ -286,22 +274,46 @@ LoopInfo::createLoopVectorizeMetadata(const LoopAttributes &Attrs, // 5) it is implied when vectorize.width is unset (0) and the user // explicitly requested fixed-width vectorization, i.e. // vectorize.scalable.enable is false. 
+ bool VectorizeEnabled = false; if (Attrs.VectorizeEnable != LoopAttributes::Unspecified || (IsVectorPredicateEnabled && Attrs.VectorizeWidth != 1) || Attrs.VectorizeWidth > 1 || Attrs.VectorizeScalable == LoopAttributes::Enable || (Attrs.VectorizeScalable == LoopAttributes::Disable && Attrs.VectorizeWidth != 1)) { - bool AttrVal = Attrs.VectorizeEnable != LoopAttributes::Disable; + VectorizeEnabled = Attrs.VectorizeEnable != LoopAttributes::Disable; Args.push_back( MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), ConstantAsMetadata::get(ConstantInt::get( - llvm::Type::getInt1Ty(Ctx), AttrVal))})); + llvm::Type::getInt1Ty(Ctx), VectorizeEnabled))})); } - if (FollowupHasTransforms) - Args.push_back( - createFollowupMetadata("llvm.loop.vectorize.followup_all", Followup)); + // Apply all loop properties to the vectorized loop. + SmallVector FollowupLoopProperties; + + // If vectorization is not explicitly enabled, the follow-up metadata will be + // directly appended to the list currently being created. In that case, adding + // LoopProperties to FollowupLoopProperties would result in duplication. + if (VectorizeEnabled) + FollowupLoopProperties.append(LoopProperties.begin(), LoopProperties.end()); + + // Don't vectorize an already vectorized loop. + FollowupLoopProperties.push_back( + MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.isvectorized"))); + + bool FollowupHasTransforms = false; + SmallVector Followup = createUnrollAndJamMetadata( + Attrs, FollowupLoopProperties, FollowupHasTransforms); + + if (FollowupHasTransforms) { + // If vectorization is explicitly enabled, we create a follow-up metadata, + // otherwise directly add the contents of it to Args. 
+ if (VectorizeEnabled) + Args.push_back( + createFollowupMetadata("llvm.loop.vectorize.followup_all", Followup)); + else + Args.append(Followup.begin(), Followup.end()); + } HasUserTransforms = true; return Args; diff --git a/clang/test/CodeGenCXX/pragma-loop.cpp b/clang/test/CodeGenCXX/pragma-loop.cpp index 4857299f1c037..8cb3346247daf 100644 --- a/clang/test/CodeGenCXX/pragma-loop.cpp +++ b/clang/test/CodeGenCXX/pragma-loop.cpp @@ -203,6 +203,43 @@ void for_test_scalable_1(int *List, int Length) { } } +// Verify for loop is not performing vectorization +void for_test_width_1(int *List, int Length) { +#pragma clang loop vectorize_width(1) interleave_count(4) unroll(disable) distribute(disable) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_20:.*]] + List[i] = i * 2; + } +} + +// Verify for loop is not performing vectorization +void for_test_fixed_1(int *List, int Length) { +#pragma clang loop vectorize_width(1, fixed) interleave_count(4) unroll(disable) distribute(disable) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_21:.*]] + List[i] = i * 2; + } +} + + +// Verify unroll attributes are directly attached to the loop metadata +void for_test_vectorize_disable_unroll(int *List, int Length) { +#pragma clang loop vectorize(disable) unroll_count(8) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_22:.*]] + List[i] = i * 2; + } +} + +// Verify unroll attributes are directly attached to the loop metadata +void for_test_interleave_vectorize_disable_unroll(int *List, int Length) { +#pragma clang loop vectorize(disable) interleave_count(4) unroll_count(8) + for (int i = 0; i < Length; i++) { + // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_23:.*]] + List[i] = i * 2; + } +} + // CHECK-DAG: ![[MP:[0-9]+]] = !{!"llvm.loop.mustprogress"} // CHECK-DAG: ![[UNROLL_DISABLE:[0-9]+]] = !{!"llvm.loop.unroll.disable"} @@ -270,3 +307,7 @@ void for_test_scalable_1(int *List, 
int Length) { // CHECK-DAG: ![[LOOP_17]] = distinct !{![[LOOP_17]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[FIXED_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} // CHECK-DAG: ![[LOOP_18]] = distinct !{![[LOOP_18]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} // CHECK-DAG: ![[LOOP_19]] = distinct !{![[LOOP_19]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[SCALABLE_VEC]], ![[INTERLEAVE_4]], ![[VECTORIZE_ENABLE]]} +// CHECK-DAG: ![[LOOP_20]] = distinct !{![[LOOP_20]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[FIXED_VEC]], ![[INTERLEAVE_4]]} +// CHECK-DAG: ![[LOOP_21]] = distinct !{![[LOOP_21]], ![[MP]], ![[UNROLL_DISABLE]], ![[DISTRIBUTE_DISABLE]], ![[WIDTH_1]], ![[FIXED_VEC]], ![[INTERLEAVE_4]]} +// CHECK-DAG: ![[LOOP_22]] = distinct !{![[LOOP_22]], ![[MP]], ![[WIDTH_1]], ![[ISVECTORIZED]], ![[UNROLL_8]]} +// CHECK-DAG: ![[LOOP_23]] = distinct !{![[LOOP_23]], ![[MP]], ![[WIDTH_1]], ![[INTERLEAVE_4]], ![[ISVECTORIZED]], ![[UNROLL_8]]} From b41398294c85eacdb37b9637eed6f0e91edf35cf Mon Sep 17 00:00:00 2001 From: Wenju He Date: Thu, 17 Jul 2025 09:02:10 +0800 Subject: [PATCH 124/813] [SPIR] Set MaxAtomicInlineWidth minimum size to 32 for spir32 and 64 for spir64 (#148997) Set MaxAtomicInlineWidth the same way as SPIR-V targets in 3cfd0c0d3697. 
This PR fixes build warning in scoped atomic built-in in #146814: `warning: large atomic operation may incur significant performance penalty; ; the access size (2 bytes) exceeds the max lock-free size (0 bytes) [-Watomic-alignment]` --- clang/lib/Basic/Targets/SPIR.h | 6 ++++++ clang/test/CodeGenOpenCL/scoped-atomic.cl | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 clang/test/CodeGenOpenCL/scoped-atomic.cl diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 1abf798d93129..c13b286cd7916 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -264,6 +264,9 @@ class LLVM_LIBRARY_VISIBILITY SPIR32TargetInfo : public SPIRTargetInfo { PointerWidth = PointerAlign = 32; SizeType = TargetInfo::UnsignedInt; PtrDiffType = IntPtrType = TargetInfo::SignedInt; + // SPIR32 has support for atomic ops if atomic extension is enabled. + // Take the maximum because it's possible the Host supports wider types. + MaxAtomicInlineWidth = std::max(MaxAtomicInlineWidth, 32); resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } @@ -281,6 +284,9 @@ class LLVM_LIBRARY_VISIBILITY SPIR64TargetInfo : public SPIRTargetInfo { PointerWidth = PointerAlign = 64; SizeType = TargetInfo::UnsignedLong; PtrDiffType = IntPtrType = TargetInfo::SignedLong; + // SPIR64 has support for atomic ops if atomic extension is enabled. + // Take the maximum because it's possible the Host supports wider types. 
+ MaxAtomicInlineWidth = std::max(MaxAtomicInlineWidth, 64); resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } diff --git a/clang/test/CodeGenOpenCL/scoped-atomic.cl b/clang/test/CodeGenOpenCL/scoped-atomic.cl new file mode 100644 index 0000000000000..ec7e936684a3a --- /dev/null +++ b/clang/test/CodeGenOpenCL/scoped-atomic.cl @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -o - -triple spir-unknown-unknown -verify +// RUN: %clang_cc1 %s -cl-std=CL2.0 -emit-llvm -o - -triple spir64-unknown-unknown -verify + +// expected-no-diagnostics + +int fi1a(int *i) { + int v; + __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); + return v; +} + +#ifdef __SPIR64__ +long fl1a(long *i) { + long v; + __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); + return v; +} +#endif From b9adc4a59cb50c98ec0e01645ea5eb64e6628afd Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 16 Jul 2025 18:09:27 -0700 Subject: [PATCH 125/813] [IA] Use a single callback for lowerInterleaveIntrinsic [nfc] (#148978) (#149168) This continues in the direction started by commit 4b81dc7. We essentially merges the handling for VPStore - currently in lowerInterleavedVPStore which is shared between shuffle and intrinsic based interleaves - into the existing dedicated routine. 
--- llvm/include/llvm/CodeGen/TargetLowering.h | 8 ++- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 19 +++--- .../Target/AArch64/AArch64ISelLowering.cpp | 7 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 +- .../Target/RISCV/RISCVInterleavedAccess.cpp | 68 +++++++++++++------ 6 files changed, 70 insertions(+), 38 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 238d07a20eec8..084b788d51828 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3261,10 +3261,14 @@ class LLVM_ABI TargetLoweringBase { /// Return true on success. Currently only supports /// llvm.vector.interleave{2,3,5,7} /// - /// \p SI is the accompanying store instruction + /// \p Store is the accompanying store instruction. Can be either a plain + /// store or a vp.store intrinsic. + /// \p Mask is a per-segment (i.e. number of lanes equal to that of one + /// component being interwoven) mask. Can be nullptr, in which case the + /// result is uncondiitional. /// \p InterleaveValues contains the interleaved values. 
virtual bool - lowerInterleaveIntrinsicToStore(StoreInst *SI, + lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask, ArrayRef InterleaveValues) const { return false; } diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 68a956921c8e0..d43cd46f8ad82 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -662,23 +662,19 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( const unsigned Factor = getInterleaveIntrinsicFactor(II->getIntrinsicID()); assert(Factor && "unexpected interleave intrinsic"); + Value *Mask = nullptr; if (auto *VPStore = dyn_cast(StoredBy)) { if (VPStore->getIntrinsicID() != Intrinsic::vp_store) return false; Value *WideMask = VPStore->getOperand(2); - Value *Mask = getMask(WideMask, Factor, - cast(InterleaveValues[0]->getType())); + Mask = getMask(WideMask, Factor, + cast(InterleaveValues[0]->getType())); if (!Mask) return false; LLVM_DEBUG(dbgs() << "IA: Found a vp.store with interleave intrinsic " << *II << " and factor = " << Factor << "\n"); - - // Since lowerInterleavedStore expects Shuffle and StoreInst, use special - // TLI function to emit target-specific interleaved instruction. - if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues)) - return false; } else { auto *SI = cast(StoredBy); if (!SI->isSimple()) @@ -686,12 +682,13 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic( LLVM_DEBUG(dbgs() << "IA: Found a store with interleave intrinsic " << *II << " and factor = " << Factor << "\n"); - - // Try and match this with target specific intrinsics. - if (!TLI->lowerInterleaveIntrinsicToStore(SI, InterleaveValues)) - return false; } + // Try and match this with target specific intrinsics. + if (!TLI->lowerInterleaveIntrinsicToStore(cast(StoredBy), Mask, + InterleaveValues)) + return false; + // We now have a target-specific store, so delete the old one. 
DeadInsts.insert(cast(StoredBy)); DeadInsts.insert(II); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 46c53843ba3a4..ff23f76fadccd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17564,12 +17564,17 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( } bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef InterleavedValues) const { + Instruction *Store, Value *Mask, + ArrayRef InterleavedValues) const { unsigned Factor = InterleavedValues.size(); if (Factor != 2 && Factor != 4) { LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n"); return false; } + StoreInst *SI = dyn_cast(Store); + if (!SI) + return false; + assert(!Mask && "Unexpected mask on plain store"); VectorType *VTy = cast(InterleavedValues[0]->getType()); const DataLayout &DL = SI->getModule()->getDataLayout(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index a19bf19387a8c..7b1de3d3254f2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -222,7 +222,8 @@ class AArch64TargetLowering : public TargetLowering { IntrinsicInst *DI) const override; bool lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef InterleaveValues) const override; + Instruction *Store, Value *Mask, + ArrayRef InterleaveValues) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalAddScalableImmediate(int64_t) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index e8adf561c9c35..a5d735c407e5c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -441,7 +441,8 @@ class RISCVTargetLowering : public TargetLowering { IntrinsicInst *DI) const override; bool 
lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef InterleaveValues) const override; + Instruction *Store, Value *Mask, + ArrayRef InterleaveValues) const override; bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, ArrayRef DeinterleaveRes) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 025054d5a2a60..71d4a353c69c2 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -349,47 +349,71 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( } bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( - StoreInst *SI, ArrayRef InterleaveValues) const { + Instruction *Store, Value *Mask, ArrayRef InterleaveValues) const { unsigned Factor = InterleaveValues.size(); if (Factor > 8) return false; - assert(SI->isSimple()); - IRBuilder<> Builder(SI); + IRBuilder<> Builder(Store); auto *InVTy = cast(InterleaveValues[0]->getType()); - auto *PtrTy = SI->getPointerOperandType(); - const DataLayout &DL = SI->getDataLayout(); + const DataLayout &DL = Store->getDataLayout(); + Type *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(), - SI->getPointerAddressSpace(), DL)) - return false; + Value *Ptr, *VL; + Align Alignment; + if (auto *SI = dyn_cast(Store)) { + assert(SI->isSimple()); + Ptr = SI->getPointerOperand(); + Alignment = SI->getAlign(); + assert(!Mask && "Unexpected mask on a store"); + Mask = Builder.getAllOnesMask(InVTy->getElementCount()); + VL = isa(InVTy) + ? 
Builder.CreateElementCount(XLenTy, InVTy->getElementCount()) + : Constant::getAllOnesValue(XLenTy); + } else { + auto *VPStore = cast(Store); + assert(VPStore->getIntrinsicID() == Intrinsic::vp_store && + "Unexpected intrinsic"); + Ptr = VPStore->getMemoryPointerParam(); + Alignment = VPStore->getPointerAlignment().value_or( + DL.getABITypeAlign(InVTy->getElementType())); + + assert(Mask && "vp.store needs a mask!"); + + Value *WideEVL = VPStore->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, DL, Factor)) + return false; - Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); + VL = Builder.CreateZExt( + Builder.CreateUDiv(WideEVL, + ConstantInt::get(WideEVL->getType(), Factor)), + XLenTy); + } + Type *PtrTy = Ptr->getType(); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL)) + return false; if (isa(InVTy)) { Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy}); - + Store->getModule(), FixedVssegIntrIds[Factor - 2], + {InVTy, PtrTy, XLenTy}); SmallVector Ops(InterleaveValues); - Value *VL = Builder.CreateElementCount(XLenTy, InVTy->getElementCount()); - Value *Mask = Builder.getAllOnesMask(InVTy->getElementCount()); - Ops.append({SI->getPointerOperand(), Mask, VL}); - + Ops.append({Ptr, Mask, VL}); Builder.CreateCall(VssegNFunc, Ops); return true; } unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType()); unsigned NumElts = InVTy->getElementCount().getKnownMinValue(); Type *VecTupTy = TargetExtType::get( - SI->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(SI->getContext()), + Store->getContext(), "riscv.vector.tuple", + ScalableVectorType::get(Type::getInt8Ty(Store->getContext()), NumElts * SEW / 8), Factor); - 
Value *VL = Constant::getAllOnesValue(XLenTy); - Value *Mask = Builder.getAllOnesMask(InVTy->getElementCount()); - Value *StoredVal = PoisonValue::get(VecTupTy); for (unsigned i = 0; i < Factor; ++i) StoredVal = Builder.CreateIntrinsic( @@ -397,10 +421,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( {StoredVal, InterleaveValues[i], Builder.getInt32(i)}); Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( - SI->getModule(), ScalableVssegIntrIds[Factor - 2], + Store->getModule(), ScalableVssegIntrIds[Factor - 2], {VecTupTy, PtrTy, Mask->getType(), VL->getType()}); - Value *Operands[] = {StoredVal, SI->getPointerOperand(), Mask, VL, + Value *Operands[] = {StoredVal, Ptr, Mask, VL, ConstantInt::get(XLenTy, Log2_64(SEW))}; Builder.CreateCall(VssegNFunc, Operands); return true; From 34951f7de80c4b4ac2b884d08dd919efed23c024 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 16 Jul 2025 18:12:18 -0700 Subject: [PATCH 126/813] [flang][cuda] Use NVVM op for clock64 (#149223) --- .../flang/Optimizer/Builder/IntrinsicCall.h | 1 - flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 17 +++++------------ flang/test/Lower/CUDA/cuda-device-proc.cuf | 3 +-- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index b15dd29d68f65..01801dbdaffca 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -241,7 +241,6 @@ struct IntrinsicLibrary { void genCFProcPointer(llvm::ArrayRef); fir::ExtendedValue genCFunLoc(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genCLoc(mlir::Type, llvm::ArrayRef); - mlir::Value genClock64(mlir::Type, llvm::ArrayRef); template fir::ExtendedValue genCPtrCompare(mlir::Type, llvm::ArrayRef); diff --git 
a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index bfbc26e5e6c19..7aa5602d2bc84 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -386,7 +386,10 @@ static constexpr IntrinsicHandler handlers[]{ {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, {"clock", &I::genNVVMTime, {}, /*isElemental=*/false}, - {"clock64", &I::genClock64, {}, /*isElemental=*/false}, + {"clock64", + &I::genNVVMTime, + {}, + /*isElemental=*/false}, {"cmplx", &I::genCmplx, {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}}, @@ -3565,16 +3568,6 @@ IntrinsicLibrary::genChdir(std::optional resultType, return {}; } -// CLOCK64 -mlir::Value IntrinsicLibrary::genClock64(mlir::Type resultType, - llvm::ArrayRef args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.read.ptx.sreg.clock64"; - mlir::MLIRContext *context = builder.getContext(); - mlir::FunctionType ftype = mlir::FunctionType::get(context, {}, {resultType}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - return builder.create(loc, funcOp, args).getResult(0); -} - // CMPLX mlir::Value IntrinsicLibrary::genCmplx(mlir::Type resultType, llvm::ArrayRef args) { @@ -7204,7 +7197,7 @@ IntrinsicLibrary::genNull(mlir::Type, llvm::ArrayRef args) { return fir::MutableBoxValue(boxStorage, mold->nonDeferredLenParams(), {}); } -// CLOCK, GLOBALTIMER +// CLOCK, CLOCK64, GLOBALTIMER template mlir::Value IntrinsicLibrary::genNVVMTime(mlir::Type resultType, llvm::ArrayRef args) { diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 2d6f734670740..d5e614a83b354 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -48,7 +48,6 @@ attributes(global) subroutine devsub() smalltime = clock() time = clock64() - time = globalTimer() end @@ -87,7 +86,7 @@ end ! 
CHECK: %{{.*}} = llvm.atomicrmw udec_wrap %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: %{{.*}} = nvvm.read.ptx.sreg.clock : i32 -! CHECK: fir.call @llvm.nvvm.read.ptx.sreg.clock64() +! CHECK: %{{.*}} = nvvm.read.ptx.sreg.clock64 : i64 ! CHECK: %{{.*}} = nvvm.read.ptx.sreg.globaltimer : i64 subroutine host1() From b36188514a76ba979439a1dcab58e68478e3f0ad Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 16 Jul 2025 19:11:32 -0700 Subject: [PATCH 127/813] [RISCV][IA] Check nuw on multiply when analyzing EVL (#149205) If we're checking to see if a number is a multiple of a small constant, we need to be sure the multiply doesn't overflow for the mul logic to hold. The VL is an unsigned number, so we care about unsigned overflow. Once we've proven a number is a multiple, we can also use an exact udiv as we know we're not discarding any bits. This fixes what is technically a miscompile with EVL vectorization, but I doubt we'd ever have seen it in practice since most EVLs are going to be much less than UINT_MAX. --- .../Target/RISCV/RISCVInterleavedAccess.cpp | 26 ++- .../RISCV/rvv/vp-vector-interleaved-access.ll | 164 +++--------------- 2 files changed, 38 insertions(+), 152 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 71d4a353c69c2..8fb6ccaac2c9a 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -244,7 +244,7 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { // Right now we're only recognizing the simplest pattern.
uint64_t C; if (match(V, m_CombineOr(m_ConstantInt(C), - m_c_Mul(m_Value(), m_ConstantInt(C)))) && + m_NUWMul(m_Value(), m_ConstantInt(C)))) && C && C % N == 0) return true; @@ -296,10 +296,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) return false; - VL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, - ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); } Type *PtrTy = Ptr->getType(); @@ -387,10 +385,8 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( if (!isMultipleOfN(WideEVL, DL, Factor)) return false; - VL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, - ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); } Type *PtrTy = Ptr->getType(); unsigned AS = Ptr->getType()->getPointerAddressSpace(); @@ -489,9 +485,9 @@ bool RISCVTargetLowering::lowerInterleavedVPLoad( auto *PtrTy = Load->getArgOperand(0)->getType(); auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + Value *EVL = + Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); Value *Return = nullptr; if (isa(VTy)) { @@ -596,9 +592,9 @@ bool RISCVTargetLowering::lowerInterleavedVPStore( auto *PtrTy = Store->getArgOperand(1)->getType(); auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen()); - Value *EVL = Builder.CreateZExt( - Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)), - XLenTy); + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + Value 
*EVL = + Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); if (isa(VTy)) { SmallVector Operands(InterleaveOperands); diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 27ecbe56bda42..8cfa237858aca 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -18,7 +18,7 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg2e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) %t0 = extractvalue { , } %deinterleaved.results, 0 @@ -31,30 +31,18 @@ define {, } @load_factor2_v2(ptr %ptr, i32 % define {, , } @load_factor3_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor3_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 1 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: lui a2, 699051 -; RV32-NEXT: addi a2, a2, -1365 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg3e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: load_factor3_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: lui a2, 699051 -; RV64-NEXT: addi a2, a2, -1365 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: mulhu a1, a1, a2 -; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg3e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 3 + %rvl = mul nuw i32 %evl, 3 %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , , } @llvm.vector.deinterleave3( %wide.masked.load) %t0 = extractvalue { , , } 
%deinterleaved.results, 0 @@ -69,12 +57,6 @@ define {, , } @load_factor define {, } @load_factor3_partial(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor3_partial: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 1 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: lui a2, 699051 -; RV32-NEXT: addi a2, a2, -1365 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg3e32.v v7, (a0) ; RV32-NEXT: vmv1r.v v8, v7 @@ -82,19 +64,13 @@ define {, } @load_factor3_partial(ptr %ptr, ; ; RV64-LABEL: load_factor3_partial: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: lui a2, 699051 -; RV64-NEXT: addi a2, a2, -1365 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: mulhu a1, a1, a2 -; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg3e32.v v7, (a0) ; RV64-NEXT: vmv1r.v v8, v7 ; RV64-NEXT: ret - %rvl = mul i32 %evl, 3 + %rvl = mul nuw i32 %evl, 3 %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , , } @llvm.vector.deinterleave3( %wide.masked.load) %t0 = extractvalue { , , } %deinterleaved.results, 0 @@ -111,12 +87,6 @@ define {, } @load_factor3_no_extract(ptr %pt ; RV32-NEXT: li a2, 12 ; RV32-NEXT: beq a1, a2, .LBB3_2 ; RV32-NEXT: # %bb.1: # %bb0 -; RV32-NEXT: slli a2, a1, 1 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: lui a2, 699051 -; RV32-NEXT: addi a2, a2, -1365 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg3e32.v v7, (a0) ; RV32-NEXT: j .LBB3_3 @@ -134,14 +104,8 @@ define {, } @load_factor3_no_extract(ptr %pt ; RV64-NEXT: li a3, 12 ; RV64-NEXT: beq a2, a3, .LBB3_2 ; RV64-NEXT: # %bb.1: # %bb0 -; RV64-NEXT: slli a2, a1, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: lui a2, 699051 -; RV64-NEXT: addi a2, a2, -1365 ; RV64-NEXT: slli a1, a1, 32 -; 
RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: mulhu a1, a1, a2 -; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg3e32.v v7, (a0) ; RV64-NEXT: j .LBB3_3 @@ -156,7 +120,7 @@ define {, } @load_factor3_no_extract(ptr %pt br i1 %p, label %bb0, label %bb1 bb0: - %rvl.0 = mul i32 %evl, 3 + %rvl.0 = mul nuw i32 %evl, 3 %wide.load.0 = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl.0) %deinterleaved.results.0 = call { , , } @llvm.vector.deinterleave3( %wide.load.0) br label %merge @@ -191,7 +155,7 @@ define {, , , @llvm.vp.load.nxv8i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d = call { , , , } @llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) %t0 = extractvalue { , , , } %d, 0 @@ -209,30 +173,18 @@ define {, , , , , , , } @load_factor5_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor5_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 2 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: lui a2, 838861 -; RV32-NEXT: addi a2, a2, -819 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg5e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: load_factor5_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 2 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: lui a2, 838861 -; RV64-NEXT: addi a2, a2, -819 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: mulhu a1, a1, a2 -; RV64-NEXT: srli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg5e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 5 + %rvl = mul nuw i32 %evl, 5 %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , , , , } @llvm.vector.deinterleave5( %wide.masked.load) %t0 = extractvalue { , , , , } %deinterleaved.results, 0 @@ -251,37 +203,18 @@ define {, , , , , , , , , } @load_factor7_v2(ptr %ptr, i32 %evl) { ; RV32-LABEL: load_factor7_v2: ; RV32: # %bb.0: 
-; RV32-NEXT: slli a2, a1, 3 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: lui a1, 149797 -; RV32-NEXT: addi a1, a1, -1755 -; RV32-NEXT: mulhu a1, a2, a1 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: srli a2, a2, 1 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV32-NEXT: vlseg7e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: load_factor7_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: lui a3, 149797 -; RV64-NEXT: subw a2, a2, a1 -; RV64-NEXT: addi a1, a3, -1755 -; RV64-NEXT: slli a3, a2, 32 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: mulhu a1, a3, a1 ; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: subw a2, a2, a1 -; RV64-NEXT: srliw a2, a2, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma ; RV64-NEXT: vlseg7e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 7 + %rvl = mul nuw i32 %evl, 7 %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) %deinterleaved.results = call { , , , , , , } @llvm.vector.deinterleave7( %wide.masked.load) %t0 = extractvalue { , , , , , , } %deinterleaved.results, 0 @@ -317,7 +250,7 @@ define {, , , @llvm.vp.load.nxv16i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) %d = call { , , , , , , , } @llvm.vector.deinterleave8.nxv16i32( %wide.masked.load) %t0 = extractvalue { , , , , , , , } %d, 0 @@ -356,7 +289,7 @@ define void @store_factor2_v2( %v0, %v1, pt ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg2e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %interleaved.vec = call @llvm.vector.interleave2.nxv2i32( %v0, %v1) call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -365,30 +298,18 @@ define void @store_factor2_v2( %v0, %v1, pt define void @store_factor3_v2( %v0, %v1, %v2, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor3_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 1 -; RV32-NEXT: add a1, 
a2, a1 -; RV32-NEXT: lui a2, 699051 -; RV32-NEXT: addi a2, a2, -1365 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: srli a1, a1, 1 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vsseg3e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: store_factor3_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: lui a2, 699051 -; RV64-NEXT: addi a2, a2, -1365 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: mulhu a1, a1, a2 -; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg3e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 3 + %rvl = mul nuw i32 %evl, 3 %interleaved.vec = call @llvm.vector.interleave3( %v0, %v1, %v2) call void @llvm.vp.store( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -414,7 +335,7 @@ define void @store_factor4_v2( %v0, %v1, pt ; RV64-NEXT: vmv1r.v v11, v9 ; RV64-NEXT: vsseg4e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 8 + %rvl = mul nuw i32 %evl, 8 %interleaved.vec = call @llvm.vector.interleave4.nxv4i32( %v0, %v1, %v0, %v1) call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -423,30 +344,18 @@ define void @store_factor4_v2( %v0, %v1, pt define void @store_factor5_v2( %v0, %v1, %v2, %v3, %v4, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor5_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 2 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: lui a2, 838861 -; RV32-NEXT: addi a2, a2, -819 -; RV32-NEXT: mulhu a1, a1, a2 -; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vsseg5e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: store_factor5_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 2 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: lui a2, 838861 -; RV64-NEXT: addi a2, a2, -819 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: slli a2, a2, 32 -; RV64-NEXT: mulhu a1, a1, a2 -; RV64-NEXT: srli a1, a1, 34 +; 
RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg5e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 5 + %rvl = mul nuw i32 %evl, 5 %interleaved.vec = call @llvm.vector.interleave5( %v0, %v1, %v2, %v3, %v4) call void @llvm.vp.store( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -455,37 +364,18 @@ define void @store_factor5_v2( %v0, %v1, %v0, %v1, %v2, %v3, %v4, %v5, %v6, ptr %ptr, i32 %evl) { ; RV32-LABEL: store_factor7_v2: ; RV32: # %bb.0: -; RV32-NEXT: slli a2, a1, 3 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: lui a1, 149797 -; RV32-NEXT: addi a1, a1, -1755 -; RV32-NEXT: mulhu a1, a2, a1 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: srli a2, a2, 1 -; RV32-NEXT: add a1, a2, a1 -; RV32-NEXT: srli a1, a1, 2 ; RV32-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV32-NEXT: vsseg7e32.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: store_factor7_v2: ; RV64: # %bb.0: -; RV64-NEXT: slli a2, a1, 3 -; RV64-NEXT: lui a3, 149797 -; RV64-NEXT: subw a2, a2, a1 -; RV64-NEXT: addi a1, a3, -1755 -; RV64-NEXT: slli a3, a2, 32 ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: mulhu a1, a3, a1 ; RV64-NEXT: srli a1, a1, 32 -; RV64-NEXT: subw a2, a2, a1 -; RV64-NEXT: srliw a2, a2, 1 -; RV64-NEXT: add a1, a2, a1 -; RV64-NEXT: srli a1, a1, 2 ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg7e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 7 + %rvl = mul nuw i32 %evl, 7 %interleaved.vec = call @llvm.vector.interleave7( %v0, %v1, %v2, %v3, %v4, %v5, %v6) call void @llvm.vp.store( %interleaved.vec, ptr %ptr, splat (i1 true), i32 %rvl) ret void @@ -519,7 +409,7 @@ define void @store_factor8_v2( %v0, %v1, pt ; RV64-NEXT: vmv1r.v v15, v9 ; RV64-NEXT: vsseg8e32.v v8, (a0) ; RV64-NEXT: ret - %rvl = mul i32 %evl, 8 + %rvl = mul nuw i32 %evl, 8 %interleaved.vec = call @llvm.vector.interleave8.nxv8i32( %v0, %v1, %v0, %v1, %v0, %v1, %v0, %v1) call void @llvm.vp.store.nxv8i32.p0( %interleaved.vec, ptr %ptr, splat (i1 true), i32 
%rvl) ret void @@ -541,7 +431,7 @@ define {, } @masked_load_factor2_v2( @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -568,7 +458,7 @@ define {, , , @llvm.vector.interleave4.nxv8i1( %mask, %mask, %mask, %mask) %wide.masked.load = call @llvm.vp.load.nxv8i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %d = call { , , , } @llvm.vector.deinterleave4.nxv8i32( %wide.masked.load) @@ -604,7 +494,7 @@ define void @masked_store_factor2_v2( %mask, ; RV64-NEXT: vsetvli zero, a1, e32, mf2, ta, ma ; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv2i1( %mask, %mask) %interleaved.vec = tail call @llvm.vector.interleave2.nxv2i32( %v0, %v0) tail call void @llvm.vp.store.nxv2i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) @@ -629,7 +519,7 @@ define void @masked_load_store_factor2_v2_shared_mask( %mask, p ; RV64-NEXT: vlseg2e32.v v8, (a0), v0.t ; RV64-NEXT: vsseg2e32.v v8, (a0), v0.t ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -719,7 +609,7 @@ define i32 @masked_load_store_factor2_v2_shared_mask_extract( % ; RV64-NEXT: vsseg2e32.v v12, (a0), v0.t ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask, %mask) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } 
@llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) @@ -751,7 +641,7 @@ define void @masked_store_factor4_v2( %mask, ; RV64-NEXT: vmv1r.v v11, v9 ; RV64-NEXT: vsseg4e32.v v8, (a0), v0.t ; RV64-NEXT: ret - %rvl = mul i32 %evl, 4 + %rvl = mul nuw i32 %evl, 4 %interleaved.mask = call @llvm.vector.interleave4.nxv4i1( %mask, %mask, %mask, %mask) %interleaved.vec = call @llvm.vector.interleave4.nxv2i32( %v0, %v1, %v0, %v1) call void @llvm.vp.store.nxv4i32.p0( %interleaved.vec, ptr %ptr, %interleaved.mask, i32 %rvl) @@ -833,7 +723,7 @@ define {, } @not_same_mask( ; RV64-NEXT: vnsrl.wx v9, v10, a0 ; RV64-NEXT: vnsrl.wi v8, v10, 0 ; RV64-NEXT: ret - %rvl = mul i32 %evl, 2 + %rvl = mul nuw i32 %evl, 2 %interleaved.mask = tail call @llvm.vector.interleave2.nxv4i1( %mask0, %mask1) %wide.masked.load = tail call @llvm.vp.load.nxv4i32.p0(ptr %ptr, %interleaved.mask, i32 %rvl) %deinterleaved.results = tail call { , } @llvm.vector.deinterleave2.nxv16i32( %wide.masked.load) From 34b3ea367c4299ebd7c37edc7c748c9627ee66cb Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Wed, 16 Jul 2025 19:25:57 -0700 Subject: [PATCH 128/813] [scudo] Make release to OS test more specific. (#147852) The original version of ResidentMemorySize could be a little flaky. Replace the test with a version that verifies exactly how much of the map is resident. 
--- .../scudo/standalone/tests/common_test.cpp | 70 ++++++++++++------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/tests/common_test.cpp b/compiler-rt/lib/scudo/standalone/tests/common_test.cpp index e6ddbb00b843c..71f810e9d9724 100644 --- a/compiler-rt/lib/scudo/standalone/tests/common_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/common_test.cpp @@ -11,44 +11,60 @@ #include "common.h" #include "mem_map.h" + +#include +#include +#include + #include -#include +#include namespace scudo { -static uptr getResidentMemorySize() { - if (!SCUDO_LINUX) - UNREACHABLE("Not implemented!"); - uptr Size; - uptr Resident; - std::ifstream IFS("/proc/self/statm"); - IFS >> Size; - IFS >> Resident; - return Resident * getPageSizeCached(); +static void getResidentPages(void *BaseAddress, size_t TotalPages, + size_t *ResidentPages) { + std::vector Pages(TotalPages, 0); + ASSERT_EQ( + 0, mincore(BaseAddress, TotalPages * getPageSizeCached(), Pages.data())) + << strerror(errno); + *ResidentPages = 0; + for (unsigned char Value : Pages) { + if (Value & 1) { + ++*ResidentPages; + } + } } -// Fuchsia needs getResidentMemorySize implementation. +// Fuchsia needs getResidentPages implementation. TEST(ScudoCommonTest, SKIP_ON_FUCHSIA(ResidentMemorySize)) { - uptr OnStart = getResidentMemorySize(); - EXPECT_GT(OnStart, 0UL); - - const uptr Size = 1ull << 30; - const uptr Threshold = Size >> 3; + // Make sure to have the size of the map on a page boundary. 
+ const uptr PageSize = getPageSizeCached(); + const size_t NumPages = 1000; + const uptr SizeBytes = NumPages * PageSize; MemMapT MemMap; - ASSERT_TRUE(MemMap.map(/*Addr=*/0U, Size, "ResidentMemorySize")); + ASSERT_TRUE(MemMap.map(/*Addr=*/0U, SizeBytes, "ResidentMemorySize")); ASSERT_NE(MemMap.getBase(), 0U); - void *P = reinterpret_cast(MemMap.getBase()); - EXPECT_LT(getResidentMemorySize(), OnStart + Threshold); - - memset(P, 1, Size); - EXPECT_GT(getResidentMemorySize(), OnStart + Size - Threshold); - - MemMap.releasePagesToOS(MemMap.getBase(), Size); - EXPECT_LT(getResidentMemorySize(), OnStart + Threshold); - memset(P, 1, Size); - EXPECT_GT(getResidentMemorySize(), OnStart + Size - Threshold); + void *P = reinterpret_cast(MemMap.getBase()); + size_t ResidentPages; + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(0U, ResidentPages); + + // Make the entire map resident. + memset(P, 1, SizeBytes); + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(NumPages, ResidentPages); + + // Should release the memory to the kernel immediately. + MemMap.releasePagesToOS(MemMap.getBase(), SizeBytes); + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(0U, ResidentPages); + + // Make the entire map resident again. + memset(P, 1, SizeBytes); + getResidentPages(P, NumPages, &ResidentPages); + EXPECT_EQ(NumPages, ResidentPages); MemMap.unmap(); } From 1f1fd07c325f174be27d6f10a512882770a976a8 Mon Sep 17 00:00:00 2001 From: Ryan Buchner Date: Wed, 16 Jul 2025 19:42:41 -0700 Subject: [PATCH 129/813] [InstCombine] Optimize (select %x, op(%x), 0) to op(%x) for operations where op(0) == 0 (#147605) Currently this optimization only occurs for `mul`, but this generalizes that for any operation that has a fixed point of `0`. There is similar logic within `EarlyCSE` pass, but that is stricter in terms of `poison` propagation so will not optimize for many operations. 
Alive2 Proofs: `and`: https://alive2.llvm.org/ce/z/RraasX ; base-case https://alive2.llvm.org/ce/z/gzfFTX ; commuted-case https://alive2.llvm.org/ce/z/63XaoX ; compare against undef https://alive2.llvm.org/ce/z/MVRVNd ; select undef https://alive2.llvm.org/ce/z/2bsoYG ; vector https://alive2.llvm.org/ce/z/xByeX- ; vector compare against undef https://alive2.llvm.org/ce/z/zNdzmZ ; vector select undef `fshl`: https://alive2.llvm.org/ce/z/U3_PG3 ; base-case https://alive2.llvm.org/ce/z/BWCnxT ; compare against undef https://alive2.llvm.org/ce/z/8HGAE_ ; select undef ; vector times out `fshr`: https://alive2.llvm.org/ce/z/o6F47G ; base-case https://alive2.llvm.org/ce/z/fVnBXy ; compare against undef https://alive2.llvm.org/ce/z/suymYJ ; select undef ; vector times out `umin`: https://alive2.llvm.org/ce/z/GGMqf6 ; base-case https://alive2.llvm.org/ce/z/6cx5-k ; commuted-case https://alive2.llvm.org/ce/z/W5d9tz ; compare against undef https://alive2.llvm.org/ce/z/nKbaUn ; select undef https://alive2.llvm.org/ce/z/gxEGqc ; vector https://alive2.llvm.org/ce/z/_SDpi_ ; vector compare against undef `sdiv`: https://alive2.llvm.org/ce/z/5XGs3q `srem`: https://alive2.llvm.org/ce/z/vXAnQM `udiv`: https://alive2.llvm.org/ce/z/e6_8Ug `urem`: https://alive2.llvm.org/ce/z/VmM2SL `shl`: https://alive2.llvm.org/ce/z/aCZr3u ; Argument with range https://alive2.llvm.org/ce/z/YgDy8C ; Instruction with known bits https://alive2.llvm.org/ce/z/6pIxR6 ; Constant `lshr`: https://alive2.llvm.org/ce/z/WCCBej `ashr: https://alive2.llvm.org/ce/z/egV4TR --------- Co-authored-by: Ryan Buchner Co-authored-by: Yingwei Zheng --- .../InstCombine/InstCombineSelect.cpp | 39 +++- .../Transforms/InstCombine/icmp-select.ll | 7 +- .../InstCombine/select-fixed-zero.ll | 221 ++++++++++++++++++ llvm/test/Transforms/InstCombine/select.ll | 7 +- 4 files changed, 258 insertions(+), 16 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/select-fixed-zero.ll diff --git 
a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 73ba0f78e8053..eb4332fbc0959 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -878,7 +878,11 @@ static Instruction *foldSetClearBits(SelectInst &Sel, // is a vector consisting of 0 and undefs. If a constant compared with x // is a scalar undefined value or undefined vector then an expression // should be already folded into a constant. -static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) { +// +// This also holds all operations such that Op(0) == 0 +// e.g. Shl, Umin, etc +static Instruction *foldSelectZeroOrFixedOp(SelectInst &SI, + InstCombinerImpl &IC) { auto *CondVal = SI.getCondition(); auto *TrueVal = SI.getTrueValue(); auto *FalseVal = SI.getFalseValue(); @@ -900,10 +904,23 @@ static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) { // non-zero elements that are masked by undef elements in the compare // constant. 
auto *TrueValC = dyn_cast(TrueVal); - if (TrueValC == nullptr || - !match(FalseVal, m_c_Mul(m_Specific(X), m_Value(Y))) || - !isa(FalseVal)) + if (TrueValC == nullptr || !isa(FalseVal)) + return nullptr; + + bool FreezeY; + if (match(FalseVal, m_c_Mul(m_Specific(X), m_Value(Y))) || + match(FalseVal, m_c_And(m_Specific(X), m_Value(Y))) || + match(FalseVal, m_FShl(m_Specific(X), m_Specific(X), m_Value(Y))) || + match(FalseVal, m_FShr(m_Specific(X), m_Specific(X), m_Value(Y))) || + match(FalseVal, + m_c_Intrinsic(m_Specific(X), m_Value(Y)))) { + FreezeY = true; + } else if (match(FalseVal, m_IDiv(m_Specific(X), m_Value(Y))) || + match(FalseVal, m_IRem(m_Specific(X), m_Value(Y)))) { + FreezeY = false; + } else { return nullptr; + } auto *ZeroC = cast(cast(CondVal)->getOperand(1)); auto *MergedC = Constant::mergeUndefsWith(TrueValC, ZeroC); @@ -914,9 +931,15 @@ static Instruction *foldSelectZeroOrMul(SelectInst &SI, InstCombinerImpl &IC) { return nullptr; auto *FalseValI = cast(FalseVal); - auto *FrY = IC.InsertNewInstBefore(new FreezeInst(Y, Y->getName() + ".fr"), - FalseValI->getIterator()); - IC.replaceOperand(*FalseValI, FalseValI->getOperand(0) == Y ? 0 : 1, FrY); + if (FreezeY) { + auto *FrY = IC.InsertNewInstBefore(new FreezeInst(Y, Y->getName() + ".fr"), + FalseValI->getIterator()); + IC.replaceOperand(*FalseValI, + FalseValI->getOperand(0) == Y + ? 0 + : (FalseValI->getOperand(1) == Y ? 
1 : 2), + FrY); + } return IC.replaceInstUsesWith(SI, FalseValI); } @@ -4104,7 +4127,7 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { return Add; if (Instruction *Or = foldSetClearBits(SI, Builder)) return Or; - if (Instruction *Mul = foldSelectZeroOrMul(SI, *this)) + if (Instruction *Mul = foldSelectZeroOrFixedOp(SI, *this)) return Mul; // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z)) diff --git a/llvm/test/Transforms/InstCombine/icmp-select.ll b/llvm/test/Transforms/InstCombine/icmp-select.ll index a038731abbc48..c6c0ba385a6fd 100644 --- a/llvm/test/Transforms/InstCombine/icmp-select.ll +++ b/llvm/test/Transforms/InstCombine/icmp-select.ll @@ -248,10 +248,9 @@ define i1 @icmp_select_implied_cond_relational_off_by_one(i8 %x, i8 %y) { define i1 @umin_seq_comparison(i8 %x, i8 %y) { ; CHECK-LABEL: @umin_seq_comparison( -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[X:%.*]], 0 -; CHECK-NEXT: [[CMP21:%.*]] = icmp ule i8 [[X]], [[Y:%.*]] -; CHECK-NEXT: [[CMP2:%.*]] = select i1 [[CMP1]], i1 true, i1 [[CMP21]] -; CHECK-NEXT: ret i1 [[CMP2]] +; CHECK-NEXT: [[Y:%.*]] = freeze i8 [[Y1:%.*]] +; CHECK-NEXT: [[CMP21:%.*]] = icmp ule i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: ret i1 [[CMP21]] ; %min = call i8 @llvm.umin.i8(i8 %x, i8 %y) %cmp1 = icmp eq i8 %x, 0 diff --git a/llvm/test/Transforms/InstCombine/select-fixed-zero.ll b/llvm/test/Transforms/InstCombine/select-fixed-zero.ll new file mode 100644 index 0000000000000..7f326d158776b --- /dev/null +++ b/llvm/test/Transforms/InstCombine/select-fixed-zero.ll @@ -0,0 +1,221 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +; (select (icmp x, 0, eq), 0, (umin x, y)) -> (umin x, y) +define i64 @umin_select(i64 %a, i64 %b) { +; CHECK-LABEL: @umin_select( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[A:%.*]], i64 [[B_FR]]) +; CHECK-NEXT: ret i64 [[UMIN]] 
+; + %cond = icmp eq i64 %a, 0 + %umin = call i64 @llvm.umin.i64(i64 %a, i64 %b) + %select = select i1 %cond, i64 0, i64 %umin + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (mul x, y)) -> (mul x, y) +define i64 @mul_select(i64 %a, i64 %b) { +; CHECK-LABEL: @mul_select( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[A:%.*]], [[B_FR]] +; CHECK-NEXT: ret i64 [[MUL]] +; + %cond = icmp eq i64 %a, 0 + %mul = mul i64 %a, %b + %select = select i1 %cond, i64 0, i64 %mul + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (mul x, y)) -> (mul x, y) +define i64 @mul_select_comm(i64 %a, i64 %b) { +; CHECK-LABEL: @mul_select_comm( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[B_FR]], [[A:%.*]] +; CHECK-NEXT: ret i64 [[MUL]] +; + %cond = icmp eq i64 %a, 0 + %mul = mul i64 %b, %a + %select = select i1 %cond, i64 0, i64 %mul + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (shl x, y)) -> (shl x, y) +define i64 @shl_select(i64 %a, i64 %b) { +; CHECK-LABEL: @shl_select( +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[A:%.*]], 0 +; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[A]], [[B_FR:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], i64 0, i64 [[SHL]] +; CHECK-NEXT: ret i64 [[SELECT]] +; + %cond = icmp eq i64 %a, 0 + %shl = shl i64 %a, %b + %select = select i1 %cond, i64 0, i64 %shl + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (and x, y)) -> (and x, y) +define i64 @and_select(i64 %a, i64 %b) { +; CHECK-LABEL: @and_select( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i64 [[A:%.*]], [[B_FR]] +; CHECK-NEXT: ret i64 [[AND]] +; + %cond = icmp eq i64 %a, 0 + %and = and i64 %a, %b + %select = select i1 %cond, i64 0, i64 %and + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (and x, y)) -> (and x, y) +define i64 @and_select_comm(i64 %a, i64 %b) { +; CHECK-LABEL: @and_select_comm( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; 
CHECK-NEXT: [[AND:%.*]] = and i64 [[B_FR]], [[A:%.*]] +; CHECK-NEXT: ret i64 [[AND]] +; + %cond = icmp eq i64 %a, 0 + %and = and i64 %b, %a + %select = select i1 %cond, i64 0, i64 %and + ret i64 %select +} + +; (select (icmp x, 0, ne), (ashr x, y), 0) -> (ashr x, y) +define i64 @ashr_select(i64 %a, i64 %b) { +; CHECK-LABEL: @ashr_select( +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp eq i64 [[A:%.*]], 0 +; CHECK-NEXT: [[ASHR:%.*]] = ashr i64 [[A]], [[B_FR:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND_NOT]], i64 0, i64 [[ASHR]] +; CHECK-NEXT: ret i64 [[SELECT]] +; + %cond = icmp ne i64 0, %a + %ashr = ashr i64 %a, %b + %select = select i1 %cond, i64 %ashr, i64 0 + ret i64 %select +} + +; (select (icmp x, 0, ne), (lshr x, y), 0) -> (lshr x, y) +define i64 @lshr_select(i64 %a, i64 %b) { +; CHECK-LABEL: @lshr_select( +; CHECK-NEXT: [[COND_NOT:%.*]] = icmp eq i64 [[A:%.*]], 0 +; CHECK-NEXT: [[LSHR:%.*]] = lshr i64 [[A]], [[B_FR:%.*]] +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND_NOT]], i64 0, i64 [[LSHR]] +; CHECK-NEXT: ret i64 [[SELECT]] +; + %cond = icmp ne i64 0, %a + %lshr = lshr i64 %a, %b + %select = select i1 %cond, i64 %lshr, i64 0 + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, fshr(x, x, y)) -> fshr(x, x, y) +define i64 @fshr_select(i64 %a, i64 %b) { +; CHECK-LABEL: @fshr_select( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[FSHR:%.*]] = call i64 @llvm.fshr.i64(i64 [[A:%.*]], i64 [[A]], i64 [[B_FR]]) +; CHECK-NEXT: ret i64 [[FSHR]] +; + %cond = icmp eq i64 %a, 0 + %fshr = call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 %b) + %select = select i1 %cond, i64 0, i64 %fshr + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (fshl x, x, y)) -> (fshl x, x, y) +define i64 @fshl_select(i64 %a, i64 %b) { +; CHECK-LABEL: @fshl_select( +; CHECK-NEXT: [[B_FR:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[FSHL:%.*]] = call i64 @llvm.fshl.i64(i64 [[A:%.*]], i64 [[A]], i64 [[B_FR]]) +; CHECK-NEXT: ret i64 [[FSHL]] +; + %cond = icmp eq i64 %a, 0 + 
%fshl = call i64 @llvm.fshl.i64(i64 %a, i64 %a, i64 %b) + %select = select i1 %cond, i64 0, i64 %fshl + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (fshr x, z, y)) -> leave as is +define i64 @fshr_select_no_combine(i64 %a, i64 %b, i64 %c) { +; CHECK-LABEL: @fshr_select_no_combine( +; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[A:%.*]], 0 +; CHECK-NEXT: [[FSHR:%.*]] = call i64 @llvm.fshr.i64(i64 [[A]], i64 [[B:%.*]], i64 [[C:%.*]]) +; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], i64 0, i64 [[FSHR]] +; CHECK-NEXT: ret i64 [[SELECT]] +; + %cond = icmp eq i64 %a, 0 + %fshr = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %c) + %select = select i1 %cond, i64 0, i64 %fshr + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (sdiv x, y)) -> (sdiv x, y) +define i64 @sdiv_select(i64 %a, i64 %b) { +; CHECK-LABEL: @sdiv_select( +; CHECK-NEXT: [[DIV:%.*]] = sdiv i64 [[A:%.*]], [[B_FR:%.*]] +; CHECK-NEXT: ret i64 [[DIV]] +; + %cond = icmp eq i64 %a, 0 + %div = sdiv i64 %a, %b + %select = select i1 %cond, i64 0, i64 %div + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (udiv x, y)) -> (udiv x, y) +define i64 @udiv_select(i64 %a, i64 %b) { +; CHECK-LABEL: @udiv_select( +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[A:%.*]], [[B_FR:%.*]] +; CHECK-NEXT: ret i64 [[DIV]] +; + %cond = icmp eq i64 %a, 0 + %div = udiv i64 %a, %b + %select = select i1 %cond, i64 0, i64 %div + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (srem x, y)) -> (srem x, y) +define i64 @srem_select(i64 %a, i64 %b) { +; CHECK-LABEL: @srem_select( +; CHECK-NEXT: [[REM:%.*]] = srem i64 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret i64 [[REM]] +; + %cond = icmp eq i64 %a, 0 + %rem = srem i64 %a, %b + %select = select i1 %cond, i64 0, i64 %rem + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (urem x, y)) -> (urem x, y) +define i64 @urem_select(i64 %a, i64 %b) { +; CHECK-LABEL: @urem_select( +; CHECK-NEXT: [[REM:%.*]] = urem i64 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: ret i64 [[REM]] +; + %cond = icmp eq i64 %a, 
0 + %rem = urem i64 %a, %b + %select = select i1 %cond, i64 0, i64 %rem + ret i64 %select +} + +; (select (icmp x, 0, eq), 0, (icmp x, 0, slt)) -> (icmp x, 0, slt) +define i1 @icmp_slt_select(i64 %a) { +; CHECK-LABEL: @icmp_slt_select( +; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i64 [[A:%.*]], 0 +; CHECK-NEXT: ret i1 [[ICMP]] +; + %cond = icmp eq i64 %a, 0 + %icmp = icmp slt i64 %a, 0 + %select = select i1 %cond, i1 0, i1 %icmp + ret i1 %select +} + +; (select (icmp x, 0, eq), 0, (sub 0, x)) -> (sub 0, x) +define i64 @sub_select(i64 %a) { +; CHECK-LABEL: @sub_select( +; CHECK-NEXT: [[SUB:%.*]] = sub i64 0, [[A:%.*]] +; CHECK-NEXT: ret i64 [[SUB]] +; + %cond = icmp eq i64 %a, 0 + %sub = sub i64 0, %a + %select = select i1 %cond, i64 0, i64 %sub + ret i64 %select +} diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index ef5874ffd46ad..1f9ee83536016 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -893,10 +893,9 @@ define i32 @test56(i16 %x) { define i32 @test57(i32 %x, i32 %y) { ; CHECK-LABEL: @test57( -; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[X]], 0 -; CHECK-NEXT: [[DOTAND:%.*]] = select i1 [[TOBOOL]], i32 0, i32 [[AND]] -; CHECK-NEXT: ret i32 [[DOTAND]] +; CHECK-NEXT: [[Y:%.*]] = freeze i32 [[Y1:%.*]] +; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y]] +; CHECK-NEXT: ret i32 [[AND]] ; %and = and i32 %x, %y %tobool = icmp eq i32 %x, 0 From 283a62fa5b9f2b07fb74336dbce91f346801225f Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 16 Jul 2025 19:59:38 -0700 Subject: [PATCH 130/813] [AMDGPU] NFC: Decouple getRealRegPressure from current region (#149219) We're already accepting a RegionIdx for the LiveIns, also use this for the instruction iterators. Enables querying RP for other regions -- useful for function wide transformations (e.g. rematerialization, rewriting, etc). 
--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index fce8f36d45969..a6553083d722b 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -803,7 +803,8 @@ void GCNScheduleDAGMILive::schedule() { GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure(unsigned RegionIdx) const { GCNDownwardRPTracker RPTracker(*LIS); - RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); + RPTracker.advance(Regions[RegionIdx].first, Regions[RegionIdx].second, + &LiveIns[RegionIdx]); return RPTracker.moveMaxPressure(); } From 1c541aa9f9a2453324724bfb9d661bc672778d10 Mon Sep 17 00:00:00 2001 From: thetruestblue Date: Wed, 16 Jul 2025 20:17:37 -0700 Subject: [PATCH 131/813] [Apple][NFC] Update macOS aligned version for lit config (#143576) This updates the aligned version for version 26. Note: This change is for correctness only and has no functional impact currently. `get_macos_aligned_version` is currently only consumed when substituting flags based on min version. 
rdar://152851947 --- compiler-rt/test/lit.common.cfg.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index f5576ce0e013d..3c7323ecf4473 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -602,12 +602,12 @@ def get_ios_commands_dir(): def get_macos_aligned_version(macos_vers): platform = config.apple_platform - if platform == "osx": + macos_major, macos_minor = macos_vers + + if platform == "osx" or macos_major >= 26: return macos_vers - macos_major, macos_minor = macos_vers assert macos_major >= 10 - if macos_major == 10: # macOS 10.x major = macos_minor minor = 0 From 0f09f2cf00caf3e6c755197d7c897ce8e607db4d Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Wed, 16 Jul 2025 21:28:41 -0700 Subject: [PATCH 132/813] [Sanitize] fix crash in -fsanitize-annotate-debug-info (#149237) --- clang/lib/CodeGen/CGDebugInfo.cpp | 13 ++++++++----- .../null-sanitizer-debug-info-regression.cpp | 5 +++++ 2 files changed, 13 insertions(+), 5 deletions(-) create mode 100644 clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 75ee08a2bcfa6..446cf8d9e05c6 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -6480,24 +6480,27 @@ SanitizerOrdinalToCheckLabel(SanitizerKind::SanitizerOrdinal Ordinal) { llvm::DILocation *CodeGenFunction::SanitizerAnnotateDebugInfo( ArrayRef Ordinals, SanitizerHandler Handler) { + llvm::DILocation *CheckDebugLoc = Builder.getCurrentDebugLocation(); + auto *DI = getDebugInfo(); + if (!DI) + return CheckDebugLoc; + std::string Label; if (Ordinals.size() == 1) Label = SanitizerOrdinalToCheckLabel(Ordinals[0]); else Label = SanitizerHandlerToCheckLabel(Handler); - llvm::DILocation *CheckDI = Builder.getCurrentDebugLocation(); - for (auto Ord : Ordinals) { // TODO: deprecate 
ClArrayBoundsPseudoFn if (((ClArrayBoundsPseudoFn && Ord == SanitizerKind::SO_ArrayBounds) || CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo.has(Ord)) && - CheckDI) { - return getDebugInfo()->CreateSyntheticInlineAt(CheckDI, Label); + CheckDebugLoc) { + return DI->CreateSyntheticInlineAt(CheckDebugLoc, Label); } } - return CheckDI; + return CheckDebugLoc; } SanitizerDebugLocation::SanitizerDebugLocation( diff --git a/clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp b/clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp new file mode 100644 index 0000000000000..0b62f24177bbd --- /dev/null +++ b/clang/test/CodeGen/null-sanitizer-debug-info-regression.cpp @@ -0,0 +1,5 @@ +// RUN: %clangxx -g -fsanitize=null -fsanitize-trap=all -fsanitize-annotate-debug-info=all -O2 -std=c++17 -c -o /dev/null %s + +struct foo { + foo(int, long, const int & = int()); +} foo(0, 0); From e51ea1c7571c9d7698ab01317fd4ab3b250cb8f9 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Thu, 17 Jul 2025 09:24:36 +0800 Subject: [PATCH 133/813] [RISCV] Remove unneeded declarations in llvm/test/CodeGen/RISCV/xandesbfhcvt.ll. NFC. 
--- llvm/test/CodeGen/RISCV/xandesbfhcvt.ll | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll b/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll index e4524394b9991..854d0b659ea73 100644 --- a/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll +++ b/llvm/test/CodeGen/RISCV/xandesbfhcvt.ll @@ -4,8 +4,6 @@ ; RUN: llc -mtriple=riscv64 -mattr=+xandesbfhcvt -target-abi lp64f \ ; RUN: -verify-machineinstrs < %s | FileCheck %s -declare bfloat @llvm.riscv.nds.fcvt.bf16.s(float) - define float @fcvt_s_bf16(bfloat %a) nounwind { ; CHECK-LABEL: fcvt_s_bf16: ; CHECK: # %bb.0: @@ -15,8 +13,6 @@ define float @fcvt_s_bf16(bfloat %a) nounwind { ret float %1 } -declare float @llvm.riscv.nds.fcvt.s.bf16(bfloat) - define bfloat @fcvt_bf16_s(float %a) nounwind { ; CHECK-LABEL: fcvt_bf16_s: ; CHECK: # %bb.0: From 9753ea83dd34bfb169fbed3995cc79e621ca6963 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 16 Jul 2025 22:53:18 -0700 Subject: [PATCH 134/813] MC,test: Migrate away from the .reloc constant hack The initial .reloc support for MIPS incorrectly interpreted .reloc 0 as .reloc .+0 . I was misled when porting .reloc to other targets in 2019 and 2020. Many PRINT: prefixes are unnecessary. The MCAsmStreamer implementation is generic, and it is unnecessary to test too many variants. 
--- .../MC/AArch64/directives-case_insensitive.s | 4 +- llvm/test/MC/AArch64/reloc-directive-err.s | 4 +- llvm/test/MC/AArch64/reloc-directive.s | 44 ++++----- llvm/test/MC/AMDGPU/reloc-directive.s | 58 ++++------- llvm/test/MC/ARM/reloc-directive-err.s | 4 +- llvm/test/MC/ARM/reloc-directive.s | 35 +++---- llvm/test/MC/AVR/reloc-directive-err.s | 8 +- llvm/test/MC/AVR/reloc-directive.s | 29 +++--- .../Relocations/reloc-directive-err.s | 4 +- .../LoongArch/Relocations/reloc-directive.s | 32 +++---- llvm/test/MC/Mips/reloc-directive-bad.s | 4 +- llvm/test/MC/Mips/reloc-directive.s | 95 ++++++++++--------- llvm/test/MC/PowerPC/ppc32-reloc-directive.s | 38 ++++---- llvm/test/MC/PowerPC/ppc64-reloc-directive.s | 42 ++++---- llvm/test/MC/RISCV/reloc-directive-err.s | 4 +- llvm/test/MC/RISCV/reloc-directive.s | 36 +++---- .../MC/Sparc/Relocations/reloc-directive.s | 32 +++---- llvm/test/MC/SystemZ/reloc-directive.s | 42 +++----- llvm/test/MC/X86/reloc-directive-elf-32.s | 33 +++---- llvm/test/MC/X86/reloc-directive-elf-64.s | 39 +++----- llvm/test/MC/X86/reloc-directive.s | 10 +- 21 files changed, 254 insertions(+), 343 deletions(-) diff --git a/llvm/test/MC/AArch64/directives-case_insensitive.s b/llvm/test/MC/AArch64/directives-case_insensitive.s index 35a90a1bffea8..c2bdec73e349e 100644 --- a/llvm/test/MC/AArch64/directives-case_insensitive.s +++ b/llvm/test/MC/AArch64/directives-case_insensitive.s @@ -15,8 +15,8 @@ tlbi vmalle1os .INST 0x5e104020 // CHECK: .inst 0x5e104020 -.RELOC 0, R_AARCH64_NONE, 8 -// CHECK: .reloc 0, R_AARCH64_NONE, 8 +.RELOC ., R_AARCH64_NONE, 8 +// CHECK: .reloc {{.*}}, R_AARCH64_NONE, 8 .HWORD 0x1234 // CHECK: .hword 4660 diff --git a/llvm/test/MC/AArch64/reloc-directive-err.s b/llvm/test/MC/AArch64/reloc-directive-err.s index 6eec2ae10c0a6..883dd2a06f28e 100644 --- a/llvm/test/MC/AArch64/reloc-directive-err.s +++ b/llvm/test/MC/AArch64/reloc-directive-err.s @@ -1,6 +1,6 @@ # RUN: llvm-mc -triple=aarch64 %s 2>&1 | FileCheck 
--check-prefix=PRINT %s # RUN: not llvm-mc -filetype=obj -triple=aarch64 %s -o /dev/null 2>&1 | FileCheck %s -# PRINT: .reloc 0, R_INVALID, 0 +# PRINT: .reloc {{.*}}, R_INVALID, 0 # CHECK: {{.*}}.s:[[# @LINE+1]]:11: error: unknown relocation name -.reloc 0, R_INVALID, 0 +.reloc ., R_INVALID, 0 diff --git a/llvm/test/MC/AArch64/reloc-directive.s b/llvm/test/MC/AArch64/reloc-directive.s index 09b0f0d3cb9d3..a502201fb9291 100644 --- a/llvm/test/MC/AArch64/reloc-directive.s +++ b/llvm/test/MC/AArch64/reloc-directive.s @@ -2,32 +2,32 @@ # RUN: llvm-mc -filetype=obj -triple=aarch64-linux-musl %s | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 8, R_AARCH64_NONE, .data -# PRINT: .reloc 4, R_AARCH64_NONE, foo+4 -# PRINT: .reloc 0, R_AARCH64_NONE, 8 -# PRINT: .reloc 0, R_AARCH64_ABS64, .data+2 -# PRINT: .reloc 0, R_AARCH64_TLSDESC, foo+3 -# PRINT: .reloc 0, R_AARCH64_IRELATIVE, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT: .reloc 0, BFD_RELOC_16, 9 -# PRINT: .reloc 0, BFD_RELOC_32, 9 -# PRINT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+8, R_AARCH64_NONE, .data +# PRINT: .reloc {{.*}}+4, R_AARCH64_NONE, foo+4 +# PRINT: .reloc {{.*}}+0, R_AARCH64_NONE, 8 +# PRINT: .reloc {{.*}}+0, R_AARCH64_ABS64, .data+2 +# PRINT: .reloc {{.*}}+0, R_AARCH64_TLSDESC, foo+3 +# PRINT: .reloc {{.*}}+0, R_AARCH64_IRELATIVE, 5 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_NONE, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_16, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_32, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_64, 9 .text + .reloc .+8, R_AARCH64_NONE, .data + .reloc .+4, R_AARCH64_NONE, foo+4 + .reloc .+0, R_AARCH64_NONE, 8 + + .reloc .+0, R_AARCH64_ABS64, .data+2 + .reloc .+0, R_AARCH64_TLSDESC, foo+3 + .reloc .+0, R_AARCH64_IRELATIVE, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 ret nop nop - .reloc 8, R_AARCH64_NONE, .data - .reloc 4, R_AARCH64_NONE, foo+4 - .reloc 0, R_AARCH64_NONE, 8 - - .reloc 0, R_AARCH64_ABS64, 
.data+2 - .reloc 0, R_AARCH64_TLSDESC, foo+3 - .reloc 0, R_AARCH64_IRELATIVE, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 .data .globl foo diff --git a/llvm/test/MC/AMDGPU/reloc-directive.s b/llvm/test/MC/AMDGPU/reloc-directive.s index 99e972b3105a2..5d55fde4945ef 100644 --- a/llvm/test/MC/AMDGPU/reloc-directive.s +++ b/llvm/test/MC/AMDGPU/reloc-directive.s @@ -3,25 +3,7 @@ # RUN: llvm-mc -filetype=obj -triple=amdgcn--amdhsa %s -o %t # RUN: llvm-readobj -r %t | FileCheck %s -# PRINT: .reloc 2, R_AMDGPU_NONE, .data -# PRINT-NEXT: .reloc 1, R_AMDGPU_NONE, foo+4 -# PRINT-NEXT: .reloc 0, R_AMDGPU_NONE, 8 -# PRINT-NEXT: .reloc 0, R_AMDGPU_ABS32_LO, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_ABS32_HI, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_ABS64, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_REL32, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_REL64, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_ABS32, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_GOTPCREL, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_GOTPCREL32_LO, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_GOTPCREL32_HI, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_REL32_LO, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_REL32_HI, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_RELATIVE64, .data -# PRINT-NEXT: .reloc 0, R_AMDGPU_REL16, .data -# PRINT-NEXT: .reloc 0, BFD_RELOC_NONE, .data -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, .data -# PRINT-NEXT: .reloc 0, BFD_RELOC_64, .data +# PRINT: .reloc {{.*}}+2, R_AMDGPU_NONE, .data # CHECK: 0x2 R_AMDGPU_NONE .data # CHECK-NEXT: 0x1 R_AMDGPU_NONE foo 0x4 @@ -44,27 +26,27 @@ # CHECK-NEXT: 0x0 R_AMDGPU_ABS64 .data .text + .reloc .+2, R_AMDGPU_NONE, .data + .reloc .+1, R_AMDGPU_NONE, foo+4 + .reloc .+0, R_AMDGPU_NONE, 8 + .reloc .+0, R_AMDGPU_ABS32_LO, .data + .reloc .+0, R_AMDGPU_ABS32_HI, .data + .reloc .+0, R_AMDGPU_ABS64, .data + .reloc .+0, R_AMDGPU_REL32, .data + .reloc .+0, R_AMDGPU_REL64, .data + .reloc .+0, R_AMDGPU_ABS32, .data + .reloc .+0, 
R_AMDGPU_GOTPCREL, .data + .reloc .+0, R_AMDGPU_GOTPCREL32_LO, .data + .reloc .+0, R_AMDGPU_GOTPCREL32_HI, .data + .reloc .+0, R_AMDGPU_REL32_LO, .data + .reloc .+0, R_AMDGPU_REL32_HI, .data + .reloc .+0, R_AMDGPU_RELATIVE64, .data + .reloc .+0, R_AMDGPU_REL16, .data + .reloc .+0, BFD_RELOC_NONE, .data + .reloc .+0, BFD_RELOC_32, .data + .reloc .+0, BFD_RELOC_64, .data s_nop 0 s_nop 0 - .reloc 2, R_AMDGPU_NONE, .data - .reloc 1, R_AMDGPU_NONE, foo+4 - .reloc 0, R_AMDGPU_NONE, 8 - .reloc 0, R_AMDGPU_ABS32_LO, .data - .reloc 0, R_AMDGPU_ABS32_HI, .data - .reloc 0, R_AMDGPU_ABS64, .data - .reloc 0, R_AMDGPU_REL32, .data - .reloc 0, R_AMDGPU_REL64, .data - .reloc 0, R_AMDGPU_ABS32, .data - .reloc 0, R_AMDGPU_GOTPCREL, .data - .reloc 0, R_AMDGPU_GOTPCREL32_LO, .data - .reloc 0, R_AMDGPU_GOTPCREL32_HI, .data - .reloc 0, R_AMDGPU_REL32_LO, .data - .reloc 0, R_AMDGPU_REL32_HI, .data - .reloc 0, R_AMDGPU_RELATIVE64, .data - .reloc 0, R_AMDGPU_REL16, .data - .reloc 0, BFD_RELOC_NONE, .data - .reloc 0, BFD_RELOC_32, .data - .reloc 0, BFD_RELOC_64, .data .data .globl foo diff --git a/llvm/test/MC/ARM/reloc-directive-err.s b/llvm/test/MC/ARM/reloc-directive-err.s index c291fd62d2ba9..113158c51c36b 100644 --- a/llvm/test/MC/ARM/reloc-directive-err.s +++ b/llvm/test/MC/ARM/reloc-directive-err.s @@ -1,6 +1,6 @@ # RUN: llvm-mc -triple=armv7 %s 2>&1 | FileCheck --check-prefix=PRINT %s # RUN: not llvm-mc -filetype=obj -triple=armv7 %s -o /dev/null 2>&1 | FileCheck %s -# PRINT: .reloc 0, R_INVALID, 0 +# PRINT: .reloc {{.*}}, R_INVALID, 0 # CHECK: {{.*}}.s:[[# @LINE+1]]:11: error: unknown relocation name -.reloc 0, R_INVALID, 0 +.reloc ., R_INVALID, 0 diff --git a/llvm/test/MC/ARM/reloc-directive.s b/llvm/test/MC/ARM/reloc-directive.s index 682f0e1185c72..6a3b2496cfc8d 100644 --- a/llvm/test/MC/ARM/reloc-directive.s +++ b/llvm/test/MC/ARM/reloc-directive.s @@ -10,21 +10,21 @@ # RUN: llvm-readelf -x .data %t | FileCheck --check-prefix=HEX %s .text + .reloc .+8, R_ARM_NONE, .data + 
.reloc .+4, R_ARM_NONE, foo+4 + .reloc .+0, R_ARM_NONE, 8 + + .reloc .+0, R_ARM_ALU_PC_G0, .data+2 + .reloc .+0, R_ARM_LDR_PC_G0, foo+3 + .reloc .+0, R_ARM_THM_ALU_PREL_11_0, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_8, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 bx lr nop nop - .reloc 8, R_ARM_NONE, .data - .reloc 4, R_ARM_NONE, foo+4 - .reloc 0, R_ARM_NONE, 8 - - .reloc 0, R_ARM_ALU_PC_G0, .data+2 - .reloc 0, R_ARM_LDR_PC_G0, foo+3 - .reloc 0, R_ARM_THM_ALU_PREL_11_0, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_8, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 .data .globl foo @@ -33,16 +33,7 @@ foo: .word 0 .word 0 -# PRINT: .reloc 8, R_ARM_NONE, .data -# PRINT: .reloc 4, R_ARM_NONE, foo+4 -# PRINT: .reloc 0, R_ARM_NONE, 8 -# PRINT: .reloc 0, R_ARM_ALU_PC_G0, .data+2 -# PRINT: .reloc 0, R_ARM_LDR_PC_G0, foo+3 -# PRINT: .reloc 0, R_ARM_THM_ALU_PREL_11_0, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 +# PRINT: .reloc {{.*}}+8, R_ARM_NONE, .data # ARM relocations use the Elf32_Rel format. Addends are neither stored in the # relocation entries nor applied in the referenced locations. 
diff --git a/llvm/test/MC/AVR/reloc-directive-err.s b/llvm/test/MC/AVR/reloc-directive-err.s index d660bde487e3a..8494a66c6b727 100644 --- a/llvm/test/MC/AVR/reloc-directive-err.s +++ b/llvm/test/MC/AVR/reloc-directive-err.s @@ -1,10 +1,10 @@ # RUN: llvm-mc -triple=avr %s 2>&1 | FileCheck --check-prefix=PRINT %s # RUN: not llvm-mc -filetype=obj -triple=avr %s -o /dev/null 2>&1 | FileCheck %s -# PRINT: .reloc 0, R_INVALID, 0 +# PRINT: .reloc {{.*}}, R_INVALID, 0 # CHECK: {{.*}}.s:[[#@LINE+1]]:11: error: unknown relocation name -.reloc 0, R_INVALID, 0 +.reloc ., R_INVALID, 0 -# PRINT: .reloc 0, BFD_RELOC_64, 0 +# PRINT: .reloc {{.*}}, BFD_RELOC_64, 0 # CHECK: {{.*}}.s:[[#@LINE+1]]:11: error: unknown relocation name -.reloc 0, BFD_RELOC_64, 0 +.reloc ., BFD_RELOC_64, 0 diff --git a/llvm/test/MC/AVR/reloc-directive.s b/llvm/test/MC/AVR/reloc-directive.s index 60913172502cf..9940842171eef 100644 --- a/llvm/test/MC/AVR/reloc-directive.s +++ b/llvm/test/MC/AVR/reloc-directive.s @@ -1,14 +1,7 @@ # RUN: llvm-mc -triple=avr %s | FileCheck --check-prefix=PRINT %s # RUN: llvm-mc -filetype=obj -triple=avr %s | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 4, R_AVR_NONE, .data -# PRINT-NEXT: .reloc 2, R_AVR_NONE, foo+4 -# PRINT-NEXT: .reloc 0, R_AVR_NONE, 8 -# PRINT: .reloc 0, R_AVR_32, .data+2 -# PRINT-NEXT: .reloc 0, R_AVR_16, foo+3 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 +# PRINT: .reloc {{.*}}+4, R_AVR_NONE, .data # CHECK: Section ({{.*}}) .rela.text { # CHECK-NEXT: 0x4 R_AVR_NONE .data 0x0 @@ -22,19 +15,19 @@ # CHECK-NEXT: } .text + .reloc .+4, R_AVR_NONE, .data + .reloc .+2, R_AVR_NONE, foo+4 + .reloc .+0, R_AVR_NONE, 8 + + .reloc .+0, R_AVR_32, .data+2 + .reloc .+0, R_AVR_16, foo+3 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 ret nop nop - .reloc 4, R_AVR_NONE, .data - .reloc 2, R_AVR_NONE, foo+4 - .reloc 0, R_AVR_NONE, 8 - - .reloc 0, 
R_AVR_32, .data+2 - .reloc 0, R_AVR_16, foo+3 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 .data .globl foo diff --git a/llvm/test/MC/LoongArch/Relocations/reloc-directive-err.s b/llvm/test/MC/LoongArch/Relocations/reloc-directive-err.s index 60fd145564ae5..7658865b0f083 100644 --- a/llvm/test/MC/LoongArch/Relocations/reloc-directive-err.s +++ b/llvm/test/MC/LoongArch/Relocations/reloc-directive-err.s @@ -2,6 +2,6 @@ # RUN: not llvm-mc --filetype=obj --triple=loongarch64 %s -o /dev/null 2>&1 \ # RUN: | FileCheck %s -# PRINT: .reloc 0, R_INVALID, 0 +# PRINT: .reloc {{.*}}, R_INVALID, 0 # CHECK: {{.*}}.s:[[# @LINE+1]]:11: error: unknown relocation name -.reloc 0, R_INVALID, 0 +.reloc ., R_INVALID, 0 diff --git a/llvm/test/MC/LoongArch/Relocations/reloc-directive.s b/llvm/test/MC/LoongArch/Relocations/reloc-directive.s index f900f17c06c39..2fc0c816d7057 100644 --- a/llvm/test/MC/LoongArch/Relocations/reloc-directive.s +++ b/llvm/test/MC/LoongArch/Relocations/reloc-directive.s @@ -2,31 +2,23 @@ # RUN: llvm-mc --filetype=obj --triple=loongarch64 %s \ # RUN: | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 8, R_LARCH_NONE, .data -# PRINT: .reloc 4, R_LARCH_NONE, foo+4 -# PRINT: .reloc 0, R_LARCH_NONE, 8 -# PRINT: .reloc 0, R_LARCH_32, .data+2 -# PRINT: .reloc 0, R_LARCH_TLS_DTPMOD32, foo+3 -# PRINT: .reloc 0, R_LARCH_IRELATIVE, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+8, R_LARCH_NONE, .data .text + .reloc .+8, R_LARCH_NONE, .data + .reloc .+4, R_LARCH_NONE, foo+4 + .reloc .+0, R_LARCH_NONE, 8 + + .reloc .+0, R_LARCH_32, .data+2 + .reloc .+0, R_LARCH_TLS_DTPMOD32, foo+3 + .reloc .+0, R_LARCH_IRELATIVE, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 ret nop nop - .reloc 8, R_LARCH_NONE, .data - .reloc 4, R_LARCH_NONE, foo+4 - .reloc 0, R_LARCH_NONE, 8 - - .reloc 0, R_LARCH_32, 
.data+2 - .reloc 0, R_LARCH_TLS_DTPMOD32, foo+3 - .reloc 0, R_LARCH_IRELATIVE, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 .data .globl foo diff --git a/llvm/test/MC/Mips/reloc-directive-bad.s b/llvm/test/MC/Mips/reloc-directive-bad.s index bb056b752fb9f..f09a73b962b01 100644 --- a/llvm/test/MC/Mips/reloc-directive-bad.s +++ b/llvm/test/MC/Mips/reloc-directive-bad.s @@ -2,6 +2,6 @@ # RUN: -target-abi=o32 2>&1 | FileCheck %s .text foo: - .reloc 0, R_MIPS_32, .text+.text # CHECK: :[[@LINE]]:23: error: expression must be relocatable - .reloc 0, 0, R_MIPS_32, .text # CHECK: :[[@LINE]]:12: error: expected relocation name + .reloc ., R_MIPS_32, .text+.text # CHECK: :[[@LINE]]:23: error: expression must be relocatable + .reloc ., 0, R_MIPS_32, .text # CHECK: :[[@LINE]]:12: error: expected relocation name nop diff --git a/llvm/test/MC/Mips/reloc-directive.s b/llvm/test/MC/Mips/reloc-directive.s index 2f699ec98a609..4f875687f33b7 100644 --- a/llvm/test/MC/Mips/reloc-directive.s +++ b/llvm/test/MC/Mips/reloc-directive.s @@ -15,78 +15,79 @@ # RUN: FileCheck -check-prefix=OBJ-N64 %s .text foo: - .reloc 4, R_MIPS_NONE, foo # ASM: .reloc 4, R_MIPS_NONE, foo - .reloc 0, R_MIPS_NONE, foo+4 # ASM: .reloc 0, R_MIPS_NONE, foo+4 - .reloc 8, R_MIPS_32, foo+8 # ASM: .reloc 8, R_MIPS_32, foo+8 +# ASM: .reloc {{.*}}+4, R_MIPS_NONE, foo + .reloc .+4, R_MIPS_NONE, foo + .reloc .+0, R_MIPS_NONE, foo+4 + .reloc .+8, R_MIPS_32, foo+8 nop nop nop - .reloc 12, R_MIPS_NONE # ASM: .reloc 12, R_MIPS_NONE{{$}} + .reloc ., R_MIPS_NONE nop - .reloc 16, R_MIPS_CALL_HI16, 4 # ASM: .reloc 16, R_MIPS_CALL_HI16, 4 + .reloc ., R_MIPS_CALL_HI16, 4 nop - .reloc 20, R_MIPS_CALL_LO16, 4 # ASM: .reloc 20, R_MIPS_CALL_LO16, 4 + .reloc ., R_MIPS_CALL_LO16, 4 nop - .reloc 24, R_MIPS_CALL16, 4 # ASM: .reloc 24, R_MIPS_CALL16, 4 + .reloc ., R_MIPS_CALL16, 4 nop - .reloc 28, R_MIPS_GOT16, 4 # ASM: .reloc 28, R_MIPS_GOT16, 4 + .reloc ., R_MIPS_GOT16, 4 nop - .reloc 32, 
R_MIPS_GOT_PAGE, 4 # ASM: .reloc 32, R_MIPS_GOT_PAGE, 4 + .reloc ., R_MIPS_GOT_PAGE, 4 nop - .reloc 36, R_MIPS_GOT_OFST, 4 # ASM: .reloc 36, R_MIPS_GOT_OFST, 4 + .reloc ., R_MIPS_GOT_OFST, 4 nop - .reloc 40, R_MIPS_GOT_DISP, 4 # ASM: .reloc 40, R_MIPS_GOT_DISP, 4 + .reloc ., R_MIPS_GOT_DISP, 4 nop - .reloc 44, R_MIPS_GOT_HI16, 4 # ASM: .reloc 44, R_MIPS_GOT_HI16, 4 + .reloc ., R_MIPS_GOT_HI16, 4 nop - .reloc 48, R_MIPS_GOT_LO16, 4 # ASM: .reloc 48, R_MIPS_GOT_LO16, 4 + .reloc ., R_MIPS_GOT_LO16, 4 nop - .reloc 52, R_MIPS_TLS_GOTTPREL, 4 # ASM: .reloc 52, R_MIPS_TLS_GOTTPREL, 4 + .reloc ., R_MIPS_TLS_GOTTPREL, 4 nop - .reloc 56, R_MIPS_TLS_DTPREL_HI16, 4 # ASM: .reloc 56, R_MIPS_TLS_DTPREL_HI16, 4 + .reloc ., R_MIPS_TLS_DTPREL_HI16, 4 nop - .reloc 60, R_MIPS_TLS_DTPREL_LO16, 4 # ASM: .reloc 60, R_MIPS_TLS_DTPREL_LO16, 4 + .reloc ., R_MIPS_TLS_DTPREL_LO16, 4 nop - .reloc 64, R_MIPS_TLS_GD, 4 # ASM: .reloc 64, R_MIPS_TLS_GD, 4 + .reloc ., R_MIPS_TLS_GD, 4 nop - .reloc 68, R_MIPS_TLS_LDM, 4 # ASM: .reloc 68, R_MIPS_TLS_LDM, 4 + .reloc ., R_MIPS_TLS_LDM, 4 nop - .reloc 72, R_MIPS_TLS_TPREL_HI16, 4 # ASM: .reloc 72, R_MIPS_TLS_TPREL_HI16, 4 + .reloc ., R_MIPS_TLS_TPREL_HI16, 4 nop - .reloc 76, R_MIPS_TLS_TPREL_LO16, 4 # ASM: .reloc 76, R_MIPS_TLS_TPREL_LO16, 4 + .reloc ., R_MIPS_TLS_TPREL_LO16, 4 nop - .reloc 80, R_MICROMIPS_CALL16, 4 # ASM: .reloc 80, R_MICROMIPS_CALL16, 4 + .reloc ., R_MICROMIPS_CALL16, 4 nop - .reloc 84, R_MICROMIPS_GOT_DISP, 4 # ASM: .reloc 84, R_MICROMIPS_GOT_DISP, 4 + .reloc ., R_MICROMIPS_GOT_DISP, 4 nop - .reloc 88, R_MICROMIPS_GOT_PAGE, 4 # ASM: .reloc 88, R_MICROMIPS_GOT_PAGE, 4 + .reloc ., R_MICROMIPS_GOT_PAGE, 4 nop - .reloc 92, R_MICROMIPS_GOT_OFST, 4 # ASM: .reloc 92, R_MICROMIPS_GOT_OFST, 4 + .reloc ., R_MICROMIPS_GOT_OFST, 4 nop - .reloc 96, R_MICROMIPS_GOT16, 4 # ASM: .reloc 96, R_MICROMIPS_GOT16, 4 + .reloc ., R_MICROMIPS_GOT16, 4 nop - .reloc 100, R_MICROMIPS_TLS_GOTTPREL, 4 # ASM: .reloc 100, R_MICROMIPS_TLS_GOTTPREL, 4 + .reloc ., 
R_MICROMIPS_TLS_GOTTPREL, 4 nop - .reloc 104, R_MICROMIPS_TLS_DTPREL_HI16, 4 # ASM: .reloc 104, R_MICROMIPS_TLS_DTPREL_HI16, 4 + .reloc ., R_MICROMIPS_TLS_DTPREL_HI16, 4 nop - .reloc 108, R_MICROMIPS_TLS_DTPREL_LO16, 4 # ASM: .reloc 108, R_MICROMIPS_TLS_DTPREL_LO16, 4 + .reloc ., R_MICROMIPS_TLS_DTPREL_LO16, 4 nop - .reloc 112, R_MICROMIPS_TLS_GD, 4 # ASM: .reloc 112, R_MICROMIPS_TLS_GD, 4 + .reloc ., R_MICROMIPS_TLS_GD, 4 nop - .reloc 116, R_MICROMIPS_TLS_LDM, 4 # ASM: .reloc 116, R_MICROMIPS_TLS_LDM, 4 + .reloc ., R_MICROMIPS_TLS_LDM, 4 nop - .reloc 120, R_MICROMIPS_TLS_TPREL_HI16, 4 # ASM: .reloc 120, R_MICROMIPS_TLS_TPREL_HI16, 4 + .reloc ., R_MICROMIPS_TLS_TPREL_HI16, 4 nop - .reloc 124, R_MICROMIPS_TLS_TPREL_LO16, 4 # ASM: .reloc 124, R_MICROMIPS_TLS_TPREL_LO16, 4 + .reloc ., R_MICROMIPS_TLS_TPREL_LO16, 4 nop - .reloc 128, R_MIPS_JALR, 4 # ASM: .reloc 128, R_MIPS_JALR, 4 + .reloc ., R_MIPS_JALR, 4 nop - .reloc 132, R_MICROMIPS_JALR, 4 # ASM: .reloc 132, R_MICROMIPS_JALR, 4 + .reloc ., R_MICROMIPS_JALR, 4 nop - .reloc 136, BFD_RELOC_NONE, 9 # ASM: .reloc 136, BFD_RELOC_NONE, 9 - .reloc 137, BFD_RELOC_16, 9 # ASM: .reloc 137, BFD_RELOC_16, 9 - .reloc 138, BFD_RELOC_32, 9 # ASM: .reloc 138, BFD_RELOC_32, 9 - .reloc 139, BFD_RELOC_64, 9 # ASM: .reloc 139, BFD_RELOC_64, 9 + .reloc ., BFD_RELOC_NONE, 9 + .reloc ., BFD_RELOC_16, 9 + .reloc ., BFD_RELOC_32, 9 + .reloc ., BFD_RELOC_64, 9 nop # OBJ-O32-LABEL: Name: .text @@ -134,9 +135,9 @@ foo: # OBJ-O32-NEXT: 0x80 R_MIPS_JALR - # OBJ-O32-NEXT: 0x84 R_MICROMIPS_JALR - # OBJ-O32-NEXT: 0x88 R_MIPS_NONE - -# OBJ-O32-NEXT: 0x89 R_MIPS_16 - -# OBJ-O32-NEXT: 0x8A R_MIPS_32 - -# OBJ-O32-NEXT: 0x8B R_MIPS_64 - +# OBJ-O32-NEXT: 0x88 R_MIPS_16 - +# OBJ-O32-NEXT: 0x88 R_MIPS_32 - +# OBJ-O32-NEXT: 0x88 R_MIPS_64 - # OBJ-O32-NEXT: 0x1C R_MIPS_GOT16 - # OBJ-O32-NEXT: 0x60 R_MICROMIPS_GOT16 - @@ -188,9 +189,9 @@ foo: # OBJ-N32-NEXT: 0x80 R_MIPS_JALR - 0x4 # OBJ-N32-NEXT: 0x84 R_MICROMIPS_JALR - 0x4 # OBJ-N32-NEXT: 0x88 R_MIPS_NONE - 
0x9 -# OBJ-N32-NEXT: 0x89 R_MIPS_16 - 0x9 -# OBJ-N32-NEXT: 0x8A R_MIPS_32 - 0x9 -# OBJ-N32-NEXT: 0x8B R_MIPS_64 - 0x9 +# OBJ-N32-NEXT: 0x88 R_MIPS_16 - 0x9 +# OBJ-N32-NEXT: 0x88 R_MIPS_32 - 0x9 +# OBJ-N32-NEXT: 0x88 R_MIPS_64 - 0x9 # OBJ-N64-LABEL: Name: .text # OBJ-N64: 0000: 00000000 00000000 00000000 00000000 @@ -239,6 +240,6 @@ foo: # OBJ-N64-NEXT: 0x80 R_MIPS_JALR/R_MIPS_NONE/R_MIPS_NONE - 0x4 # OBJ-N64-NEXT: 0x84 R_MICROMIPS_JALR/R_MIPS_NONE/R_MIPS_NONE - 0x4 # OBJ-N64-NEXT: 0x88 R_MIPS_NONE/R_MIPS_NONE/R_MIPS_NONE - 0x9 -# OBJ-N64-NEXT: 0x89 R_MIPS_16/R_MIPS_NONE/R_MIPS_NONE - 0x9 -# OBJ-N64-NEXT: 0x8A R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE - 0x9 -# OBJ-N64-NEXT: 0x8B R_MIPS_64/R_MIPS_NONE/R_MIPS_NONE - 0x9 +# OBJ-N64-NEXT: 0x88 R_MIPS_16/R_MIPS_NONE/R_MIPS_NONE - 0x9 +# OBJ-N64-NEXT: 0x88 R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE - 0x9 +# OBJ-N64-NEXT: 0x88 R_MIPS_64/R_MIPS_NONE/R_MIPS_NONE - 0x9 diff --git a/llvm/test/MC/PowerPC/ppc32-reloc-directive.s b/llvm/test/MC/PowerPC/ppc32-reloc-directive.s index 3eb6c2964c85c..ca809a750a373 100644 --- a/llvm/test/MC/PowerPC/ppc32-reloc-directive.s +++ b/llvm/test/MC/PowerPC/ppc32-reloc-directive.s @@ -2,15 +2,15 @@ # RUN: llvm-mc -filetype=obj -triple=powerpc-linux-musl %s | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 8, R_PPC_NONE, .data -# PRINT: .reloc 4, R_PPC_NONE, foo+4 -# PRINT: .reloc 0, R_PPC_NONE, 8 -# PRINT: .reloc 0, R_PPC_ADDR32, .data+2 -# PRINT: .reloc 0, R_PPC_REL16_HI, foo+3 -# PRINT: .reloc 0, R_PPC_REL16_HA, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT: .reloc 0, BFD_RELOC_16, 9 -# PRINT: .reloc 0, BFD_RELOC_32, 9 +# PRINT: .reloc {{.*}}+8, R_PPC_NONE, .data +# PRINT: .reloc {{.*}}+4, R_PPC_NONE, foo+4 +# PRINT: .reloc {{.*}}+0, R_PPC_NONE, 8 +# PRINT: .reloc {{.*}}+0, R_PPC_ADDR32, .data+2 +# PRINT: .reloc {{.*}}+0, R_PPC_REL16_HI, foo+3 +# PRINT: .reloc {{.*}}+0, R_PPC_REL16_HA, 5 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_NONE, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_16, 9 +# PRINT: .reloc {{.*}}+0, 
BFD_RELOC_32, 9 # CHECK: 0x8 R_PPC_NONE .data 0x0 # CHECK-NEXT: 0x4 R_PPC_NONE foo 0x4 @@ -23,19 +23,19 @@ # CHECK-NEXT: 0x0 R_PPC_ADDR32 - 0x9 .text + .reloc .+8, R_PPC_NONE, .data + .reloc .+4, R_PPC_NONE, foo+4 + .reloc .+0, R_PPC_NONE, 8 + .reloc .+0, R_PPC_ADDR32, .data+2 + .reloc .+0, R_PPC_REL16_HI, foo+3 + .reloc .+0, R_PPC_REL16_HA, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 blr nop nop - .reloc 8, R_PPC_NONE, .data - .reloc 4, R_PPC_NONE, foo+4 - .reloc 0, R_PPC_NONE, 8 - .reloc 0, R_PPC_ADDR32, .data+2 - .reloc 0, R_PPC_REL16_HI, foo+3 - .reloc 0, R_PPC_REL16_HA, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 .data .globl foo diff --git a/llvm/test/MC/PowerPC/ppc64-reloc-directive.s b/llvm/test/MC/PowerPC/ppc64-reloc-directive.s index 5f54ac73bcf16..2268a3c18bf97 100644 --- a/llvm/test/MC/PowerPC/ppc64-reloc-directive.s +++ b/llvm/test/MC/PowerPC/ppc64-reloc-directive.s @@ -4,16 +4,16 @@ # RUN: llvm-mc -filetype=obj -triple=powerpc64-linux-musl %s | llvm-readobj -r - | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=powerpc64le-linux-musl %s | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 8, R_PPC64_NONE, .data -# PRINT: .reloc 4, R_PPC64_NONE, foo+4 -# PRINT: .reloc 0, R_PPC64_NONE, 8 -# PRINT: .reloc 0, R_PPC64_ADDR32, .data+2 -# PRINT: .reloc 0, R_PPC64_REL16_HI, foo+3 -# PRINT: .reloc 0, R_PPC64_REL16_HA, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT: .reloc 0, BFD_RELOC_16, 9 -# PRINT: .reloc 0, BFD_RELOC_32, 9 -# PRINT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+8, R_PPC64_NONE, .data +# PRINT: .reloc {{.*}}+4, R_PPC64_NONE, foo+4 +# PRINT: .reloc {{.*}}+0, R_PPC64_NONE, 8 +# PRINT: .reloc {{.*}}+0, R_PPC64_ADDR32, .data+2 +# PRINT: .reloc {{.*}}+0, R_PPC64_REL16_HI, foo+3 +# PRINT: .reloc {{.*}}+0, R_PPC64_REL16_HA, 5 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_NONE, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_16, 9 +# PRINT: .reloc {{.*}}+0, 
BFD_RELOC_32, 9 +# PRINT: .reloc {{.*}}+0, BFD_RELOC_64, 9 # CHECK: 0x8 R_PPC64_NONE .data 0x0 # CHECK-NEXT: 0x4 R_PPC64_NONE foo 0x4 @@ -27,20 +27,20 @@ # CHECK-NEXT: 0x0 R_PPC64_ADDR64 - 0x9 .text + .reloc .+8, R_PPC64_NONE, .data + .reloc .+4, R_PPC64_NONE, foo+4 + .reloc .+0, R_PPC64_NONE, 8 + .reloc .+0, R_PPC64_ADDR32, .data+2 + .reloc .+0, R_PPC64_REL16_HI, foo+3 + .reloc .+0, R_PPC64_REL16_HA, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 blr nop nop - .reloc 8, R_PPC64_NONE, .data - .reloc 4, R_PPC64_NONE, foo+4 - .reloc 0, R_PPC64_NONE, 8 - .reloc 0, R_PPC64_ADDR32, .data+2 - .reloc 0, R_PPC64_REL16_HI, foo+3 - .reloc 0, R_PPC64_REL16_HA, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 .data .globl foo diff --git a/llvm/test/MC/RISCV/reloc-directive-err.s b/llvm/test/MC/RISCV/reloc-directive-err.s index 2b00019fcb8ea..370e4ceb95734 100644 --- a/llvm/test/MC/RISCV/reloc-directive-err.s +++ b/llvm/test/MC/RISCV/reloc-directive-err.s @@ -1,6 +1,6 @@ # RUN: llvm-mc -triple=riscv64 %s 2>&1 | FileCheck --check-prefix=PRINT %s # RUN: not llvm-mc -filetype=obj -triple=riscv64 %s -o /dev/null 2>&1 | FileCheck %s -# PRINT: .reloc 0, R_INVALID, 0 +# PRINT: .reloc {{.*}}, R_INVALID, 0 # CHECK: {{.*}}.s:[[# @LINE+1]]:11: error: unknown relocation name -.reloc 0, R_INVALID, 0 +.reloc ., R_INVALID, 0 diff --git a/llvm/test/MC/RISCV/reloc-directive.s b/llvm/test/MC/RISCV/reloc-directive.s index 0e217fa798482..4ab2889a17ac9 100644 --- a/llvm/test/MC/RISCV/reloc-directive.s +++ b/llvm/test/MC/RISCV/reloc-directive.s @@ -3,15 +3,7 @@ # RUN: llvm-mc -filetype=obj -triple=riscv32 %s | llvm-readobj -r - | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=riscv64 %s | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 8, R_RISCV_NONE, .data -# PRINT: .reloc 4, R_RISCV_NONE, foo+4 -# PRINT: .reloc 0, R_RISCV_NONE, 8 -# PRINT: .reloc 
0, R_RISCV_32, .data+2 -# PRINT: .reloc 0, R_RISCV_SET32, foo+3 -# PRINT: .reloc 0, R_RISCV_32_PCREL, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+8, R_RISCV_NONE, .data # CHECK: 0x8 R_RISCV_NONE .data 0x0 # CHECK-NEXT: 0x4 R_RISCV_NONE foo 0x4 @@ -37,26 +29,26 @@ # CHECK-NEXT: } .text - ret - nop - nop - .reloc 8, R_RISCV_NONE, .data - .reloc 4, R_RISCV_NONE, foo+4 - .reloc 0, R_RISCV_NONE, 8 + .reloc .+8, R_RISCV_NONE, .data + .reloc .+4, R_RISCV_NONE, foo+4 + .reloc .+0, R_RISCV_NONE, 8 - .reloc 0, R_RISCV_32, .data+2 - .reloc 0, R_RISCV_SET32, foo+3 - .reloc 0, R_RISCV_32_PCREL, 5 + .reloc .+0, R_RISCV_32, .data+2 + .reloc .+0, R_RISCV_SET32, foo+3 + .reloc .+0, R_RISCV_32_PCREL, 5 - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 .reloc foo, R_RISCV_32, 6 .reloc line, R_RISCV_32, 6 .reloc probe, R_RISCV_32, 6 - .reloc foo+4, R_RISCV_32, 6 + ret + nop + nop + .data .globl foo foo: diff --git a/llvm/test/MC/Sparc/Relocations/reloc-directive.s b/llvm/test/MC/Sparc/Relocations/reloc-directive.s index 8899408ee428d..26164b3c2eb38 100644 --- a/llvm/test/MC/Sparc/Relocations/reloc-directive.s +++ b/llvm/test/MC/Sparc/Relocations/reloc-directive.s @@ -3,15 +3,7 @@ # RUN: llvm-mc -filetype=obj -triple=sparc %s | llvm-readobj -r - | FileCheck %s # RUN: llvm-mc -filetype=obj -triple=sparcv9 %s | llvm-readobj -r - | FileCheck %s -# PRINT: .reloc 8, R_SPARC_NONE, .data -# PRINT: .reloc 4, R_SPARC_NONE, foo+4 -# PRINT: .reloc 0, R_SPARC_NONE, 8 -# PRINT: .reloc 0, R_SPARC_32, .data+2 -# PRINT: .reloc 0, R_SPARC_UA16, foo+3 -# PRINT: .reloc 0, R_SPARC_DISP32, foo+5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, foo+2 -# PRINT-NEXT: .reloc 0, BFD_RELOC_64, foo+3 +# PRINT: .reloc {{.*}}+8, R_SPARC_NONE, .data # CHECK: 0x8 
R_SPARC_NONE .data 0x0 # CHECK-NEXT: 0x4 R_SPARC_NONE foo 0x4 @@ -23,20 +15,20 @@ # CHECK-NEXT: 0x0 R_SPARC_32 foo 0x2 # CHECK-NEXT: 0x0 R_SPARC_64 foo 0x3 .text + .reloc .+8, R_SPARC_NONE, .data + .reloc .+4, R_SPARC_NONE, foo+4 + .reloc .+0, R_SPARC_NONE, 8 + + .reloc .+0, R_SPARC_32, .data+2 + .reloc .+0, R_SPARC_UA16, foo+3 + .reloc .+0, R_SPARC_DISP32, foo+5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_32, foo+2 + .reloc .+0, BFD_RELOC_64, foo+3 ret nop nop - .reloc 8, R_SPARC_NONE, .data - .reloc 4, R_SPARC_NONE, foo+4 - .reloc 0, R_SPARC_NONE, 8 - - .reloc 0, R_SPARC_32, .data+2 - .reloc 0, R_SPARC_UA16, foo+3 - .reloc 0, R_SPARC_DISP32, foo+5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_32, foo+2 - .reloc 0, BFD_RELOC_64, foo+3 .data .globl foo diff --git a/llvm/test/MC/SystemZ/reloc-directive.s b/llvm/test/MC/SystemZ/reloc-directive.s index abc6ca320642d..78c36e1434574 100644 --- a/llvm/test/MC/SystemZ/reloc-directive.s +++ b/llvm/test/MC/SystemZ/reloc-directive.s @@ -3,19 +3,7 @@ # RUN: llvm-mc -filetype=obj -triple=s390x-linux-gnu %s -o %t # RUN: llvm-readobj -r %t | FileCheck %s -# PRINT: .reloc 2, R_390_NONE, .data -# PRINT-NEXT: .reloc 1, R_390_NONE, foo+4 -# PRINT-NEXT: .reloc 0, R_390_NONE, 8 -# PRINT-NEXT: .reloc 0, R_390_64, .data+2 -# PRINT-NEXT: .reloc 0, R_390_GOTENT, foo+3 -# PRINT-NEXT: .reloc 0, R_390_PC32DBL, 6 -# PRINT-NEXT: .reloc 4, R_390_12, foo -# PRINT-NEXT: .reloc 2, R_390_20, foo -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+2, R_390_NONE, .data # CHECK: 0x2 R_390_NONE .data 0x0 # CHECK-NEXT: 0x1 R_390_NONE foo 0x4 @@ -32,23 +20,23 @@ # CHECK-NEXT: 0x0 R_390_64 - 0x9 .text + .reloc .+2, R_390_NONE, .data + .reloc .+1, R_390_NONE, foo+4 + .reloc .+0, R_390_NONE, 8 + .reloc .+0, R_390_64, .data+2 + .reloc .+0, R_390_GOTENT, foo+3 + 
.reloc .+0, R_390_PC32DBL, 6 + .reloc .+4, R_390_12, foo + .reloc .+2, R_390_20, foo + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_8, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 br %r14 nop nop - .reloc 2, R_390_NONE, .data - .reloc 1, R_390_NONE, foo+4 - .reloc 0, R_390_NONE, 8 - .reloc 0, R_390_64, .data+2 - .reloc 0, R_390_GOTENT, foo+3 - .reloc 0, R_390_PC32DBL, 6 - .reloc 4, R_390_12, foo - .reloc 2, R_390_20, foo - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_8, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 .data .globl foo diff --git a/llvm/test/MC/X86/reloc-directive-elf-32.s b/llvm/test/MC/X86/reloc-directive-elf-32.s index d4b612ebfcefc..d3112dd5f7daf 100644 --- a/llvm/test/MC/X86/reloc-directive-elf-32.s +++ b/llvm/test/MC/X86/reloc-directive-elf-32.s @@ -4,16 +4,7 @@ # RUN: llvm-readobj -r %t | FileCheck %s # RUN: llvm-readelf -x .data %t | FileCheck --check-prefix=HEX %s -# PRINT: .reloc 2, R_386_NONE, .data -# PRINT-NEXT: .reloc 1, R_386_NONE, foo+4 -# PRINT-NEXT: .reloc 0, R_386_NONE, 8 -# PRINT-NEXT: .reloc 0, R_386_32, .data+2 -# PRINT-NEXT: .reloc 0, R_386_IRELATIVE, foo+3 -# PRINT-NEXT: .reloc 0, R_386_GOT32X, 5 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 +# PRINT: .reloc {{.*}}+2, R_386_NONE, .data # X86 relocations use the Elf32_Rel format. Addends are neither stored in the # relocation entries nor applied in the referenced locations. 
@@ -31,20 +22,20 @@ # HEX: 0x00000000 00000000 00000000 .text + .reloc .+2, R_386_NONE, .data + .reloc .+1, R_386_NONE, foo+4 + .reloc .+0, R_386_NONE, 8 + .reloc .+0, R_386_32, .data+2 + .reloc .+0, R_386_IRELATIVE, foo+3 + .reloc .+0, R_386_GOT32X, 5 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_8, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 ret nop nop - .reloc 2, R_386_NONE, .data - .reloc 1, R_386_NONE, foo+4 - .reloc 0, R_386_NONE, 8 - .reloc 0, R_386_32, .data+2 - .reloc 0, R_386_IRELATIVE, foo+3 - .reloc 0, R_386_GOT32X, 5 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_8, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 .data .globl foo diff --git a/llvm/test/MC/X86/reloc-directive-elf-64.s b/llvm/test/MC/X86/reloc-directive-elf-64.s index e0a1a5730597f..d6b8db98d5d08 100644 --- a/llvm/test/MC/X86/reloc-directive-elf-64.s +++ b/llvm/test/MC/X86/reloc-directive-elf-64.s @@ -3,18 +3,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux-musl %s -o %t # RUN: llvm-readobj -r %t | FileCheck %s -# PRINT: .reloc 2, R_X86_64_NONE, .data -# PRINT-NEXT: .reloc 1, R_X86_64_NONE, foo+4 -# PRINT-NEXT: .reloc 0, R_X86_64_NONE, 8 -# PRINT-NEXT: .reloc 0, R_X86_64_64, .data+2 -# PRINT-NEXT: .reloc 0, R_X86_64_GOTPCRELX, foo+3 -# PRINT-NEXT: .reloc 0, R_X86_64_REX_GOTPCRELX, 5 -# PRINT-NEXT: .reloc 0, R_X86_64_CODE_4_GOTPCRELX, 7 -# PRINT: .reloc 0, BFD_RELOC_NONE, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_32, 9 -# PRINT-NEXT: .reloc 0, BFD_RELOC_64, 9 +# PRINT: .reloc {{.*}}+2, R_X86_64_NONE, .data # CHECK: 0x2 R_X86_64_NONE .data 0x0 # CHECK-NEXT: 0x1 R_X86_64_NONE foo 0x4 @@ -30,22 +19,22 @@ # CHECK-NEXT: 0x0 R_X86_64_64 - 0x9 .text + .reloc .+2, R_X86_64_NONE, .data + .reloc .+1, R_X86_64_NONE, foo+4 + .reloc .+0, R_X86_64_NONE, 8 + .reloc .+0, R_X86_64_64, .data+2 + .reloc .+0, R_X86_64_GOTPCRELX, foo+3 + .reloc .+0, R_X86_64_REX_GOTPCRELX, 5 + 
.reloc .+0, R_X86_64_CODE_4_GOTPCRELX, 7 + + .reloc .+0, BFD_RELOC_NONE, 9 + .reloc .+0, BFD_RELOC_8, 9 + .reloc .+0, BFD_RELOC_16, 9 + .reloc .+0, BFD_RELOC_32, 9 + .reloc .+0, BFD_RELOC_64, 9 ret nop nop - .reloc 2, R_X86_64_NONE, .data - .reloc 1, R_X86_64_NONE, foo+4 - .reloc 0, R_X86_64_NONE, 8 - .reloc 0, R_X86_64_64, .data+2 - .reloc 0, R_X86_64_GOTPCRELX, foo+3 - .reloc 0, R_X86_64_REX_GOTPCRELX, 5 - .reloc 0, R_X86_64_CODE_4_GOTPCRELX, 7 - - .reloc 0, BFD_RELOC_NONE, 9 - .reloc 0, BFD_RELOC_8, 9 - .reloc 0, BFD_RELOC_16, 9 - .reloc 0, BFD_RELOC_32, 9 - .reloc 0, BFD_RELOC_64, 9 .data .globl foo diff --git a/llvm/test/MC/X86/reloc-directive.s b/llvm/test/MC/X86/reloc-directive.s index 5f4fc2394f5e7..124dc06951122 100644 --- a/llvm/test/MC/X86/reloc-directive.s +++ b/llvm/test/MC/X86/reloc-directive.s @@ -8,16 +8,16 @@ # RUN: FileCheck -check-prefix=OBJ-64 %s .text foo: + .reloc .+4, dir32, foo # ASM: .reloc {{.*}}+4, dir32, foo + .reloc .+0, secrel32, foo+4 # ASM: .reloc {{.*}}+0, secrel32, foo+4 + .reloc .+8, secidx, foo+8 # ASM: .reloc {{.*}}+8, secidx, foo+8 + .reloc .+12, dir32, foo@secrel32 # ASM: .reloc {{.*}}+12, dir32, foo@SECREL32 + .reloc .+16, dir32, foo@imgrel # ASM: .reloc {{.*}}+16, dir32, foo@IMGREL .long 0 .long 0 .long 0 .long 0 .long 0 - .reloc 4, dir32, foo # ASM: .reloc 4, dir32, foo - .reloc 0, secrel32, foo+4 # ASM: .reloc 0, secrel32, foo+4 - .reloc 8, secidx, foo+8 # ASM: .reloc 8, secidx, foo+8 - .reloc 12, dir32, foo@secrel32 # ASM: .reloc 12, dir32, foo@SECREL32 - .reloc 16, dir32, foo@imgrel # ASM: .reloc 16, dir32, foo@IMGREL # OBJ-32-LABEL: Name: .text # OBJ-32: 0000: 04000000 00000000 00000000 From 4a3cb437a32f5611b909fe7e067a9a9d28c2b845 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 17 Jul 2025 13:05:11 +0900 Subject: [PATCH 135/813] AMDGPU: Avoid hardcoding mov opcode --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 88d30fb555513..9da8a1c8e8fb6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6484,7 +6484,7 @@ bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const { if (OldVAddrIdx >= 0) { MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx); VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg()); - if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 || + if (!VAddrDef || !VAddrDef->isMoveImmediate() || !VAddrDef->getOperand(1).isImm() || VAddrDef->getOperand(1).getImm() != 0) return false; From 72c61a6a255cd07c449f213bef9439ab0ee85c08 Mon Sep 17 00:00:00 2001 From: Vikram Hegde <115221833+vikramRH@users.noreply.github.com> Date: Thu, 17 Jul 2025 12:26:27 +0530 Subject: [PATCH 136/813] [AMDGPU][NPM] Fill in addPreSched2 passes (#148112) same as https://github.com/llvm/llvm-project/pull/139516 Co-authored-by : Oke, Akshat <[Akshat.Oke@amd.com](mailto:Akshat.Oke@amd.com)> --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 6 ++++++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 1 + llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +++--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f4dc4a483181c..31a80e00edd3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -2284,6 +2284,12 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { Base::addPostRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreSched2(AddMachinePass &addPass) const { + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SIShrinkInstructionsPass()); + addPass(SIPostRABundlerPass()); +} + void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { addPass(GCNCreateVOPDPass()); diff --git 
a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 3c62cd19c6e57..3b2f39c14a9bc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -183,6 +183,7 @@ class AMDGPUCodeGenPassBuilder void addPreEmitPass(AddMachinePass &) const; Error addRegAssignmentOptimized(AddMachinePass &) const; void addOptimizedRegAlloc(AddMachinePass &) const; + void addPreSched2(AddMachinePass &) const; /// Check if a pass is enabled given \p Opt option. The option always /// overrides defaults if explicitly used. Otherwise its default will be used diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 25d102847ab04..243cb95d24e4e 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -8,11 +8,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regalloc
fast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) +; GCN-O0: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) -; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu
-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) +; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu
-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) -; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-par
tial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) +; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-par
tial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) define void @empty() { ret void From 34c85337424fde5796154d0cb390b39175291373 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 17 Jul 2025 09:26:11 +0200 Subject: [PATCH 137/813] [Coroutines] Always drop lifetime markers after moving allocas to frame (#149141) https://github.com/llvm/llvm-project/pull/142551 started always dropping lifetime markers after moving allocas on the frame, as these are not useful on non-allocas but can cause issues. However, this was not done for other ABIs (retcon, retcononce, async) that go through a different code path. We should treat them the same way. 
--- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 7 +++++++ .../Coroutines/coro-async-addr-lifetime-start-bug.ll | 6 +++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index fe30c6dc6abe4..fbeb7218ba9a3 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1179,6 +1179,13 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { AllocaInst *Alloca = P.Alloca; auto *G = GetFramePointer(Alloca); + // Remove any lifetime intrinsics, now that these are no longer allocas. + for (User *U : make_early_inc_range(Alloca->users())) { + auto *I = cast(U); + if (I->isLifetimeStartOrEnd()) + I->eraseFromParent(); + } + // We are not using ReplaceInstWithInst(P.first, cast(G)) // here, as we are changing location of the instruction. G->takeName(Alloca); diff --git a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll index 2306b72a0055f..40101595092b0 100644 --- a/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll +++ b/llvm/test/Transforms/Coroutines/coro-async-addr-lifetime-start-bug.ll @@ -87,9 +87,9 @@ loop_exit: } ; CHECK: define {{.*}} void @my_async_function.resume.0( -; CHECK-NOT: call void @llvm.lifetime.start.p0(i64 4, ptr %3) -; CHECK: br i1 %exitCond, label %loop_exit, label %loop -; CHECK: lifetime.end +; CHECK-NOT: llvm.lifetime +; CHECK: br i1 %exitCond, label %common.ret, label %loop +; CHECK-NOT: llvm.lifetime ; CHECK: } declare { ptr, ptr, ptr, ptr } @llvm.coro.suspend.async.sl_p0i8p0i8p0i8p0i8s(i32, ptr, ptr, ...) 
From 3cb0c7f45b97802ddc13a15560fbbca2efb75326 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 17 Jul 2025 00:36:10 -0700 Subject: [PATCH 138/813] MC: Rework .reloc directive and fix the offset when it evaluates to a constant * Fix `.reloc constant` to mean section_symbol+constant instead of .+constant . The initial .reloc support from MIPS incorrectly interpreted the offset. * Delay the evaluation of the offset expression after MCAssembler::layout, deleting a lot of code working with MCFragment. * Delete many FIXME from https://reviews.llvm.org/D79625 * Some lld/ELF/Arch/LoongArch.cpp relaxation tests rely on .reloc ., R_LARCH_ALIGN generating ALIGN relocations at specific location. Sort the relocations. --- lld/ELF/Relocations.cpp | 5 +- llvm/include/llvm/MC/MCAssembler.h | 8 + llvm/include/llvm/MC/MCELFStreamer.h | 3 +- llvm/include/llvm/MC/MCObjectStreamer.h | 14 +- llvm/include/llvm/MC/MCStreamer.h | 10 +- llvm/lib/MC/MCAsmStreamer.cpp | 12 +- llvm/lib/MC/MCAssembler.cpp | 25 +++ llvm/lib/MC/MCELFStreamer.cpp | 21 ++- llvm/lib/MC/MCObjectStreamer.cpp | 142 ++---------------- llvm/lib/MC/MCParser/AsmParser.cpp | 9 +- .../MCTargetDesc/AArch64ELFStreamer.cpp | 3 +- .../Target/Mips/AsmParser/MipsAsmParser.cpp | 2 +- llvm/lib/Target/Mips/MipsAsmPrinter.cpp | 2 +- llvm/test/MC/ELF/reloc-directive.s | 51 ++++--- llvm/test/MC/Mips/reloc-directive-bad-obj.s | 4 +- .../MC/Mips/reloc-directive-label-offset.s | 8 +- 16 files changed, 113 insertions(+), 206 deletions(-) diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index cebd564036b2c..4333b032c9d4e 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1671,8 +1671,9 @@ void RelocationScanner::scan(Relocs rels) { } // Sort relocations by offset for more efficient searching for - // R_RISCV_PCREL_HI20, R_PPC64_ADDR64 and the branch-to-branch optimization. 
- if (ctx.arg.emachine == EM_RISCV || + // R_RISCV_PCREL_HI20, ALIGN relocations, R_PPC64_ADDR64 and the + // branch-to-branch optimization. + if (is_contained({EM_RISCV, EM_LOONGARCH}, ctx.arg.emachine) || (ctx.arg.emachine == EM_PPC64 && sec->name == ".toc") || ctx.arg.branchToBranch) llvm::stable_sort(sec->relocs(), diff --git a/llvm/include/llvm/MC/MCAssembler.h b/llvm/include/llvm/MC/MCAssembler.h index aa396efa9f018..ade9ee6fa56e0 100644 --- a/llvm/include/llvm/MC/MCAssembler.h +++ b/llvm/include/llvm/MC/MCAssembler.h @@ -69,6 +69,13 @@ class MCAssembler { SmallVector Symbols; + struct RelocDirective { + const MCExpr &Offset; + const MCExpr *Expr; + uint32_t Kind; + }; + SmallVector relocDirectives; + mutable SmallVector, 0> PendingErrors; MCDwarfLineTableParams LTParams; @@ -205,6 +212,7 @@ class MCAssembler { LLVM_ABI bool registerSection(MCSection &Section); LLVM_ABI bool registerSymbol(const MCSymbol &Symbol); + void addRelocDirective(RelocDirective RD); LLVM_ABI void reportError(SMLoc L, const Twine &Msg) const; // Record pending errors during layout iteration, as they may go away once the diff --git a/llvm/include/llvm/MC/MCELFStreamer.h b/llvm/include/llvm/MC/MCELFStreamer.h index ad0961c8bcf97..144f6bc3bd91c 100644 --- a/llvm/include/llvm/MC/MCELFStreamer.h +++ b/llvm/include/llvm/MC/MCELFStreamer.h @@ -141,7 +141,8 @@ class MCELFStreamer : public MCObjectStreamer { } private: - void finalizeCGProfileEntry(const MCSymbolRefExpr *&S, uint64_t Offset); + void finalizeCGProfileEntry(const MCSymbolRefExpr *Sym, uint64_t Offset, + const MCSymbolRefExpr *&S); void finalizeCGProfile(); bool SeenIdent = false; diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index e2a77b809b6ca..a55fd4a14675f 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -40,14 +40,6 @@ class MCObjectStreamer : public MCStreamer { std::unique_ptr Assembler; bool EmitEHFrame; bool 
EmitDebugFrame; - struct PendingMCFixup { - const MCSymbol *Sym; - MCFixup Fixup; - MCFragment *DF; - PendingMCFixup(const MCSymbol *McSym, MCFragment *F, MCFixup McFixup) - : Sym(McSym), Fixup(McFixup), DF(F) {} - }; - SmallVector PendingFixups; struct PendingAssignment { MCSymbol *Symbol; @@ -63,7 +55,6 @@ class MCObjectStreamer : public MCStreamer { void emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override; void emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override; void emitInstructionImpl(const MCInst &Inst, const MCSubtargetInfo &STI); - void resolvePendingFixups(); protected: MCObjectStreamer(MCContext &Context, std::unique_ptr TAB, @@ -162,9 +153,8 @@ class MCObjectStreamer : public MCStreamer { void emitCVStringTableDirective() override; void emitCVFileChecksumsDirective() override; void emitCVFileChecksumOffsetDirective(unsigned FileNo) override; - std::optional> - emitRelocDirective(const MCExpr &Offset, StringRef Name, const MCExpr *Expr, - SMLoc Loc, const MCSubtargetInfo &STI) override; + void emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc = {}) override; using MCStreamer::emitFill; void emitFill(const MCExpr &NumBytes, uint64_t FillValue, SMLoc Loc = SMLoc()) override; diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 1f7c8b57540a7..b3a9aabd6ece5 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -1048,13 +1048,9 @@ class LLVM_ABI MCStreamer { virtual void emitSyntaxDirective(); - /// Record a relocation described by the .reloc directive. Return std::nullopt - /// if succeeded. Otherwise, return a pair (Name is invalid, error message). - virtual std::optional> - emitRelocDirective(const MCExpr &Offset, StringRef Name, const MCExpr *Expr, - SMLoc Loc, const MCSubtargetInfo &STI) { - return std::nullopt; - } + /// Record a relocation described by the .reloc directive. 
+ virtual void emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc = {}) {} virtual void emitAddrsig() {} virtual void emitAddrsigSym(const MCSymbol *Sym) {} diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 3a330dbfec342..67c53e01a6111 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -407,9 +407,8 @@ class MCAsmStreamer final : public MCStreamer { const MCPseudoProbeInlineStack &InlineStack, MCSymbol *FnSym) override; - std::optional> - emitRelocDirective(const MCExpr &Offset, StringRef Name, const MCExpr *Expr, - SMLoc Loc, const MCSubtargetInfo &STI) override; + void emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc) override; void emitAddrsig() override; void emitAddrsigSym(const MCSymbol *Sym) override; @@ -2468,10 +2467,8 @@ void MCAsmStreamer::emitPseudoProbe(uint64_t Guid, uint64_t Index, EmitEOL(); } -std::optional> -MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, - const MCExpr *Expr, SMLoc, - const MCSubtargetInfo &STI) { +void MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc) { OS << "\t.reloc "; MAI->printExpr(OS, Offset); OS << ", " << Name; @@ -2480,7 +2477,6 @@ MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, MAI->printExpr(OS, *Expr); } EmitEOL(); - return std::nullopt; } void MCAsmStreamer::emitAddrsig() { diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index d4d10e0cd74a5..f1a82f6b08d31 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -398,6 +398,10 @@ bool MCAssembler::registerSymbol(const MCSymbol &Symbol) { return Changed; } +void MCAssembler::addRelocDirective(RelocDirective RD) { + relocDirectives.push_back(RD); +} + /// Write the fragment \p F to the output file. 
static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, const MCFragment &F) { @@ -695,6 +699,27 @@ void MCAssembler::layout() { // helps check whether a PC-relative fixup is fully resolved. this->HasFinalLayout = true; + // Resolve .reloc offsets and add fixups. + for (auto &PF : relocDirectives) { + MCValue Res; + auto &O = PF.Offset; + if (!O.evaluateAsValue(Res, *this)) { + getContext().reportError(O.getLoc(), ".reloc offset is not relocatable"); + continue; + } + auto *Sym = Res.getAddSym(); + auto *F = Sym ? Sym->getFragment() : nullptr; + auto *Sec = F ? F->getParent() : nullptr; + if (Res.getSubSym() || !Sec) { + getContext().reportError(O.getLoc(), + ".reloc offset is not relative to a section"); + continue; + } + + uint64_t Offset = Sym ? Sym->getOffset() + Res.getConstant() : 0; + F->addFixup(MCFixup::create(Offset, PF.Expr, PF.Kind)); + } + // Evaluate and apply the fixups, generating relocation entries as necessary. for (MCSection &Sec : *this) { for (MCFragment &F : Sec) { diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index ffc57227cff16..49071bdec3dbd 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -314,8 +314,9 @@ void MCELFStreamer::emitIdent(StringRef IdentString) { popSection(); } -void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE, - uint64_t Offset) { +void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *Sym, + uint64_t Offset, + const MCSymbolRefExpr *&SRE) { const MCSymbol *S = &SRE->getSymbol(); if (S->isTemporary()) { if (!S->isInSection()) { @@ -328,13 +329,9 @@ void MCELFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE, S->setUsedInReloc(); SRE = MCSymbolRefExpr::create(S, getContext(), SRE->getLoc()); } - const MCConstantExpr *MCOffset = MCConstantExpr::create(Offset, getContext()); - if (std::optional> Err = - MCObjectStreamer::emitRelocDirective( - *MCOffset, "BFD_RELOC_NONE", SRE, SRE->getLoc(), - 
*getContext().getSubtargetInfo())) - report_fatal_error("Relocation for CG Profile could not be created: " + - Twine(Err->second)); + auto *O = MCBinaryExpr::createAdd( + Sym, MCConstantExpr::create(Offset, getContext()), getContext()); + MCObjectStreamer::emitRelocDirective(*O, "BFD_RELOC_NONE", SRE); } void MCELFStreamer::finalizeCGProfile() { @@ -347,9 +344,11 @@ void MCELFStreamer::finalizeCGProfile() { pushSection(); switchSection(CGProfile); uint64_t Offset = 0; + auto *Sym = + MCSymbolRefExpr::create(CGProfile->getBeginSymbol(), getContext()); for (auto &E : W.getCGProfile()) { - finalizeCGProfileEntry(E.From, Offset); - finalizeCGProfileEntry(E.To, Offset); + finalizeCGProfileEntry(Sym, Offset, E.From); + finalizeCGProfileEntry(Sym, Offset, E.To); emitIntValue(E.Count, sizeof(uint64_t)); Offset += sizeof(uint64_t); } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index c0cef0f06c57a..67433f2b265e5 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -46,35 +46,6 @@ MCAssembler *MCObjectStreamer::getAssemblerPtr() { return nullptr; } -// When fixup's offset is a forward declared label, e.g.: -// -// .reloc 1f, R_MIPS_JALR, foo -// 1: nop -// -// postpone adding it to Fixups vector until the label is defined and its offset -// is known. -void MCObjectStreamer::resolvePendingFixups() { - for (PendingMCFixup &PendingFixup : PendingFixups) { - if (!PendingFixup.Sym || PendingFixup.Sym->isUndefined ()) { - getContext().reportError(PendingFixup.Fixup.getLoc(), - "unresolved relocation offset"); - continue; - } - PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset() + - PendingFixup.Fixup.getOffset()); - - // If the location symbol to relocate is in MCEncodedFragment, - // put the Fixup into location symbol's fragment. 
Otherwise - // put into PendingFixup.DF - MCFragment *F = PendingFixup.Sym->getFragment(); - if (F->isEncoded()) - F->addFixup(PendingFixup.Fixup); - else - PendingFixup.DF->addFixup(PendingFixup.Fixup); - } - PendingFixups.clear(); -} - // As a compile-time optimization, avoid allocating and evaluating an MCExpr // tree for (Hi - Lo) when Hi and Lo are offsets into the same fragment's fixed // part. @@ -607,76 +578,14 @@ void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, insert(getContext().allocFragment(*Offset, Value, Loc)); } -static std::optional> -getOffsetAndDataFragment(const MCSymbol &Symbol, uint32_t &RelocOffset, - MCFragment *&DF) { - if (Symbol.isVariable()) { - const MCExpr *SymbolExpr = Symbol.getVariableValue(); - MCValue OffsetVal; - if (!SymbolExpr->evaluateAsRelocatable(OffsetVal, nullptr)) - return std::make_pair(false, - std::string("symbol in .reloc offset is not " - "relocatable")); - if (OffsetVal.isAbsolute()) { - RelocOffset = OffsetVal.getConstant(); - MCFragment *Fragment = Symbol.getFragment(); - // FIXME Support symbols with no DF. For example: - // .reloc .data, ENUM_VALUE, - if (!Fragment || Fragment->getKind() != MCFragment::FT_Data) - return std::make_pair(false, - std::string("symbol in offset has no data " - "fragment")); - DF = cast(Fragment); - return std::nullopt; - } - - if (OffsetVal.getSubSym()) - return std::make_pair(false, - std::string(".reloc symbol offset is not " - "representable")); - - const MCSymbol &SA = *OffsetVal.getAddSym(); - if (!SA.isDefined()) - return std::make_pair(false, - std::string("symbol used in the .reloc offset is " - "not defined")); - - if (SA.isVariable()) - return std::make_pair(false, - std::string("symbol used in the .reloc offset is " - "variable")); - - MCFragment *Fragment = SA.getFragment(); - // FIXME Support symbols with no DF. 
For example: - // .reloc .data, ENUM_VALUE, - if (!Fragment || Fragment->getKind() != MCFragment::FT_Data) - return std::make_pair(false, - std::string("symbol in offset has no data " - "fragment")); - RelocOffset = SA.getOffset() + OffsetVal.getConstant(); - DF = cast(Fragment); - } else { - RelocOffset = Symbol.getOffset(); - MCFragment *Fragment = Symbol.getFragment(); - // FIXME Support symbols with no DF. For example: - // .reloc .data, ENUM_VALUE, - if (!Fragment || Fragment->getKind() != MCFragment::FT_Data) - return std::make_pair(false, - std::string("symbol in offset has no data " - "fragment")); - DF = cast(Fragment); - } - return std::nullopt; -} - -std::optional> -MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, - const MCExpr *Expr, SMLoc Loc, - const MCSubtargetInfo &STI) { +void MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, + const MCExpr *Expr, SMLoc Loc) { std::optional MaybeKind = Assembler->getBackend().getFixupKind(Name); - if (!MaybeKind) - return std::make_pair(true, std::string("unknown relocation name")); + if (!MaybeKind) { + getContext().reportError(Loc, "unknown relocation name"); + return; + } MCFixupKind Kind = *MaybeKind; if (Expr) @@ -685,38 +594,14 @@ MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name, Expr = MCSymbolRefExpr::create(getContext().createTempSymbol(), getContext()); - MCFragment *DF = getOrCreateDataFragment(&STI); - MCValue OffsetVal; - if (!Offset.evaluateAsRelocatable(OffsetVal, nullptr)) - return std::make_pair(false, - std::string(".reloc offset is not relocatable")); - if (OffsetVal.isAbsolute()) { - if (OffsetVal.getConstant() < 0) - return std::make_pair(false, std::string(".reloc offset is negative")); - DF->addFixup(MCFixup::create(OffsetVal.getConstant(), Expr, Kind)); - return std::nullopt; - } - if (OffsetVal.getSubSym()) - return std::make_pair(false, - std::string(".reloc offset is not representable")); - - const MCSymbol 
&Symbol = *OffsetVal.getAddSym(); - if (Symbol.isDefined()) { - uint32_t SymbolOffset = 0; - std::optional> Error = - getOffsetAndDataFragment(Symbol, SymbolOffset, DF); - - if (Error != std::nullopt) - return Error; - - DF->addFixup( - MCFixup::create(SymbolOffset + OffsetVal.getConstant(), Expr, Kind)); - return std::nullopt; + auto *O = &Offset; + int64_t Val; + if (Offset.evaluateAsAbsolute(Val, nullptr)) { + auto *SecSym = getCurrentSectionOnly()->getBeginSymbol(); + O = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(SecSym, getContext()), + O, getContext(), Loc); } - - PendingFixups.emplace_back( - &Symbol, DF, MCFixup::create(OffsetVal.getConstant(), Expr, Kind)); - return std::nullopt; + getAssembler().addRelocDirective({*O, Expr, Kind}); } void MCObjectStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue, @@ -799,6 +684,5 @@ void MCObjectStreamer::finishImpl() { // Emit pseudo probes for the current module. MCPseudoProbeTable::emit(this); - resolvePendingFixups(); getAssembler().Finish(); } diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index a36b2dea70ccf..77bf84364c5a3 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -3079,7 +3079,6 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) { bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) { const MCExpr *Offset; const MCExpr *Expr = nullptr; - SMLoc OffsetLoc = Lexer.getTok().getLoc(); if (parseExpression(Offset)) return true; @@ -3105,13 +3104,7 @@ bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) { if (parseEOL()) return true; - const MCTargetAsmParser &MCT = getTargetParser(); - const MCSubtargetInfo &STI = MCT.getSTI(); - if (std::optional> Err = - getStreamer().emitRelocDirective(*Offset, Name, Expr, DirectiveLoc, - STI)) - return Error(Err->first ? 
NameLoc : OffsetLoc, Err->second); - + getStreamer().emitRelocDirective(*Offset, Name, Expr, NameLoc); return false; } diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 233f42b7a4790..08f547a85073e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -559,8 +559,7 @@ void AArch64TargetELFStreamer::finish() { if (!Sym.isMemtag()) continue; auto *SRE = MCSymbolRefExpr::create(&Sym, Ctx); - (void)S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE, SMLoc(), - *Ctx.getSubtargetInfo()); + S.emitRelocDirective(*Zero, "BFD_RELOC_NONE", SRE); } } diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 01e4d17f6236d..259b71b37d9a3 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -2101,7 +2101,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, TOut.getStreamer().emitRelocDirective( *TmpExpr, inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR", - RelocJalrExpr, IDLoc, *STI); + RelocJalrExpr); TOut.getStreamer().emitLabel(TmpLabel); } diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index c18ba44bea08e..ca0331006be74 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -166,7 +166,7 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI, OutStreamer.emitRelocDirective( *OffsetExpr, Subtarget.inMicroMipsMode() ? 
"R_MICROMIPS_JALR" : "R_MIPS_JALR", - CaleeExpr, SMLoc(), *TM.getMCSubtargetInfo()); + CaleeExpr); OutStreamer.emitLabel(OffsetLabel); return; } diff --git a/llvm/test/MC/ELF/reloc-directive.s b/llvm/test/MC/ELF/reloc-directive.s index 42995aa9e7d81..9871fba2e0021 100644 --- a/llvm/test/MC/ELF/reloc-directive.s +++ b/llvm/test/MC/ELF/reloc-directive.s @@ -9,15 +9,18 @@ # ASM-NEXT: .reloc .Ltmp1-1, R_X86_64_NONE, foo # ASM-NEXT: .Ltmp2: # ASM-NEXT: .reloc 2+.Ltmp2, R_X86_64_NONE, local -# ASM-NEXT: .reloc 1+foo+3, R_X86_64_NONE, data+1 -# ASM-NEXT: .Ltmp3: -# ASM-NEXT: .reloc .Ltmp3, BFD_RELOC_NONE, unused # CHECK: 0x2 R_X86_64_NONE foo 0x0 # CHECK-NEXT: 0x0 R_X86_64_NONE foo 0x0 # CHECK-NEXT: 0x3 R_X86_64_NONE local 0x0 -# CHECK-NEXT: 0x4 R_X86_64_NONE data 0x1 # CHECK-NEXT: 0x1 R_X86_64_NONE unused 0x0 +# CHECK-NEXT: 0x4 R_X86_64_NONE data 0x1 + +# CHECK: .rela.my { +# CHECK: 0x0 R_X86_64_NONE foo 0x0 +# CHECK-NEXT: 0x4 R_X86_64_NONE foo 0x0 +# CHECK-NEXT: 0x8 R_X86_64_NONE foo 0x0 +# CHECK-NEXT: } .text .globl foo @@ -27,17 +30,25 @@ local: .reloc .+3-2, R_X86_64_NONE, foo .reloc .-1, R_X86_64_NONE, foo .reloc 2+., R_X86_64_NONE, local - .reloc 1+foo+3, R_X86_64_NONE, data+1 .reloc ., BFD_RELOC_NONE, unused + .space 3 .data .globl data data: + .reloc 1+foo+3, R_X86_64_NONE, data+1 .long 0 -# RUN: not llvm-mc -filetype=obj -triple=x86_64 --defsym=ERR=1 %s 2>&1 | FileCheck %s --check-prefix=ERR +## Constant offsets are relative to the section start. 
+.section .my +.word 0 +.reloc 0, BFD_RELOC_NONE, foo +.word 0 +.p2align 3 +.reloc 2+2, BFD_RELOC_NONE, foo +.p2align 4 +.reloc 8, BFD_RELOC_NONE, foo -.ifdef ERR .text .globl a, b a: ret @@ -45,22 +56,26 @@ b: ret x: ret y: ret -# ERR: {{.*}}.s:[[#@LINE+1]]:10: error: expected comma +# RUN: not llvm-mc -filetype=obj -triple=x86_64 --defsym=PARSE=1 %s 2>&1 | FileCheck %s --check-prefix=PARSE +# RUN: not llvm-mc -filetype=obj -triple=x86_64 --defsym=ERR=1 %s 2>&1 | FileCheck %s --check-prefix=ERR + +.ifdef PARSE +# PARSE: {{.*}}.s:[[#@LINE+1]]:10: error: expected comma .reloc 0 R_X86_64_NONE, a -# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: .reloc offset is negative +# PARSE: {{.*}}.s:[[#@LINE+1]]:8: error: directional label undefined +.reloc 1f, R_X86_64_NONE, a +.endif + +.ifdef ERR .reloc -1, R_X86_64_NONE, a -# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: .reloc offset is not relocatable +# ERR: {{.*}}.s:[[#@LINE+1]]:9: error: .reloc offset is not relocatable .reloc 2*., R_X86_64_NONE, a -# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: .reloc offset is not relocatable +# ERR: {{.*}}.s:[[#@LINE+1]]:9: error: .reloc offset is not relocatable .reloc a+a, R_X86_64_NONE, a -## GNU as accepts a-a but rejects b-a. -# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: .reloc offset is not representable -.reloc a-a, R_X86_64_NONE, a -## TODO GNU as accepts x-x and y-x. 
-# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: .reloc offset is not representable +# ERR: {{.*}}.s:[[#@LINE+1]]:9: error: .reloc offset is not relative to a section +.reloc b-a, R_X86_64_NONE, a +# ERR: {{.*}}.s:[[#@LINE+1]]:9: error: .reloc offset is not relative to a section .reloc x-x, R_X86_64_NONE, a -# ERR: {{.*}}.s:[[#@LINE+1]]:8: error: directional label undefined -.reloc 1f, R_X86_64_NONE, a .endif diff --git a/llvm/test/MC/Mips/reloc-directive-bad-obj.s b/llvm/test/MC/Mips/reloc-directive-bad-obj.s index 86d6d0cc66c57..74e5dae5264f6 100644 --- a/llvm/test/MC/Mips/reloc-directive-bad-obj.s +++ b/llvm/test/MC/Mips/reloc-directive-bad-obj.s @@ -2,8 +2,8 @@ # RUN: -target-abi=o32 -filetype=obj -o /dev/null 2>&1 | FileCheck %s .text nop -.reloc foo, R_MIPS_32, .text # CHECK: :[[@LINE]]:24: error: unresolved relocation offset +.reloc foo, R_MIPS_32, .text # CHECK: :[[@LINE]]:8: error: .reloc offset is not relative to a section nop nop -.reloc bar, R_MIPS_32, .text # CHECK: :[[@LINE]]:24: error: unresolved relocation offset +.reloc bar, R_MIPS_32, .text # CHECK: :[[@LINE]]:8: error: .reloc offset is not relative to a section nop diff --git a/llvm/test/MC/Mips/reloc-directive-label-offset.s b/llvm/test/MC/Mips/reloc-directive-label-offset.s index 257bfeb10d151..279fc7860dcea 100644 --- a/llvm/test/MC/Mips/reloc-directive-label-offset.s +++ b/llvm/test/MC/Mips/reloc-directive-label-offset.s @@ -58,18 +58,18 @@ bar: # OBJ-N32-LABEL: Relocations [ # OBJ-N32: 0x4 R_MIPS_NONE .text -# OBJ-N32-NEXT: 0x1C R_MIPS_GOT_OFST .text -# OBJ-N32-NEXT: 0x0 R_MIPS_32 .text # OBJ-N32-NEXT: 0xC R_MIPS_32 .text # OBJ-N32-NEXT: 0x10 R_MIPS_CALL16 foo # OBJ-N32-NEXT: 0x20 R_MIPS_GOT_DISP foo # OBJ-N32-NEXT: 0x24 R_MIPS_GOT_PAGE .text +# OBJ-N32-NEXT: 0x1C R_MIPS_GOT_OFST .text +# OBJ-N32-NEXT: 0x0 R_MIPS_32 .text # OBJ-N64-LABEL: Relocations [ # OBJ-N64: 0x4 R_MIPS_NONE/R_MIPS_NONE/R_MIPS_NONE .text 0x0 -# OBJ-N64-NEXT: 0x1C R_MIPS_GOT_OFST/R_MIPS_NONE/R_MIPS_NONE .text 0x0 -# OBJ-N64-NEXT: 
0x0 R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE .text 0x0 # OBJ-N64-NEXT: 0xC R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE .text 0x0 # OBJ-N64-NEXT: 0x10 R_MIPS_CALL16/R_MIPS_NONE/R_MIPS_NONE foo 0x0 # OBJ-N64-NEXT: 0x20 R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE foo 0x0 # OBJ-N64-NEXT: 0x24 R_MIPS_GOT_PAGE/R_MIPS_NONE/R_MIPS_NONE .text 0x0 +# OBJ-N64-NEXT: 0x1C R_MIPS_GOT_OFST/R_MIPS_NONE/R_MIPS_NONE .text 0x0 +# OBJ-N64-NEXT: 0x0 R_MIPS_32/R_MIPS_NONE/R_MIPS_NONE .text 0x0 From 1653a093def10543d8f958fe7d4cde9ab1dd8bca Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Thu, 17 Jul 2025 09:36:52 +0200 Subject: [PATCH 139/813] [clang-format] Add IgnoreExtension to SortIncludes (#137840) Sorting by stem gives nicer results when various header file names are substrings of other header file names. For example, a CLI application with a main header named analyze.h and an analyze-xxx.h header for each subcommand currently will always put analyze.h last after all the analyze-xxx.h headers, but putting analyze.h first instead is arguably nicer to read. TLDR; Instead of ``` #include "analyze-blame.h" #include "analyze.h" ``` You'd get ``` #include "analyze.h" #include "analyze-blame.h" ``` Let's allow sorting by stem instead of full path by adding IgnoreExtension to SortIncludes. --- clang/docs/ClangFormatStyleOptions.rst | 10 +++++ clang/include/clang/Format/Format.h | 12 +++++- clang/lib/Format/Format.cpp | 45 +++++++++++++-------- clang/unittests/Format/ConfigParseTest.cpp | 22 +++++----- clang/unittests/Format/SortIncludesTest.cpp | 20 +++++++++ 5 files changed, 83 insertions(+), 26 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 0e21ef0244f78..bfc8094f3f50c 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -6015,6 +6015,16 @@ the configuration (without a prefix: ``Auto``). 
#include "B/A.h" #include "B/a.h" #include "B/a.h" #include "a/b.h" + * ``bool IgnoreExtension`` When sorting includes in each block, only take file extensions into + account if two includes compare equal otherwise. + + .. code-block:: c++ + + true: false: + # include "A.h" vs. # include "A-util.h" + # include "A.inc" # include "A.h" + # include "A-util.h" # include "A.inc" + .. _SortJavaStaticImport: diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index b4f2a87fe7e83..7677604484f52 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4385,8 +4385,18 @@ struct FormatStyle { /// #include "B/a.h" #include "a/b.h" /// \endcode bool IgnoreCase; + /// When sorting includes in each block, only take file extensions into + /// account if two includes compare equal otherwise. + /// \code + /// true: false: + /// # include "A.h" vs. # include "A-util.h" + /// # include "A.inc" # include "A.h" + /// # include "A-util.h" # include "A.inc" + /// \endcode + bool IgnoreExtension; bool operator==(const SortIncludesOptions &R) const { - return Enabled == R.Enabled && IgnoreCase == R.IgnoreCase; + return Enabled == R.Enabled && IgnoreCase == R.IgnoreCase && + IgnoreExtension == R.IgnoreExtension; } bool operator!=(const SortIncludesOptions &R) const { return !(*this == R); diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 78c09be458f0a..62feb3db0ed5e 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -665,21 +665,25 @@ template <> struct MappingTraits { IO.enumCase(Value, "Never", FormatStyle::SortIncludesOptions({})); IO.enumCase(Value, "CaseInsensitive", FormatStyle::SortIncludesOptions({/*Enabled=*/true, - /*IgnoreCase=*/true})); + /*IgnoreCase=*/true, + /*IgnoreExtension=*/false})); IO.enumCase(Value, "CaseSensitive", FormatStyle::SortIncludesOptions({/*Enabled=*/true, - /*IgnoreCase=*/false})); + /*IgnoreCase=*/false, + 
/*IgnoreExtension=*/false})); // For backward compatibility. IO.enumCase(Value, "false", FormatStyle::SortIncludesOptions({})); IO.enumCase(Value, "true", FormatStyle::SortIncludesOptions({/*Enabled=*/true, - /*IgnoreCase=*/false})); + /*IgnoreCase=*/false, + /*IgnoreExtension=*/false})); } static void mapping(IO &IO, FormatStyle::SortIncludesOptions &Value) { IO.mapOptional("Enabled", Value.Enabled); IO.mapOptional("IgnoreCase", Value.IgnoreCase); + IO.mapOptional("IgnoreExtension", Value.IgnoreExtension); } }; @@ -1650,7 +1654,8 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.SeparateDefinitionBlocks = FormatStyle::SDS_Leave; LLVMStyle.ShortNamespaceLines = 1; LLVMStyle.SkipMacroDefinitionBody = false; - LLVMStyle.SortIncludes = {/*Enabled=*/true, /*IgnoreCase=*/false}; + LLVMStyle.SortIncludes = {/*Enabled=*/true, /*IgnoreCase=*/false, + /*IgnoreExtension=*/false}; LLVMStyle.SortJavaStaticImport = FormatStyle::SJSIO_Before; LLVMStyle.SortUsingDeclarations = FormatStyle::SUD_LexicographicNumeric; LLVMStyle.SpaceAfterCStyleCast = false; @@ -3239,19 +3244,27 @@ static void sortCppIncludes(const FormatStyle &Style, SmallVector Indices = llvm::to_vector<16>(llvm::seq(0, Includes.size())); - if (Style.SortIncludes.Enabled && Style.SortIncludes.IgnoreCase) { + if (Style.SortIncludes.Enabled) { stable_sort(Indices, [&](unsigned LHSI, unsigned RHSI) { - const auto LHSFilenameLower = Includes[LHSI].Filename.lower(); - const auto RHSFilenameLower = Includes[RHSI].Filename.lower(); - return std::tie(Includes[LHSI].Priority, LHSFilenameLower, - Includes[LHSI].Filename) < - std::tie(Includes[RHSI].Priority, RHSFilenameLower, - Includes[RHSI].Filename); - }); - } else { - stable_sort(Indices, [&](unsigned LHSI, unsigned RHSI) { - return std::tie(Includes[LHSI].Priority, Includes[LHSI].Filename) < - std::tie(Includes[RHSI].Priority, Includes[RHSI].Filename); + SmallString<128> LHSStem, RHSStem; + if (Style.SortIncludes.IgnoreExtension) { + LHSStem = 
Includes[LHSI].Filename; + RHSStem = Includes[RHSI].Filename; + llvm::sys::path::replace_extension(LHSStem, ""); + llvm::sys::path::replace_extension(RHSStem, ""); + } + std::string LHSStemLower, RHSStemLower; + std::string LHSFilenameLower, RHSFilenameLower; + if (Style.SortIncludes.IgnoreCase) { + LHSStemLower = LHSStem.str().lower(); + RHSStemLower = RHSStem.str().lower(); + LHSFilenameLower = Includes[LHSI].Filename.lower(); + RHSFilenameLower = Includes[RHSI].Filename.lower(); + } + return std::tie(Includes[LHSI].Priority, LHSStemLower, LHSStem, + LHSFilenameLower, Includes[LHSI].Filename) < + std::tie(Includes[RHSI].Priority, RHSStemLower, RHSStem, + RHSFilenameLower, Includes[RHSI].Filename); }); } diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index d17109aebc0f8..65d8b36c677bd 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -259,6 +259,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_NESTED_BOOL(SpacesInParensOptions, Other); CHECK_PARSE_NESTED_BOOL(SortIncludes, Enabled); CHECK_PARSE_NESTED_BOOL(SortIncludes, IgnoreCase); + CHECK_PARSE_NESTED_BOOL(SortIncludes, IgnoreExtension); } #undef CHECK_PARSE_BOOL @@ -980,17 +981,20 @@ TEST(ConfigParseTest, ParsesConfiguration) { IncludeStyle.IncludeIsMainSourceRegex, "abc$"); Style.SortIncludes = {}; - CHECK_PARSE("SortIncludes: true", SortIncludes, - FormatStyle::SortIncludesOptions( - {/*Enabled=*/true, /*IgnoreCase=*/false})); + CHECK_PARSE( + "SortIncludes: true", SortIncludes, + FormatStyle::SortIncludesOptions( + {/*Enabled=*/true, /*IgnoreCase=*/false, /*IgnoreExtension=*/false})); CHECK_PARSE("SortIncludes: false", SortIncludes, FormatStyle::SortIncludesOptions({})); - CHECK_PARSE("SortIncludes: CaseInsensitive", SortIncludes, - FormatStyle::SortIncludesOptions( - {/*Enabled=*/true, /*IgnoreCase=*/true})); - CHECK_PARSE("SortIncludes: CaseSensitive", SortIncludes, - 
FormatStyle::SortIncludesOptions( - {/*Enabled=*/true, /*IgnoreCase=*/false})); + CHECK_PARSE( + "SortIncludes: CaseInsensitive", SortIncludes, + FormatStyle::SortIncludesOptions( + {/*Enabled=*/true, /*IgnoreCase=*/true, /*IgnoreExtension=*/false})); + CHECK_PARSE( + "SortIncludes: CaseSensitive", SortIncludes, + FormatStyle::SortIncludesOptions( + {/*Enabled=*/true, /*IgnoreCase=*/false, /*IgnoreExtension=*/false})); CHECK_PARSE("SortIncludes: Never", SortIncludes, FormatStyle::SortIncludesOptions({})); diff --git a/clang/unittests/Format/SortIncludesTest.cpp b/clang/unittests/Format/SortIncludesTest.cpp index 994227efdd4f8..5194d65ed3637 100644 --- a/clang/unittests/Format/SortIncludesTest.cpp +++ b/clang/unittests/Format/SortIncludesTest.cpp @@ -1483,6 +1483,26 @@ TEST_F(SortIncludesTest, BlockCommentedOutIncludes) { verifyFormat(Code, sort(Code, "input.cpp", 0)); } +TEST_F(SortIncludesTest, IgnoreExtension) { + FmtStyle.SortIncludes.IgnoreExtension = true; + + verifyFormat("#include \n" + "#include \n" + "#include ", + sort("#include \n" + "#include \n" + "#include ", + "input.h")); + + verifyFormat("#include \n" + "#include \n" + "#include ", + sort("#include \n" + "#include \n" + "#include ", + "input.h")); +} + } // end namespace } // end namespace format } // end namespace clang From 2194bca2b78cf1ada2aa539e2a3ad8128a80f63a Mon Sep 17 00:00:00 2001 From: William Tran-Viet Date: Thu, 17 Jul 2025 03:43:12 -0400 Subject: [PATCH 140/813] [libc++] Granularize `range_format` and `format_kind` declarations (#148876) While working on #105430 I ran into an issue implementing [[optional.syn]](https://eel.is/c++draft/optional.syn) because of a circular include that looked like the following: `optional -> __format/range_default_formatter.h -> __format/range_formatter.h -> __format/format_context.h -> optional`. Only `format_kind` and `range_format` are needed, and so they looked like candidates to be put into an internal header. 
--- libcxx/include/CMakeLists.txt | 1 + .../__format/range_default_formatter.h | 42 +---------- libcxx/include/__format/range_format.h | 71 +++++++++++++++++++ libcxx/include/module.modulemap.in | 1 + 4 files changed, 74 insertions(+), 41 deletions(-) create mode 100644 libcxx/include/__format/range_format.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index d729fa81e2b2f..25b567df2dd33 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -403,6 +403,7 @@ set(files __format/indic_conjunct_break_table.h __format/parser_std_format_spec.h __format/range_default_formatter.h + __format/range_format.h __format/range_formatter.h __format/unicode.h __format/width_estimation_table.h diff --git a/libcxx/include/__format/range_default_formatter.h b/libcxx/include/__format/range_default_formatter.h index 7149debb2f141..2769647ad527e 100644 --- a/libcxx/include/__format/range_default_formatter.h +++ b/libcxx/include/__format/range_default_formatter.h @@ -16,10 +16,10 @@ #include <__algorithm/ranges_copy.h> #include <__chrono/statically_widen.h> -#include <__concepts/same_as.h> #include <__config> #include <__format/concepts.h> #include <__format/formatter.h> +#include <__format/range_format.h> #include <__format/range_formatter.h> #include <__iterator/back_insert_iterator.h> #include <__ranges/concepts.h> @@ -42,51 +42,11 @@ concept __const_formattable_range = template using __fmt_maybe_const _LIBCPP_NODEBUG = conditional_t<__const_formattable_range<_Rp, _CharT>, const _Rp, _Rp>; -_LIBCPP_DIAGNOSTIC_PUSH -_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wshadow") -_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wshadow") -// This shadows map, set, and string. -enum class range_format { disabled, map, set, sequence, string, debug_string }; -_LIBCPP_DIAGNOSTIC_POP - // There is no definition of this struct, it's purely intended to be used to // generate diagnostics. 
template struct __instantiated_the_primary_template_of_format_kind; -template -constexpr range_format format_kind = [] { - // [format.range.fmtkind]/1 - // A program that instantiates the primary template of format_kind is ill-formed. - static_assert(sizeof(_Rp) != sizeof(_Rp), "create a template specialization of format_kind for your type"); - return range_format::disabled; -}(); - -template - requires same_as<_Rp, remove_cvref_t<_Rp>> -inline constexpr range_format format_kind<_Rp> = [] { - // [format.range.fmtkind]/2 - - // 2.1 If same_as>, R> is true, - // Otherwise format_kind is range_format::disabled. - if constexpr (same_as>, _Rp>) - return range_format::disabled; - // 2.2 Otherwise, if the qualified-id R::key_type is valid and denotes a type: - else if constexpr (requires { typename _Rp::key_type; }) { - // 2.2.1 If the qualified-id R::mapped_type is valid and denotes a type ... - if constexpr (requires { typename _Rp::mapped_type; } && - // 2.2.1 ... If either U is a specialization of pair or U is a specialization - // of tuple and tuple_size_v == 2 - __fmt_pair_like>>) - return range_format::map; - else - // 2.2.2 Otherwise format_kind is range_format::set. - return range_format::set; - } else - // 2.3 Otherwise, format_kind is range_format::sequence. - return range_format::sequence; -}(); - template struct __range_default_formatter; diff --git a/libcxx/include/__format/range_format.h b/libcxx/include/__format/range_format.h new file mode 100644 index 0000000000000..139cfd92ee32b --- /dev/null +++ b/libcxx/include/__format/range_format.h @@ -0,0 +1,71 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___FORMAT_RANGE_FORMAT_H +#define _LIBCPP___FORMAT_RANGE_FORMAT_H + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#include <__concepts/same_as.h> +#include <__config> +#include <__format/concepts.h> +#include <__ranges/concepts.h> +#include <__type_traits/remove_cvref.h> + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 23 + +_LIBCPP_DIAGNOSTIC_PUSH +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wshadow") +_LIBCPP_GCC_DIAGNOSTIC_IGNORED("-Wshadow") +// This shadows map, set, and string. +enum class range_format { disabled, map, set, sequence, string, debug_string }; +_LIBCPP_DIAGNOSTIC_POP + +template +constexpr range_format format_kind = [] { + // [format.range.fmtkind]/1 + // A program that instantiates the primary template of format_kind is ill-formed. + static_assert(sizeof(_Rp) != sizeof(_Rp), "create a template specialization of format_kind for your type"); + return range_format::disabled; +}(); + +template + requires same_as<_Rp, remove_cvref_t<_Rp>> +inline constexpr range_format format_kind<_Rp> = [] { + // [format.range.fmtkind]/2 + + // 2.1 If same_as>, R> is true, + // Otherwise format_kind is range_format::disabled. + if constexpr (same_as>, _Rp>) + return range_format::disabled; + // 2.2 Otherwise, if the qualified-id R::key_type is valid and denotes a type: + else if constexpr (requires { typename _Rp::key_type; }) { + // 2.2.1 If the qualified-id R::mapped_type is valid and denotes a type ... + if constexpr (requires { typename _Rp::mapped_type; } && + // 2.2.1 ... If either U is a specialization of pair or U is a specialization + // of tuple and tuple_size_v == 2 + __fmt_pair_like>>) + return range_format::map; + else + // 2.2.2 Otherwise format_kind is range_format::set. 
+ return range_format::set; + } else + // 2.3 Otherwise, format_kind is range_format::sequence. + return range_format::sequence; +}(); + +#endif // _LIBCPP_STD_VER >= 23 + +_LIBCPP_END_NAMESPACE_STD + +#endif diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 602e72bbf5b01..78607f2c1301d 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -1369,6 +1369,7 @@ module std [system] { module indic_conjunct_break_table { header "__format/indic_conjunct_break_table.h" } module parser_std_format_spec { header "__format/parser_std_format_spec.h" } module range_default_formatter { header "__format/range_default_formatter.h" } + module range_format { header "__format/range_format.h" } module range_formatter { header "__format/range_formatter.h" } module unicode { header "__format/unicode.h" } module width_estimation_table { header "__format/width_estimation_table.h" } From d218011159105208de38275594c75735929f74aa Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Thu, 17 Jul 2025 15:44:49 +0800 Subject: [PATCH 141/813] [LoongArch] Optimize inserting extracted elements (#146018) --- .../LoongArch/LoongArchISelLowering.cpp | 5 +- .../LoongArch/LoongArchLASXInstrInfo.td | 83 +++++++++++++++++-- .../Target/LoongArch/LoongArchLSXInstrInfo.td | 48 ++++++++++- .../lasx/ir-instruction/fix-xvshuf.ll | 12 +-- .../ir-instruction/insert-extract-element.ll | 30 +------ .../insert-extract-pair-elements.ll | 64 ++------------ .../ir-instruction/insert-extract-element.ll | 20 ++--- 7 files changed, 144 insertions(+), 118 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index c47987fbf683b..39a1d542dd309 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -2597,12 +2597,9 @@ LoongArchTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { EVT 
VecTy = Op->getOperand(0)->getValueType(0); SDValue Idx = Op->getOperand(1); - EVT EltTy = VecTy.getVectorElementType(); unsigned NumElts = VecTy.getVectorNumElements(); - if (isa(Idx) && - (EltTy == MVT::i32 || EltTy == MVT::i64 || EltTy == MVT::f32 || - EltTy == MVT::f64 || Idx->getAsZExtVal() < NumElts / 2)) + if (isa(Idx) && Idx->getAsZExtVal() < NumElts) return Op; return SDValue(); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 95e9fd49d1c0d..6a8c9fac840d9 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1282,6 +1282,32 @@ multiclass PatCCXrXrF { (!cast(Inst#"_D") LASX256:$xj, LASX256:$xk)>; } +multiclass PairInsertExtractPatV8 { + foreach imm1 = 0...3 in { + foreach imm2 = 0...3 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert vecty:$xd, + (elemty (vector_extract vecty:$xj, imm1)), imm2), + (elemty (vector_extract vecty:$xj, !add(imm1, 4))), + !add(imm2, 4)), + (XVEXTRINS_W $xd, $xj, Imm)>; + } + } +} + +multiclass PairInsertExtractPatV4 { + foreach imm1 = 0...1 in { + foreach imm2 = 0...1 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert vecty:$xd, + (elemty (vector_extract vecty:$xj, imm1)), imm2), + (elemty (vector_extract vecty:$xj, !add(imm1, 2))), + !add(imm2, 2)), + (XVEXTRINS_D $xd, $xj, Imm)>; + } + } +} + let Predicates = [HasExtLASX] in { // XVADD_{B/H/W/D} @@ -1582,6 +1608,38 @@ defm : PatCCXrXrF; defm : PatCCXrXrF; defm : PatCCXrXrF; +// Insert two elements extracted from vector into vector. (The positions +// of the two elements must be same in the source or destination vector's +// front and back 128bits.) 
+// 2*XVPICKVE2GR_{W/D} + 2*XVINSGR2VR_{W/D} -> XVEXTRINS_{W/D} +// XVPERMI_D + 2*XVPICKVE2GR_{B/H} + 2*PseudoXVINSGR2VR_{B/H} -> XVEXTRINS_{W/D} +foreach imm1 = 0...15 in { + foreach imm2 = 0...15 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert v32i8:$xd, + (GRLenVT (vector_extract v32i8:$xj, imm1)), imm2), + (GRLenVT (vector_extract v32i8:$xj, !add(imm1, 16))), + !add(imm2, 16)), + (XVEXTRINS_B $xd, $xj, Imm)>; + } +} + +foreach imm1 = 0...7 in { + foreach imm2 = 0...7 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert (vector_insert v16i16:$xd, + (GRLenVT (vector_extract v16i16:$xj, imm1)), imm2), + (GRLenVT (vector_extract v16i16:$xj, !add(imm1, 8))), + !add(imm2, 8)), + (XVEXTRINS_H $xd, $xj, Imm)>; + } +} + +defm : PairInsertExtractPatV8; +defm : PairInsertExtractPatV8; +defm : PairInsertExtractPatV4; +defm : PairInsertExtractPatV4; + // PseudoXVINSGR2VR_{B/H} def : Pat<(vector_insert v32i8:$xd, GRLenVT:$rj, uimm5:$imm), (PseudoXVINSGR2VR_B v32i8:$xd, GRLenVT:$rj, uimm5:$imm)>; @@ -1593,11 +1651,14 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm), (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>; def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm), (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>; - -def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm), - (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; -def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm), - (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; +def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2), + (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>; +def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2), + (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>; +def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm), + 
(XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm), + (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>; // scalar_to_vector def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)), @@ -1791,6 +1852,18 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in { } // Vector extraction with constant index. +foreach imm = 16...31 in { + defvar Imm = !and(imm, 15); + def : Pat<(i64 (vector_extract v32i8:$xj, imm)), + (VPICKVE2GR_B (EXTRACT_SUBREG (XVPERMI_D v32i8:$xj, 14), sub_128), + Imm)>; +} +foreach imm = 8...15 in { + defvar Imm = !and(imm, 7); + def : Pat<(i64 (vector_extract v16i16:$xj, imm)), + (VPICKVE2GR_H (EXTRACT_SUBREG (XVPERMI_D v16i16:$xj, 14), sub_128), + Imm)>; +} def : Pat<(i64 (vector_extract v32i8:$xj, uimm4:$imm)), (VPICKVE2GR_B (EXTRACT_SUBREG v32i8:$xj, sub_128), uimm4:$imm)>; def : Pat<(i64 (vector_extract v16i16:$xj, uimm3:$imm)), diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index d73d78083ddcd..9dd6006e3a9dc 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1482,6 +1482,28 @@ multiclass VstelmPat; } +multiclass InsertExtractPatV4 { + foreach imm1 = 0...3 in { + foreach imm2 = 0...3 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert vecty:$vd, + (elemty (vector_extract vecty:$vj, imm1)), imm2), + (VEXTRINS_W $vd, $vj, Imm)>; + } + } +} + +multiclass InsertExtractPatV2 { + foreach imm1 = 0...1 in { + foreach imm2 = 0...1 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert vecty:$vd, + (elemty (vector_extract vecty:$vj, imm1)), imm2), + (VEXTRINS_D $vd, $vj, Imm)>; + } + } +} + let Predicates = [HasExtLSX] in { // VADD_{B/H/W/D} @@ -1782,6 +1804,31 @@ defm : PatCCVrVrF; defm : PatCCVrVrF; defm : PatCCVrVrF; +// Insert element extracted from vector into vector. 
+// VPICKVE2GR_{B/H/W/D} + VINSGR2VR_{B/H/W/D} -> VEXTRINS_{B/H/W/D} +foreach imm1 = 0...15 in { + foreach imm2 = 0...15 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert v16i8:$vd, + (GRLenVT (vector_extract v16i8:$vj, imm1)), imm2), + (VEXTRINS_B $vd, $vj, Imm)>; + } +} + +foreach imm1 = 0...7 in { + foreach imm2 = 0...7 in { + defvar Imm = !or(!shl(imm2, 4), imm1); + def : Pat<(vector_insert v8i16:$vd, + (GRLenVT (vector_extract v8i16:$vj, imm1)), imm2), + (VEXTRINS_H $vd, $vj, Imm)>; + } +} + +defm : InsertExtractPatV4; +defm : InsertExtractPatV4; +defm : InsertExtractPatV2; +defm : InsertExtractPatV2; + // VINSGR2VR_{B/H/W/D} def : Pat<(vector_insert v16i8:$vd, GRLenVT:$rj, uimm4:$imm), (VINSGR2VR_B v16i8:$vd, GRLenVT:$rj, uimm4:$imm)>; @@ -1791,7 +1838,6 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm), (VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>; def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm), (VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>; - def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm), (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>; def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm), diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll index f3bec11810e9b..f154dd3b8eb3c 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll @@ -7,20 +7,12 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: shufflevector_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 -; CHECK-NEXT: movgr2fr.d $fa2, $a0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2 -; CHECK-NEXT: movgr2fr.d $fa3, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa2 ; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 0 -; CHECK-NEXT: movfr2gr.d $a0, $fa3 +; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 2 ; 
CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 1 ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: movgr2fr.d $fa0, $a0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3 -; CHECK-NEXT: movgr2fr.d $fa1, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa0 ; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 2 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 +; CHECK-NEXT: xvpickve2gr.d $a0, $xr1, 3 ; CHECK-NEXT: xvinsgr2vr.d $xr2, $a0, 3 ; CHECK-NEXT: xvori.b $xr0, $xr2, 0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll index 3fdc439e68679..271e3eca31dbe 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll @@ -4,18 +4,9 @@ define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: insert_extract_v32i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: ld.b $a0, $sp, 31 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vpickve2gr.b $a0, $vr1, 15 ; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 ; CHECK-NEXT: ret entry: %b = extractelement <32 x i8> %a, i32 31 @@ -26,18 +17,9 @@ entry: define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind { ; CHECK-LABEL: insert_extract_v16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; 
CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: ld.h $a0, $sp, 30 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vpickve2gr.h $a0, $vr1, 7 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 ; CHECK-NEXT: ret entry: %b = extractelement <16 x i16> %a, i32 15 @@ -61,8 +43,6 @@ define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind { ; CHECK-LABEL: insert_extract_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7 -; CHECK-NEXT: movgr2fr.w $fa1, $a0 -; CHECK-NEXT: movfr2gr.s $a0, $fa1 ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 ; CHECK-NEXT: ret entry: @@ -87,8 +67,6 @@ define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind { ; CHECK-LABEL: insert_extract_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: movgr2fr.d $fa1, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll index 88c3e4367ffa7..4e173c4feadba 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll @@ -4,23 +4,7 @@ define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind { ; CHECK-LABEL: insert_extract_v32i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 15 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: ld.b $a1, $sp, 31 -; CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1 -; 
CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1 -; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: xvextrins.b $xr0, $xr0, 31 ; CHECK-NEXT: ret entry: %b_lo = extractelement <32 x i8> %a, i32 15 @@ -33,23 +17,7 @@ entry: define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind { ; CHECK-LABEL: insert_extract_v16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 -; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 -; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 7 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: ld.h $a1, $sp, 30 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 -; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: xvextrins.h $xr0, $xr0, 23 ; CHECK-NEXT: ret entry: %b_lo = extractelement <16 x i16> %a, i32 7 @@ -62,10 +30,7 @@ entry: define <8 x i32> @insert_extract_v8i32(<8 x i32> %a) nounwind { ; CHECK-LABEL: insert_extract_v8i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3 -; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 7 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a1, 5 +; CHECK-NEXT: xvextrins.w $xr0, $xr0, 19 ; CHECK-NEXT: ret entry: %b_lo = extractelement <8 x i32> %a, i32 3 @@ -78,14 +43,7 @@ entry: define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind { ; CHECK-LABEL: insert_extract_v8f32: ; CHECK: # 
%bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3 -; CHECK-NEXT: movgr2fr.w $fa1, $a0 -; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7 -; CHECK-NEXT: movgr2fr.w $fa2, $a0 -; CHECK-NEXT: movfr2gr.s $a0, $fa1 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 -; CHECK-NEXT: movfr2gr.s $a0, $fa2 -; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 5 +; CHECK-NEXT: xvextrins.w $xr0, $xr0, 19 ; CHECK-NEXT: ret entry: %b_lo = extractelement <8 x float> %a, i32 3 @@ -98,10 +56,7 @@ entry: define <4 x i64> @insert_extract_v4i64(<4 x i64> %a) nounwind { ; CHECK-LABEL: insert_extract_v4i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: xvpickve2gr.d $a1, $xr0, 3 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a1, 2 +; CHECK-NEXT: xvextrins.d $xr0, $xr0, 1 ; CHECK-NEXT: ret entry: %b_lo = extractelement <4 x i64> %a, i32 1 @@ -114,14 +69,7 @@ entry: define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind { ; CHECK-LABEL: insert_extract_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 -; CHECK-NEXT: movgr2fr.d $fa1, $a0 -; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: movgr2fr.d $fa2, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0 -; CHECK-NEXT: movfr2gr.d $a0, $fa2 -; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 2 +; CHECK-NEXT: xvextrins.d $xr0, $xr0, 1 ; CHECK-NEXT: ret entry: %b_lo = extractelement <4 x double> %a, i32 1 diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll index c7dd1454c7e33..e9a0c8a110452 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll @@ -4,8 +4,7 @@ define <16 x i8> @insert_extract_v16i8(<16 x i8> %a) nounwind { ; CHECK-LABEL: insert_extract_v16i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.b $a0, $vr0, 15 -; 
CHECK-NEXT: vinsgr2vr.b $vr0, $a0, 1 +; CHECK-NEXT: vextrins.b $vr0, $vr0, 31 ; CHECK-NEXT: ret entry: %b = extractelement <16 x i8> %a, i32 15 @@ -16,8 +15,7 @@ entry: define <8 x i16> @insert_extract_v8i16(<8 x i16> %a) nounwind { ; CHECK-LABEL: insert_extract_v8i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.h $a0, $vr0, 7 -; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 1 +; CHECK-NEXT: vextrins.h $vr0, $vr0, 23 ; CHECK-NEXT: ret entry: %b = extractelement <8 x i16> %a, i32 7 @@ -28,8 +26,7 @@ entry: define <4 x i32> @insert_extract_v4i32(<4 x i32> %a) nounwind { ; CHECK-LABEL: insert_extract_v4i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3 -; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 1 +; CHECK-NEXT: vextrins.w $vr0, $vr0, 19 ; CHECK-NEXT: ret entry: %b = extractelement <4 x i32> %a, i32 3 @@ -40,9 +37,7 @@ entry: define <4 x float> @insert_extract_v4f32(<4 x float> %a) nounwind { ; CHECK-LABEL: insert_extract_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplvei.w $vr1, $vr0, 3 -; CHECK-NEXT: movfr2gr.s $a0, $fa1 -; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0 +; CHECK-NEXT: vextrins.w $vr0, $vr0, 3 ; CHECK-NEXT: ret entry: %b = extractelement <4 x float> %a, i32 3 @@ -53,8 +48,7 @@ entry: define <2 x i64> @insert_extract_v2i64(<2 x i64> %a) nounwind { ; CHECK-LABEL: insert_extract_v2i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; CHECK-NEXT: vextrins.d $vr0, $vr0, 1 ; CHECK-NEXT: ret entry: %b = extractelement <2 x i64> %a, i32 1 @@ -65,9 +59,7 @@ entry: define <2 x double> @insert_extract_v2f64(<2 x double> %a) nounwind { ; CHECK-LABEL: insert_extract_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 -; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; CHECK-NEXT: vextrins.d $vr0, $vr0, 1 ; CHECK-NEXT: ret entry: %b = extractelement <2 x double> %a, i32 1 From 3071fe55f8321ea18952412a233f7330ce26b522 Mon Sep 17 00:00:00 2001 
From: Jonathan Wakely Date: Thu, 17 Jul 2025 08:46:45 +0100 Subject: [PATCH 142/813] [libc++] Add missing include to bitset to_ullong.pass.cpp test (#149149) This was added to to_ulong.pass.cpp years ago by cf1dc8d39e2c9870468ca86f7956a65c7745fece but I don't think the other part of that commit matters here. --- .../utilities/template.bitset/bitset.members/to_ullong.pass.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/libcxx/test/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp index 5df6103d79e7a..3c5a57d1c7fec 100644 --- a/libcxx/test/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp +++ b/libcxx/test/std/utilities/template.bitset/bitset.members/to_ullong.pass.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include From 1d398a96dc6b58d15d289c71e2d9f229a0ba719b Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Thu, 17 Jul 2025 13:22:06 +0530 Subject: [PATCH 143/813] [GVN][NFC] Use early return in phiTranslateImpl() (#149268) --- llvm/lib/Transforms/Scalar/GVN.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 8bff458f88bb9..affae41ed2c83 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2367,12 +2367,15 @@ uint32_t GVNPass::ValueTable::phiTranslateImpl(const BasicBlock *Pred, // See if we can refine the value number by looking at the PN incoming value // for the given predecessor. 
if (PHINode *PN = NumberingPhi[Num]) { - if (PN->getParent() == PhiBlock) - for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) - if (PN->getIncomingBlock(I) == Pred) - if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false)) - return TransVal; - return Num; + if (PN->getParent() != PhiBlock) + return Num; + + for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) { + if (PN->getIncomingBlock(I) != Pred) + continue; + if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false)) + return TransVal; + } } if (BasicBlock *BB = NumberingBB[Num]) { From ace6e20e52c0c343500f68fa053b6be546e5a0db Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 17 Jul 2025 07:53:11 +0000 Subject: [PATCH 144/813] [gn build] Port 2194bca2b78c --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 9a34f6b27d026..82ec8121548c9 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1061,6 +1061,7 @@ if (current_toolchain == default_toolchain) { "__format/indic_conjunct_break_table.h", "__format/parser_std_format_spec.h", "__format/range_default_formatter.h", + "__format/range_format.h", "__format/range_formatter.h", "__format/unicode.h", "__format/width_estimation_table.h", From bce951c572465c6ccd59b73a58c536641abc43eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Thu, 17 Jul 2025 09:14:17 +0100 Subject: [PATCH 145/813] [mlir][linalg] Update vectorization logic for linalg.unpack (#149156) This PR makes sure that we don't generate unnecessary `tensor.empty` when vectorizing `linalg.unpack`. 
To better visualize the changes implemented here, consider this IR: ```mlir func.func @example( %source: tensor<8x4x16x16xf32>, %dest: tensor<64x127xf32>) -> tensor<64x127xf32> { %res = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<8x4x16x16xf32> -> tensor<64x127xf32> return %res : tensor<64x127xf32> } ``` Below is the output after vectorization, BEFORE and AFTER this PR. BEFORE (note `tensor.empty` and the fact that `%arg1` is not used): ```mlir func.func @example(%arg0: tensor<8x4x16x16xf32>, %arg1: tensor<64x127xf32>) -> tensor<64x127xf32> { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<8x4x16x16xf32>, vector<8x4x16x16xf32> %1 = vector.transpose %0, [1, 2, 0, 3] : vector<8x4x16x16xf32> to vector<4x16x8x16xf32> %2 = vector.shape_cast %1 : vector<4x16x8x16xf32> to vector<64x128xf32> %3 = tensor.empty() : tensor<64x127xf32> %c0_0 = arith.constant 0 : index %4 = vector.transfer_write %2, %3[%c0_0, %c0_0] {in_bounds = [true, false]} : vector<64x128xf32>, tensor<64x127xf32> return %4 : tensor<64x127xf32> } ``` AFTER (note that `%arg1` is correctly used): ```mlir func.func @example(%arg0: tensor<8x4x16x16xf32>, %arg1: tensor<64x127xf32>) -> tensor<64x127xf32> { %cst = arith.constant 0.000000e+00 : f32 %c0 = arith.constant 0 : index %0 = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true, true]} : tensor<8x4x16x16xf32>, vector<8x4x16x16xf32> %1 = vector.transpose %0, [1, 2, 0, 3] : vector<8x4x16x16xf32> to vector<4x16x8x16xf32> %2 = vector.shape_cast %1 : vector<4x16x8x16xf32> to vector<64x128xf32> %c0_0 = arith.constant 0 : index %3 = vector.transfer_write %2, %arg1[%c0_0, %c0_0] {in_bounds = [true, false]} : vector<64x128xf32>, tensor<64x127xf32> return %3 : tensor<64x127xf32> } ``` --- .../Linalg/Transforms/Vectorization.cpp | 5 +- 
.../Linalg/vectorization/linalg-ops.mlir | 51 +++++++++++-------- 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 5a8c5eab3f444..458ed543b8216 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1928,11 +1928,8 @@ vectorizeAsTensorUnpackOp(RewriterBase &rewriter, linalg::UnPackOp unpackOp, unpackOp.getDestType().hasStaticShape() ? vectorSizes : shapeCastOp.getResultVectorType().getShape()); - Value dest = rewriter.create( - loc, reifiedRetShapes[0], - shapeCastOp.getResult().getType().getElementType()); Operation *write = createWriteOrMaskedWrite( - rewriter, loc, shapeCastOp.getResult(), dest, + rewriter, loc, shapeCastOp.getResult(), unpackOp.getDest(), /*writeIndices=*/{}, useInBoundsInsteadOfMasking); newResults.push_back(write->getResult(0)); return success(); diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir index 9e501affdd2a5..679adf0a52175 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir @@ -1158,6 +1158,7 @@ module attributes {transform.with_named_sequence} { // ----- // CHECK-LABEL: func @test_vectorize_dynamic_shapes_unpack +// CHECK-SAME: %[[ARG_0:.*]]: tensor, func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK: %[[C0:.*]] = arith.constant 0 // CHECK: %[[DIM:.*]] = tensor.dim %arg0, %[[C0]] : tensor @@ -1175,9 +1176,8 @@ func.func @test_vectorize_dynamic_shapes_unpack(%arg0: tensor, %arg1: t // CHECK: %[[read0:.*]] = vector.mask %[[readMsk0]] {{.*}} vector.transfer_read %{{.*}} : tensor, vector<2x1x16x2xf32> } : vector<2x1x16x2xi1> -> vector<2x1x16x2xf32> // CHECK: %[[trans0:.*]] = vector.transpose %[[read0]], [0, 3, 1, 2] : vector<2x1x16x2xf32> to 
vector<2x2x1x16xf32> // CHECK: %[[sc0:.*]] = vector.shape_cast %[[trans0]] : vector<2x2x1x16xf32> to vector<4x16xf32> -// CHECK: %[[empt0:.*]] = tensor.empty // CHECK: %[[writeMsk0:.*]] = vector.create_mask {{.*}} : vector<4x16xi1> -// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[empt0]] +// CHECK: %[[write0:.*]] = vector.mask %[[writeMsk0:.*]] {{.*}} vector.transfer_write %[[sc0]], %[[ARG_0]] // CHECK: return %[[write0]] %ret = linalg.unpack %arg1 inner_dims_pos = [1, 0] inner_tiles = [16, 2] into %arg0 : tensor -> tensor return %ret : tensor @@ -1193,6 +1193,8 @@ module attributes {transform.with_named_sequence} { // ----- // CHECK-LABEL: func @test_vectorize_unpack +// CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]]= arith.constant 0 : index @@ -1201,15 +1203,14 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2 // CHECK: %[[C32:.*]] = arith.constant 32 : index // CHECK: %[[C16:.*]] = arith.constant 16 : index // CHECK: %[[MSK0:.*]] = vector.create_mask %[[C8]], %[[C80]], %[[C32]], %[[C16]] : vector<16x8x32x16xi1> - // CHECK: %[[READ0:.*]] = vector.mask %[[MSK0]] {{.*}} : vector<16x8x32x16xi1> -> vector<16x8x32x16xf32> + // CHECK: %[[READ0:.*]] = vector.mask %[[MSK0]] { vector.transfer_read %[[SRC]]{{.*}}} : vector<16x8x32x16xi1> -> vector<16x8x32x16xf32> // CHECK: %[[TRANSP0:.*]] = vector.transpose %[[READ0]], [0, 2, 1, 3] : vector<16x8x32x16xf32> to vector<16x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP0]] : vector<16x32x8x16xf32> to vector<512x128xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> // CHECK: %[[C01:.*]] = arith.constant 0 : index // CHECK: %[[C256:.*]] = arith.constant 256 : index 
// CHECK: %[[C128:.*]] = arith.constant 128 : index // CHECK: %[[WRITEMSK:.*]] = vector.create_mask %[[C256]], %[[C128]] : vector<512x128xi1> - // CHECK: %[[WRIT:.*]] = vector.mask %[[WRITEMSK]] {{.*}} : vector<512x128xi1> -> tensor<256x128xf32> + // CHECK: %[[WRIT:.*]] = vector.mask %[[WRITEMSK]] { vector.transfer_write %[[SHAPC]], %[[DEST]]{{.*}}} : vector<512x128xi1> -> tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> @@ -1225,15 +1226,16 @@ func.func @test_vectorize_unpack(%source: tensor<8x8x32x16xf32>, %dest: tensor<2 // ----- // CHECK-LABEL: func @test_vectorize_unpack_no_masks +// CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index - // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[DEST]]{{.*}}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles 
= [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> @@ -1248,16 +1250,17 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: // ----- - // CHECK-LABEL: test_vectorize_unpack_with_outer_perm +// CHECK-LABEL: test_vectorize_unpack_with_outer_perm +// CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack_with_outer_perm(%source: tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index - // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[DEST]]{{.*}}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> @@ -1327,15 +1330,17 @@ module attributes {transform.with_named_sequence} { // ----- +// CHECK-LABEL: test_vectorize_unpack_no_vector_sizes +// CHECK-SAME: %[[SRC:.*]]: tensor<8x8x32x16xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<256x128xf32> func.func @test_vectorize_unpack_no_vector_sizes(%source: 
tensor<8x8x32x16xf32>, %dest: tensor<256x128xf32>) -> tensor<256x128xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x8x32x16xf32>, vector<8x8x32x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [0, 2, 1, 3] : vector<8x8x32x16xf32> to vector<8x32x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<8x32x8x16xf32> to vector<256x128xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<256x128xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index - // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<256x128xf32>, tensor<256x128xf32> + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[DEST]]{{.*}}} : vector<256x128xf32>, tensor<256x128xf32> // CHECK: return %[[WRIT]] : tensor<256x128xf32> %0 = linalg.unpack %source inner_dims_pos = [0, 1] inner_tiles = [32, 16] into %dest : tensor<8x8x32x16xf32> -> tensor<256x128xf32> return %0 : tensor<256x128xf32> @@ -1350,15 +1355,17 @@ func.func @test_vectorize_unpack_no_vector_sizes(%source: tensor<8x8x32x16xf32>, // ----- +// CHECK-LABEL: test_vectorize_unpack_no_vector_sizes_slice_output +// CHECK-SAME: %[[SRC:.*]]: tensor<8x4x16x16xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<64x127xf32> func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x4x16x16xf32>, %dest: tensor<64x127xf32>) -> tensor<64x127xf32> { // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<8x4x16x16xf32>, vector<8x4x16x16xf32> + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<8x4x16x16xf32>, vector<8x4x16x16xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 2, 0, 3] : 
vector<8x4x16x16xf32> to vector<4x16x8x16xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<4x16x8x16xf32> to vector<64x128xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<64x127xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index - // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[EMPT]]{{\[}}%[[C00]], %[[C00]]] + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[DEST]] // CHECK-SAME: {in_bounds = [true, false]} : vector<64x128xf32>, tensor<64x127xf32> // CHECK: return %[[WRIT]] : tensor<64x127xf32> %0 = linalg.unpack %source outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 16] into %dest : tensor<8x4x16x16xf32> -> tensor<64x127xf32> @@ -1374,18 +1381,20 @@ func.func @test_vectorize_unpack_no_vector_sizes_slice_output(%source: tensor<8x // ----- +// CHECK-LABEL: test_vectorize_unpack_no_vector_sizes_permute +// CHECK-SAME: %[[SRC:.*]]: tensor<4x7x4xf32> +// CHECK-SAME: %[[DEST:.*]]: tensor<7x16xf32> func.func @test_vectorize_unpack_no_vector_sizes_permute(%source: tensor<4x7x4xf32>, %dest: tensor<7x16xf32>) -> tensor<7x16xf32> { %0 = linalg.unpack %source outer_dims_perm=[1, 0] inner_dims_pos = [1] inner_tiles = [4] into %dest : tensor<4x7x4xf32> -> tensor<7x16xf32> return %0 : tensor<7x16xf32> } // CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[READ:.*]] = vector.transfer_read {{.*}} : tensor<4x7x4xf32>, vector<4x7x4xf32> + // CHECK: %[[READ:.*]] = vector.transfer_read %[[SRC]]{{.*}}} : tensor<4x7x4xf32>, vector<4x7x4xf32> // CHECK: %[[TRANSP:.*]] = vector.transpose %[[READ]], [1, 0, 2] : vector<4x7x4xf32> to vector<7x4x4xf32> // CHECK: %[[SHAPC:.*]] = vector.shape_cast %[[TRANSP]] : vector<7x4x4xf32> to vector<7x16xf32> - // CHECK: %[[EMPT:.*]] = tensor.empty() : tensor<7x16xf32> // CHECK: %[[C00:.*]] = arith.constant 0 : index - // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], {{.*}} : vector<7x16xf32>, 
tensor<7x16xf32> + // CHECK: %[[WRIT:.*]] = vector.transfer_write %[[SHAPC]], %[[DEST]]{{.*}}} : vector<7x16xf32>, tensor<7x16xf32> // CHECK: return %[[WRIT]] : tensor<7x16xf32> module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { From 8b553c495155a024d22871f22f05187fb785c4fc Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Thu, 17 Jul 2025 13:44:26 +0530 Subject: [PATCH 146/813] Revert "[GVN][NFC] Use early return in phiTranslateImpl() (#149268)" (#149270) This reverts commit 1d398a96dc6b58d15d289c71e2d9f229a0ba719b. --- llvm/lib/Transforms/Scalar/GVN.cpp | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index affae41ed2c83..8bff458f88bb9 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2367,15 +2367,12 @@ uint32_t GVNPass::ValueTable::phiTranslateImpl(const BasicBlock *Pred, // See if we can refine the value number by looking at the PN incoming value // for the given predecessor. 
if (PHINode *PN = NumberingPhi[Num]) { - if (PN->getParent() != PhiBlock) - return Num; - - for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) { - if (PN->getIncomingBlock(I) != Pred) - continue; - if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false)) - return TransVal; - } + if (PN->getParent() == PhiBlock) + for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) + if (PN->getIncomingBlock(I) == Pred) + if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false)) + return TransVal; + return Num; } if (BasicBlock *BB = NumberingBB[Num]) { From e0cce5cdcb8a7829389d910a9204447646e69407 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 17 Jul 2025 10:15:43 +0200 Subject: [PATCH 147/813] [libc] Improve Cortex `memset` and `memcpy` functions (#149044) The code for `memcpy` is the same as in #148204 but it fixes the build bot error by using `static_assert(cpp::always_false)` instead of `static_assert(false)` (older compilers fails on `static_assert(false)` in `constexpr` `else` bodies). The code for `memset` is new and vastly improves performance over the current byte per byte implementation. Both `memset` and `memcpy` implementations use prefetching for sizes >= 64. This lowers a bit the performance for sizes between 64 and 256 but improves throughput for greater sizes. 
--- libc/src/string/memory_utils/CMakeLists.txt | 2 + libc/src/string/memory_utils/arm/common.h | 55 +++++ .../string/memory_utils/arm/inline_memcpy.h | 193 +++++++++--------- .../string/memory_utils/arm/inline_memset.h | 156 ++++++++++++++ libc/src/string/memory_utils/inline_memset.h | 6 +- .../llvm-project-overlay/libc/BUILD.bazel | 2 + 6 files changed, 315 insertions(+), 99 deletions(-) create mode 100644 libc/src/string/memory_utils/arm/common.h create mode 100644 libc/src/string/memory_utils/arm/inline_memset.h diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt index a967247db53f4..8ab1c9ff98d2f 100644 --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -7,7 +7,9 @@ add_header_library( aarch64/inline_memcpy.h aarch64/inline_memmove.h aarch64/inline_memset.h + arm/common.h arm/inline_memcpy.h + arm/inline_memset.h generic/aligned_access.h generic/byte_per_byte.h inline_bcmp.h diff --git a/libc/src/string/memory_utils/arm/common.h b/libc/src/string/memory_utils/arm/common.h new file mode 100644 index 0000000000000..b9f40b64fed98 --- /dev/null +++ b/libc/src/string/memory_utils/arm/common.h @@ -0,0 +1,55 @@ +//===-- Common constants and defines for arm --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H + +#include "src/__support/macros/attributes.h" // LIBC_INLINE_VAR +#include "src/string/memory_utils/utils.h" // CPtr, Ptr, distance_to_align + +#include // size_t + +// Our minimum supported compiler version does not recognize the standard +// [[likely]] / [[unlikely]] attributes so we use the preprocessor. + +// https://libc.llvm.org/compiler_support.html +// Support for [[likely]] / [[unlikely]] +// [X] GCC 12.2 +// [X] Clang 12 +// [ ] Clang 11 +#define LIBC_ATTR_LIKELY [[likely]] +#define LIBC_ATTR_UNLIKELY [[unlikely]] + +#if defined(LIBC_COMPILER_IS_CLANG) +#if LIBC_COMPILER_CLANG_VER < 1200 +#undef LIBC_ATTR_LIKELY +#undef LIBC_ATTR_UNLIKELY +#define LIBC_ATTR_LIKELY +#define LIBC_ATTR_UNLIKELY +#endif +#endif + +namespace LIBC_NAMESPACE_DECL { + +LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t); + +enum class AssumeAccess { kUnknown, kAligned }; +enum class BlockOp { kFull, kByWord }; + +LIBC_INLINE auto misaligned(CPtr ptr) { + return distance_to_align_down(ptr); +} + +LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) { + return cpp::bit_cast(cpp::bit_cast(a) | + cpp::bit_cast(b)); +} + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_COMMON_H diff --git a/libc/src/string/memory_utils/arm/inline_memcpy.h b/libc/src/string/memory_utils/arm/inline_memcpy.h index 61efebe29b485..c748048a3e586 100644 --- a/libc/src/string/memory_utils/arm/inline_memcpy.h +++ b/libc/src/string/memory_utils/arm/inline_memcpy.h @@ -5,63 +5,57 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// The functions defined in this file give approximate code size. 
These sizes +// assume the following configuration options: +// - LIBC_CONF_KEEP_FRAME_POINTER = false +// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false +// - LIBC_ADD_NULL_CHECKS = false #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H +#include "src/__support/CPP/type_traits.h" // always_false #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL +#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY #include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align #include // size_t -// https://libc.llvm.org/compiler_support.html -// Support for [[likely]] / [[unlikely]] -// [X] GCC 12.2 -// [X] Clang 12 -// [ ] Clang 11 -#define LIBC_ATTR_LIKELY [[likely]] -#define LIBC_ATTR_UNLIKELY [[unlikely]] - -#if defined(LIBC_COMPILER_IS_CLANG) -#if LIBC_COMPILER_CLANG_VER < 1200 -#undef LIBC_ATTR_LIKELY -#undef LIBC_ATTR_UNLIKELY -#define LIBC_ATTR_LIKELY -#define LIBC_ATTR_UNLIKELY -#endif -#endif - namespace LIBC_NAMESPACE_DECL { namespace { -LIBC_INLINE_VAR constexpr size_t kWordSize = sizeof(uint32_t); - -enum Strategy { - ForceWordLdStChain, - AssumeWordAligned, - AssumeUnaligned, -}; +// Performs a copy of `bytes` byte from `src` to `dst`. This function has the +// semantics of `memcpy` where `src` and `dst` are `__restrict`. The compiler is +// free to use whatever instruction is best for the size and assumed access. +template +LIBC_INLINE void copy(void *dst, const void *src) { + if constexpr (access == AssumeAccess::kAligned) { + constexpr size_t alignment = bytes > kWordSize ? 
kWordSize : bytes; + memcpy_inline(assume_aligned(dst), + assume_aligned(src)); + } else if constexpr (access == AssumeAccess::kUnknown) { + memcpy_inline(dst, src); + } else { + static_assert(cpp::always_false, "Invalid AssumeAccess"); + } +} -template -LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) { - if constexpr (strategy == AssumeUnaligned) { - memcpy_inline(assume_aligned<1>(dst), assume_aligned<1>(src)); - } else if constexpr (strategy == AssumeWordAligned) { - static_assert(bytes >= kWordSize); - memcpy_inline(assume_aligned(dst), - assume_aligned(src)); - } else if constexpr (strategy == ForceWordLdStChain) { +template +LIBC_INLINE void copy_block_and_bump_pointers(Ptr &dst, CPtr &src) { + if constexpr (block_op == BlockOp::kFull) { + copy(dst, src); + } else if constexpr (block_op == BlockOp::kByWord) { // We restrict loads/stores to 4 byte to prevent the use of load/store - // multiple (LDM, STM) and load/store double (LDRD, STRD). First, they may - // fault (see notes below) and second, they use more registers which in turn - // adds push/pop instructions in the hot path. + // multiple (LDM, STM) and load/store double (LDRD, STRD). static_assert((bytes % kWordSize == 0) && (bytes >= kWordSize)); LIBC_LOOP_UNROLL - for (size_t i = 0; i < bytes / kWordSize; ++i) { - const size_t offset = i * kWordSize; - memcpy_inline(dst + offset, src + offset); + for (size_t offset = 0; offset < bytes; offset += kWordSize) { + copy(dst + offset, src + offset); } + } else { + static_assert(cpp::always_false, "Invalid BlockOp"); } // In the 1, 2, 4 byte copy case, the compiler can fold pointer offsetting // into the load/store instructions. 
@@ -72,39 +66,27 @@ LIBC_INLINE void copy_and_bump_pointers(Ptr &dst, CPtr &src) { src += bytes; } -LIBC_INLINE void copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, - const size_t size) { +template +LIBC_INLINE void consume_by_block(Ptr &dst, CPtr &src, size_t &size) { LIBC_LOOP_NOUNROLL - for (size_t i = 0; i < size; ++i) - *dst++ = *src++; + for (size_t i = 0; i < size / bytes; ++i) + copy_block_and_bump_pointers(dst, src); + size %= bytes; } -template -LIBC_INLINE void copy_blocks_and_update_args(Ptr &dst, CPtr &src, - size_t &size) { +[[maybe_unused]] LIBC_INLINE void +copy_bytes_and_bump_pointers(Ptr &dst, CPtr &src, size_t size) { LIBC_LOOP_NOUNROLL - for (size_t i = 0; i < size / block_size; ++i) - copy_and_bump_pointers(dst, src); - // Update `size` once at the end instead of once per iteration. - size %= block_size; -} - -LIBC_INLINE CPtr bitwise_or(CPtr a, CPtr b) { - return cpp::bit_cast(cpp::bit_cast(a) | - cpp::bit_cast(b)); -} - -LIBC_INLINE auto misaligned(CPtr a) { - return distance_to_align_down(a); + for (size_t i = 0; i < size; ++i) + *dst++ = *src++; } } // namespace -// Implementation for Cortex-M0, M0+, M1. -// Notes: -// - It compiles down to 196 bytes, but 220 bytes when used through `memcpy` -// that also needs to return the `dst` ptr. -// - These cores do not allow for unaligned loads/stores. +// Implementation for Cortex-M0, M0+, M1 cores that do not allow for unaligned +// loads/stores. It compiles down to 208 bytes when used through `memcpy` that +// also needs to return the `dst` ptr. +// Note: // - When `src` and `dst` are coaligned, we start by aligning them and perform // bulk copies. We let the compiler know the pointers are aligned so it can // use load/store multiple (LDM, STM). 
This significantly increase throughput @@ -121,13 +103,20 @@ LIBC_INLINE auto misaligned(CPtr a) { copy_bytes_and_bump_pointers(dst, src, offset); size -= offset; } + constexpr AssumeAccess kAligned = AssumeAccess::kAligned; const auto src_alignment = distance_to_align_down(src); if (src_alignment == 0) LIBC_ATTR_LIKELY { // Both `src` and `dst` are now word-aligned. - copy_blocks_and_update_args<64, AssumeWordAligned>(dst, src, size); - copy_blocks_and_update_args<16, AssumeWordAligned>(dst, src, size); - copy_blocks_and_update_args<4, AssumeWordAligned>(dst, src, size); + // We first copy by blocks of 64 bytes, the compiler will use 4 + // load/store multiple (LDM, STM), each of 4 words. This requires more + // registers so additional push/pop are needed but the speedup is worth + // it. + consume_by_block<64, BlockOp::kFull, kAligned>(dst, src, size); + // Then we use blocks of 4 word load/store. + consume_by_block<16, BlockOp::kByWord, kAligned>(dst, src, size); + // Then we use word by word copy. + consume_by_block<4, BlockOp::kByWord, kAligned>(dst, src, size); } else { // `dst` is aligned but `src` is not. @@ -138,7 +127,7 @@ LIBC_INLINE auto misaligned(CPtr a) { src_alignment == 2 ? load_aligned(src) : load_aligned(src); - memcpy_inline(assume_aligned(dst), &value); + copy(dst, &value); dst += kWordSize; src += kWordSize; size -= kWordSize; @@ -151,17 +140,8 @@ LIBC_INLINE auto misaligned(CPtr a) { } // Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware -// support for unaligned loads and stores. -// Notes: -// - It compiles down to 266 bytes. -// - `dst` and `src` are not `__restrict` to prevent the compiler from -// reordering loads/stores. -// - We keep state variables to a strict minimum to keep everything in the free -// registers and prevent costly push / pop. 
-// - If unaligned single loads/stores to normal memory are supported, unaligned -// accesses for load/store multiple (LDM, STM) and load/store double (LDRD, -// STRD) instructions are generally not supported and will still fault so we -// make sure to restrict unrolling to word loads/stores. +// support for unaligned loads and stores. It compiles down to 272 bytes when +// used through `memcpy` that also needs to return the `dst` ptr. [[maybe_unused]] LIBC_INLINE void inline_memcpy_arm_mid_end(Ptr dst, CPtr src, size_t size) { if (misaligned(bitwise_or(src, dst))) @@ -169,38 +149,59 @@ LIBC_INLINE auto misaligned(CPtr a) { if (size < 8) LIBC_ATTR_UNLIKELY { if (size & 1) - copy_and_bump_pointers<1>(dst, src); + copy_block_and_bump_pointers<1>(dst, src); if (size & 2) - copy_and_bump_pointers<2>(dst, src); + copy_block_and_bump_pointers<2>(dst, src); if (size & 4) - copy_and_bump_pointers<4>(dst, src); + copy_block_and_bump_pointers<4>(dst, src); return; } if (misaligned(src)) LIBC_ATTR_UNLIKELY { const size_t offset = distance_to_align_up(dst); if (offset & 1) - copy_and_bump_pointers<1>(dst, src); + copy_block_and_bump_pointers<1>(dst, src); if (offset & 2) - copy_and_bump_pointers<2>(dst, src); + copy_block_and_bump_pointers<2>(dst, src); size -= offset; } } - copy_blocks_and_update_args<64, ForceWordLdStChain>(dst, src, size); - copy_blocks_and_update_args<16, ForceWordLdStChain>(dst, src, size); - copy_blocks_and_update_args<4, AssumeUnaligned>(dst, src, size); + // `dst` and `src` are not necessarily both aligned at that point but this + // implementation assumes hardware support for unaligned loads and stores so + // it is still fast to perform unrolled word by word copy. Note that wider + // accesses through the use of load/store multiple (LDM, STM) and load/store + // double (LDRD, STRD) instructions are generally not supported and can fault. 
+ // By forcing decomposition of 64 bytes copy into word by word copy, the + // compiler uses a load to prefetch the next cache line: + // ldr r3, [r1, #64]! <- prefetch next cache line + // str r3, [r0] + // ldr r3, [r1, #0x4] + // str r3, [r0, #0x4] + // ... + // ldr r3, [r1, #0x3c] + // str r3, [r0, #0x3c] + // This is a bit detrimental for sizes between 64 and 256 (less than 10% + // penalty) but the prefetch yields better throughput for larger copies. + constexpr AssumeAccess kUnknown = AssumeAccess::kUnknown; + consume_by_block<64, BlockOp::kByWord, kUnknown>(dst, src, size); + consume_by_block<16, BlockOp::kByWord, kUnknown>(dst, src, size); + consume_by_block<4, BlockOp::kByWord, kUnknown>(dst, src, size); if (size & 1) - copy_and_bump_pointers<1>(dst, src); + copy_block_and_bump_pointers<1>(dst, src); if (size & 2) - LIBC_ATTR_UNLIKELY - copy_and_bump_pointers<2>(dst, src); + copy_block_and_bump_pointers<2>(dst, src); } -[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(void *__restrict dst_, - const void *__restrict src_, +[[maybe_unused]] LIBC_INLINE void inline_memcpy_arm(Ptr dst, CPtr src, size_t size) { - Ptr dst = cpp::bit_cast(dst_); - CPtr src = cpp::bit_cast(src_); + // The compiler performs alias analysis and is able to prove that `dst` and + // `src` do not alias by propagating the `__restrict` keyword from the + // `memcpy` prototype. This allows the compiler to merge consecutive + // load/store (LDR, STR) instructions generated in + // `copy_block_and_bump_pointers` with `BlockOp::kByWord` into load/store + // double (LDRD, STRD) instructions, this is is undesirable so we prevent the + // compiler from inferring `__restrict` with the following line. 
+ asm volatile("" : "+r"(dst), "+r"(src)); #ifdef __ARM_FEATURE_UNALIGNED return inline_memcpy_arm_mid_end(dst, src, size); #else @@ -210,8 +211,4 @@ LIBC_INLINE auto misaligned(CPtr a) { } // namespace LIBC_NAMESPACE_DECL -// Cleanup local macros -#undef LIBC_ATTR_LIKELY -#undef LIBC_ATTR_UNLIKELY - #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H diff --git a/libc/src/string/memory_utils/arm/inline_memset.h b/libc/src/string/memory_utils/arm/inline_memset.h new file mode 100644 index 0000000000000..a7ef9cc7df916 --- /dev/null +++ b/libc/src/string/memory_utils/arm/inline_memset.h @@ -0,0 +1,156 @@ +//===-- Memset implementation for arm ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// The functions defined in this file give approximate code size. 
These sizes +// assume the following configuration options: +// - LIBC_CONF_KEEP_FRAME_POINTER = false +// - LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR = false +// - LIBC_ADD_NULL_CHECKS = false +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMSET_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMSET_H + +#include "src/__support/CPP/type_traits.h" // always_false +#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/optimization.h" // LIBC_LOOP_NOUNROLL +#include "src/string/memory_utils/arm/common.h" // LIBC_ATTR_LIKELY, LIBC_ATTR_UNLIKELY +#include "src/string/memory_utils/utils.h" // memcpy_inline, distance_to_align + +#include // size_t + +namespace LIBC_NAMESPACE_DECL { + +namespace { + +template +LIBC_INLINE void set(void *dst, uint32_t value) { + static_assert(bytes == 1 || bytes == 2 || bytes == 4); + if constexpr (access == AssumeAccess::kAligned) { + constexpr size_t alignment = bytes > kWordSize ? kWordSize : bytes; + memcpy_inline(assume_aligned(dst), &value); + } else if constexpr (access == AssumeAccess::kUnknown) { + memcpy_inline(dst, &value); + } else { + static_assert(cpp::always_false, "Invalid AssumeAccess"); + } +} + +template +LIBC_INLINE void set_block_and_bump_pointers(Ptr &dst, uint32_t value) { + if constexpr (bytes <= kWordSize) { + set(dst, value); + } else { + static_assert(bytes % kWordSize == 0 && bytes >= kWordSize); + LIBC_LOOP_UNROLL + for (size_t offset = 0; offset < bytes; offset += kWordSize) { + set(dst + offset, value); + } + } + // In the 1, 2, 4 byte set case, the compiler can fold pointer offsetting + // into the store instructions. 
+ // e.g., + // strb r3, [r0], #1 + dst += bytes; +} + +template +LIBC_INLINE void consume_by_block(Ptr &dst, uint32_t value, size_t &size) { + LIBC_LOOP_NOUNROLL + for (size_t i = 0; i < size / bytes; ++i) + set_block_and_bump_pointers(dst, value); + size %= bytes; +} + +[[maybe_unused]] LIBC_INLINE void +set_bytes_and_bump_pointers(Ptr &dst, uint32_t value, size_t size) { + LIBC_LOOP_NOUNROLL + for (size_t i = 0; i < size; ++i) { + set<1, AssumeAccess::kUnknown>(dst++, value); + } +} + +} // namespace + +// Implementation for Cortex-M0, M0+, M1. It compiles down to 140 bytes when +// used through `memset` that also needs to return the `dst` ptr. These cores do +// not allow unaligned stores so all accesses are aligned. +[[maybe_unused]] LIBC_INLINE void +inline_memset_arm_low_end(Ptr dst, uint8_t value, size_t size) { + if (size >= 8) + LIBC_ATTR_LIKELY { + // Align `dst` to word boundary. + if (const size_t offset = distance_to_align_up(dst)) + LIBC_ATTR_UNLIKELY { + set_bytes_and_bump_pointers(dst, value, offset); + size -= offset; + } + const uint32_t value32 = value * 0x01010101U; // splat value in each byte + consume_by_block<64, AssumeAccess::kAligned>(dst, value32, size); + consume_by_block<16, AssumeAccess::kAligned>(dst, value32, size); + consume_by_block<4, AssumeAccess::kAligned>(dst, value32, size); + } + set_bytes_and_bump_pointers(dst, value, size); +} + +// Implementation for Cortex-M3, M4, M7, M23, M33, M35P, M52 with hardware +// support for unaligned loads and stores. It compiles down to 186 bytes when +// used through `memset` that also needs to return the `dst` ptr. 
+[[maybe_unused]] LIBC_INLINE void +inline_memset_arm_mid_end(Ptr dst, uint8_t value, size_t size) { + const uint32_t value32 = value * 0x01010101U; // splat value in each byte + if (misaligned(dst)) + LIBC_ATTR_UNLIKELY { + if (size < 8) + LIBC_ATTR_UNLIKELY { + if (size & 1) + set_block_and_bump_pointers<1>(dst, value32); + if (size & 2) + set_block_and_bump_pointers<2>(dst, value32); + if (size & 4) + set_block_and_bump_pointers<4>(dst, value32); + return; + } + const size_t offset = distance_to_align_up(dst); + if (offset & 1) + set_block_and_bump_pointers<1>(dst, value32); + if (offset & 2) + set_block_and_bump_pointers<2>(dst, value32); + size -= offset; + } + // If we tell the compiler that the stores are aligned it will generate 8 x + // STRD instructions. By not specifying alignment, the compiler conservatively + // uses 16 x STR.W and is able to use the first one to prefetch the + // destination in advance leading to better asymptotic performances. + // str r12, [r3, #64]! <- prefetch next cache line + // str.w r12, [r3, #0x4] + // str.w r12, [r3, #0x8] + // ... + // str.w r12, [r3, #0x38] + // str.w r12, [r3, #0x3c] + consume_by_block<64, AssumeAccess::kUnknown>(dst, value32, size); + // Prefetching does not matter anymore at this scale so using STRD yields + // better results. 
+ consume_by_block<16, AssumeAccess::kAligned>(dst, value32, size); + consume_by_block<4, AssumeAccess::kAligned>(dst, value32, size); + if (size & 1) + set_block_and_bump_pointers<1>(dst, value32); + if (size & 2) + LIBC_ATTR_UNLIKELY + set_block_and_bump_pointers<2>(dst, value32); +} + +[[maybe_unused]] LIBC_INLINE void +inline_memset_arm_dispatch(Ptr dst, uint8_t value, size_t size) { +#ifdef __ARM_FEATURE_UNALIGNED + return inline_memset_arm_mid_end(dst, value, size); +#else + return inline_memset_arm_low_end(dst, value, size); +#endif +} + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ARM_INLINE_MEMCPY_H diff --git a/libc/src/string/memory_utils/inline_memset.h b/libc/src/string/memory_utils/inline_memset.h index fd9c29ea4410a..e41bdb626d60e 100644 --- a/libc/src/string/memory_utils/inline_memset.h +++ b/libc/src/string/memory_utils/inline_memset.h @@ -18,6 +18,9 @@ #if defined(LIBC_TARGET_ARCH_IS_X86) #include "src/string/memory_utils/x86_64/inline_memset.h" #define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_x86 +#elif defined(LIBC_TARGET_ARCH_IS_ARM) +#include "src/string/memory_utils/arm/inline_memset.h" +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_arm_dispatch #elif defined(LIBC_TARGET_ARCH_IS_AARCH64) #include "src/string/memory_utils/aarch64/inline_memset.h" #define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET inline_memset_aarch64_dispatch @@ -34,7 +37,8 @@ namespace LIBC_NAMESPACE_DECL { -LIBC_INLINE static void inline_memset(void *dst, uint8_t value, size_t count) { +[[gnu::flatten]] LIBC_INLINE void inline_memset(void *dst, uint8_t value, + size_t count) { LIBC_SRC_STRING_MEMORY_UTILS_MEMSET(reinterpret_cast(dst), value, count); } diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 8b60ca13562f6..47464d448f997 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ 
-4457,7 +4457,9 @@ libc_support_library( "src/string/memory_utils/aarch64/inline_memcpy.h", "src/string/memory_utils/aarch64/inline_memmove.h", "src/string/memory_utils/aarch64/inline_memset.h", + "src/string/memory_utils/arm/common.h", "src/string/memory_utils/arm/inline_memcpy.h", + "src/string/memory_utils/arm/inline_memset.h", "src/string/memory_utils/generic/aligned_access.h", "src/string/memory_utils/generic/byte_per_byte.h", "src/string/memory_utils/inline_bcmp.h", From 742147ba1bb26d7a69d4289b6ad9a07bd019a2ae Mon Sep 17 00:00:00 2001 From: Djordje Todorovic Date: Thu, 17 Jul 2025 10:36:31 +0200 Subject: [PATCH 148/813] [llvm-objcopy][libObject] Add RISC-V big-endian support (#146913) Add support for big-endian RISC-V ELF files: - Add riscv32be/riscv64be target architectures to Triple - Support elf32-bigriscv and elf64-bigriscv output targets in llvm-objcopy - Update ELFObjectFile to handle BE RISC-V format strings and architecture detection - Add BE RISC-V support to RelocationResolver - Add tests for new functionality This is a subset of a bigger RISC-V big-endian support patch, containing only the llvm-objcopy and libObject changes. Other changes will be added later. 
--- clang/test/Driver/frame-pointer-elim.c | 2 +- llvm/include/llvm/Object/ELFObjectFile.h | 8 +- llvm/include/llvm/TargetParser/Triple.h | 122 +++---- llvm/lib/Object/RelocationResolver.cpp | 6 +- llvm/lib/TargetParser/Triple.cpp | 324 ++++++++++-------- .../ELF/binary-output-target.test | 6 + llvm/tools/llvm-objcopy/ObjcopyOptions.cpp | 2 + llvm/unittests/Object/ELFObjectFileTest.cpp | 8 +- 8 files changed, 264 insertions(+), 214 deletions(-) diff --git a/clang/test/Driver/frame-pointer-elim.c b/clang/test/Driver/frame-pointer-elim.c index f00940bd7613d..6e21671f43775 100644 --- a/clang/test/Driver/frame-pointer-elim.c +++ b/clang/test/Driver/frame-pointer-elim.c @@ -162,7 +162,7 @@ // RUN: FileCheck --check-prefix=KEEP-ALL %s // RUN: %clang -### --target=riscv64-linux-android -O1 -S %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s -// RUN: not %clang -### --target=riscv64-linux-android -mbig-endian -O1 -S %s 2>&1 | \ +// RUN: %clang -### --target=riscv64-linux-android -mbig-endian -O1 -S %s 2>&1 | \ // RUN: FileCheck --check-prefix=KEEP-NON-LEAF %s // On ARM backend bare metal targets, frame pointer is omitted diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h index 103686884e705..a3aa0d9c137a2 100644 --- a/llvm/include/llvm/Object/ELFObjectFile.h +++ b/llvm/include/llvm/Object/ELFObjectFile.h @@ -1312,7 +1312,7 @@ StringRef ELFObjectFile::getFileFormatName() const { case ELF::EM_PPC: return (IsLittleEndian ? "elf32-powerpcle" : "elf32-powerpc"); case ELF::EM_RISCV: - return "elf32-littleriscv"; + return (IsLittleEndian ? "elf32-littleriscv" : "elf32-bigriscv"); case ELF::EM_CSKY: return "elf32-csky"; case ELF::EM_SPARC: @@ -1338,7 +1338,7 @@ StringRef ELFObjectFile::getFileFormatName() const { case ELF::EM_PPC64: return (IsLittleEndian ? "elf64-powerpcle" : "elf64-powerpc"); case ELF::EM_RISCV: - return "elf64-littleriscv"; + return (IsLittleEndian ? 
"elf64-littleriscv" : "elf64-bigriscv"); case ELF::EM_S390: return "elf64-s390"; case ELF::EM_SPARCV9: @@ -1400,9 +1400,9 @@ template Triple::ArchType ELFObjectFile::getArch() const { case ELF::EM_RISCV: switch (EF.getHeader().e_ident[ELF::EI_CLASS]) { case ELF::ELFCLASS32: - return Triple::riscv32; + return IsLittleEndian ? Triple::riscv32 : Triple::riscv32be; case ELF::ELFCLASS64: - return Triple::riscv64; + return IsLittleEndian ? Triple::riscv64 : Triple::riscv64be; default: report_fatal_error("Invalid ELFCLASS!"); } diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 57d771b80251a..670a6321fdc02 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -49,62 +49,64 @@ class Triple { enum ArchType { UnknownArch, - arm, // ARM (little endian): arm, armv.*, xscale - armeb, // ARM (big endian): armeb - aarch64, // AArch64 (little endian): aarch64 - aarch64_be, // AArch64 (big endian): aarch64_be - aarch64_32, // AArch64 (little endian) ILP32: aarch64_32 - arc, // ARC: Synopsys ARC - avr, // AVR: Atmel AVR microcontroller - bpfel, // eBPF or extended BPF or 64-bit BPF (little endian) - bpfeb, // eBPF or extended BPF or 64-bit BPF (big endian) - csky, // CSKY: csky - dxil, // DXIL 32-bit DirectX bytecode - hexagon, // Hexagon: hexagon - loongarch32, // LoongArch (32-bit): loongarch32 - loongarch64, // LoongArch (64-bit): loongarch64 - m68k, // M68k: Motorola 680x0 family - mips, // MIPS: mips, mipsallegrex, mipsr6 - mipsel, // MIPSEL: mipsel, mipsallegrexe, mipsr6el - mips64, // MIPS64: mips64, mips64r6, mipsn32, mipsn32r6 - mips64el, // MIPS64EL: mips64el, mips64r6el, mipsn32el, mipsn32r6el - msp430, // MSP430: msp430 - ppc, // PPC: powerpc - ppcle, // PPCLE: powerpc (little endian) - ppc64, // PPC64: powerpc64, ppu - ppc64le, // PPC64LE: powerpc64le - r600, // R600: AMD GPUs HD2XXX - HD6XXX - amdgcn, // AMDGCN: AMD GCN GPUs - riscv32, // RISC-V (32-bit): riscv32 - 
riscv64, // RISC-V (64-bit): riscv64 - sparc, // Sparc: sparc - sparcv9, // Sparcv9: Sparcv9 - sparcel, // Sparc: (endianness = little). NB: 'Sparcle' is a CPU variant - systemz, // SystemZ: s390x - tce, // TCE (http://tce.cs.tut.fi/): tce - tcele, // TCE little endian (http://tce.cs.tut.fi/): tcele - thumb, // Thumb (little endian): thumb, thumbv.* - thumbeb, // Thumb (big endian): thumbeb - x86, // X86: i[3-9]86 - x86_64, // X86-64: amd64, x86_64 - xcore, // XCore: xcore - xtensa, // Tensilica: Xtensa - nvptx, // NVPTX: 32-bit - nvptx64, // NVPTX: 64-bit - amdil, // AMDIL - amdil64, // AMDIL with 64-bit pointers - hsail, // AMD HSAIL - hsail64, // AMD HSAIL with 64-bit pointers - spir, // SPIR: standard portable IR for OpenCL 32-bit version - spir64, // SPIR: standard portable IR for OpenCL 64-bit version - spirv, // SPIR-V with logical memory layout. - spirv32, // SPIR-V with 32-bit pointers - spirv64, // SPIR-V with 64-bit pointers - kalimba, // Kalimba: generic kalimba - shave, // SHAVE: Movidius vector VLIW processors - lanai, // Lanai: Lanai 32-bit - wasm32, // WebAssembly with 32-bit pointers - wasm64, // WebAssembly with 64-bit pointers + arm, // ARM (little endian): arm, armv.*, xscale + armeb, // ARM (big endian): armeb + aarch64, // AArch64 (little endian): aarch64 + aarch64_be, // AArch64 (big endian): aarch64_be + aarch64_32, // AArch64 (little endian) ILP32: aarch64_32 + arc, // ARC: Synopsys ARC + avr, // AVR: Atmel AVR microcontroller + bpfel, // eBPF or extended BPF or 64-bit BPF (little endian) + bpfeb, // eBPF or extended BPF or 64-bit BPF (big endian) + csky, // CSKY: csky + dxil, // DXIL 32-bit DirectX bytecode + hexagon, // Hexagon: hexagon + loongarch32, // LoongArch (32-bit): loongarch32 + loongarch64, // LoongArch (64-bit): loongarch64 + m68k, // M68k: Motorola 680x0 family + mips, // MIPS: mips, mipsallegrex, mipsr6 + mipsel, // MIPSEL: mipsel, mipsallegrexe, mipsr6el + mips64, // MIPS64: mips64, mips64r6, mipsn32, mipsn32r6 + mips64el, 
// MIPS64EL: mips64el, mips64r6el, mipsn32el, mipsn32r6el + msp430, // MSP430: msp430 + ppc, // PPC: powerpc + ppcle, // PPCLE: powerpc (little endian) + ppc64, // PPC64: powerpc64, ppu + ppc64le, // PPC64LE: powerpc64le + r600, // R600: AMD GPUs HD2XXX - HD6XXX + amdgcn, // AMDGCN: AMD GCN GPUs + riscv32, // RISC-V (32-bit, little endian): riscv32 + riscv64, // RISC-V (64-bit, little endian): riscv64 + riscv32be, // RISC-V (32-bit, big endian): riscv32be + riscv64be, // RISC-V (64-bit, big endian): riscv64be + sparc, // Sparc: sparc + sparcv9, // Sparcv9: Sparcv9 + sparcel, // Sparc: (endianness = little). NB: 'Sparcle' is a CPU variant + systemz, // SystemZ: s390x + tce, // TCE (http://tce.cs.tut.fi/): tce + tcele, // TCE little endian (http://tce.cs.tut.fi/): tcele + thumb, // Thumb (little endian): thumb, thumbv.* + thumbeb, // Thumb (big endian): thumbeb + x86, // X86: i[3-9]86 + x86_64, // X86-64: amd64, x86_64 + xcore, // XCore: xcore + xtensa, // Tensilica: Xtensa + nvptx, // NVPTX: 32-bit + nvptx64, // NVPTX: 64-bit + amdil, // AMDIL + amdil64, // AMDIL with 64-bit pointers + hsail, // AMD HSAIL + hsail64, // AMD HSAIL with 64-bit pointers + spir, // SPIR: standard portable IR for OpenCL 32-bit version + spir64, // SPIR: standard portable IR for OpenCL 64-bit version + spirv, // SPIR-V with logical memory layout. + spirv32, // SPIR-V with 32-bit pointers + spirv64, // SPIR-V with 64-bit pointers + kalimba, // Kalimba: generic kalimba + shave, // SHAVE: Movidius vector VLIW processors + lanai, // Lanai: Lanai 32-bit + wasm32, // WebAssembly with 32-bit pointers + wasm64, // WebAssembly with 64-bit pointers renderscript32, // 32-bit RenderScript renderscript64, // 64-bit RenderScript ve, // NEC SX-Aurora Vector Engine @@ -1064,10 +1066,14 @@ class Triple { } /// Tests whether the target is 32-bit RISC-V. 
- bool isRISCV32() const { return getArch() == Triple::riscv32; } + bool isRISCV32() const { + return getArch() == Triple::riscv32 || getArch() == Triple::riscv32be; + } /// Tests whether the target is 64-bit RISC-V. - bool isRISCV64() const { return getArch() == Triple::riscv64; } + bool isRISCV64() const { + return getArch() == Triple::riscv64 || getArch() == Triple::riscv64be; + } /// Tests whether the target is RISC-V (32- and 64-bit). bool isRISCV() const { return isRISCV32() || isRISCV64(); } diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp index b6318bbe3ab74..d81899334b2b1 100644 --- a/llvm/lib/Object/RelocationResolver.cpp +++ b/llvm/lib/Object/RelocationResolver.cpp @@ -812,6 +812,7 @@ getRelocationResolver(const ObjectFile &Obj) { case Triple::amdgcn: return {supportsAmdgpu, resolveAmdgpu}; case Triple::riscv64: + case Triple::riscv64be: return {supportsRISCV, resolveRISCV}; default: if (isAMDGPU(Obj)) @@ -851,6 +852,7 @@ getRelocationResolver(const ObjectFile &Obj) { case Triple::r600: return {supportsAmdgpu, resolveAmdgpu}; case Triple::riscv32: + case Triple::riscv32be: return {supportsRISCV, resolveRISCV}; case Triple::csky: return {supportsCSKY, resolveCSKY}; @@ -897,7 +899,9 @@ uint64_t resolveRelocation(RelocationResolver Resolver, const RelocationRef &R, if (Obj->getArch() != Triple::loongarch32 && Obj->getArch() != Triple::loongarch64 && Obj->getArch() != Triple::riscv32 && - Obj->getArch() != Triple::riscv64) + Obj->getArch() != Triple::riscv64 && + Obj->getArch() != Triple::riscv32be && + Obj->getArch() != Triple::riscv64be) LocData = 0; } } diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index bcc60c53484e4..be51453ee21d7 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -64,6 +64,10 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case renderscript64: return "renderscript64"; case riscv32: return "riscv32"; case riscv64: 
return "riscv64"; + case riscv32be: + return "riscv32be"; + case riscv64be: + return "riscv64be"; case shave: return "shave"; case sparc: return "sparc"; case sparcel: return "sparcel"; @@ -238,7 +242,10 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) { case wasm64: return "wasm"; case riscv32: - case riscv64: return "riscv"; + case riscv64: + case riscv32be: + case riscv64be: + return "riscv"; case ve: return "ve"; case csky: return "csky"; @@ -426,71 +433,73 @@ static Triple::ArchType parseBPFArch(StringRef ArchName) { Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { Triple::ArchType BPFArch(parseBPFArch(Name)); return StringSwitch(Name) - .Case("aarch64", aarch64) - .Case("aarch64_be", aarch64_be) - .Case("aarch64_32", aarch64_32) - .Case("arc", arc) - .Case("arm64", aarch64) // "arm64" is an alias for "aarch64" - .Case("arm64_32", aarch64_32) - .Case("arm", arm) - .Case("armeb", armeb) - .Case("avr", avr) - .StartsWith("bpf", BPFArch) - .Case("m68k", m68k) - .Case("mips", mips) - .Case("mipsel", mipsel) - .Case("mips64", mips64) - .Case("mips64el", mips64el) - .Case("msp430", msp430) - .Case("ppc64", ppc64) - .Case("ppc32", ppc) - .Case("ppc", ppc) - .Case("ppc32le", ppcle) - .Case("ppcle", ppcle) - .Case("ppc64le", ppc64le) - .Case("r600", r600) - .Case("amdgcn", amdgcn) - .Case("riscv32", riscv32) - .Case("riscv64", riscv64) - .Case("hexagon", hexagon) - .Case("sparc", sparc) - .Case("sparcel", sparcel) - .Case("sparcv9", sparcv9) - .Case("s390x", systemz) - .Case("systemz", systemz) - .Case("tce", tce) - .Case("tcele", tcele) - .Case("thumb", thumb) - .Case("thumbeb", thumbeb) - .Case("x86", x86) - .Case("i386", x86) - .Case("x86-64", x86_64) - .Case("xcore", xcore) - .Case("nvptx", nvptx) - .Case("nvptx64", nvptx64) - .Case("amdil", amdil) - .Case("amdil64", amdil64) - .Case("hsail", hsail) - .Case("hsail64", hsail64) - .Case("spir", spir) - .Case("spir64", spir64) - .Case("spirv", spirv) - .Case("spirv32", spirv32) - .Case("spirv64", 
spirv64) - .Case("kalimba", kalimba) - .Case("lanai", lanai) - .Case("shave", shave) - .Case("wasm32", wasm32) - .Case("wasm64", wasm64) - .Case("renderscript32", renderscript32) - .Case("renderscript64", renderscript64) - .Case("ve", ve) - .Case("csky", csky) - .Case("loongarch32", loongarch32) - .Case("loongarch64", loongarch64) - .Case("dxil", dxil) - .Case("xtensa", xtensa) - .Default(UnknownArch); + .Case("aarch64", aarch64) + .Case("aarch64_be", aarch64_be) + .Case("aarch64_32", aarch64_32) + .Case("arc", arc) + .Case("arm64", aarch64) // "arm64" is an alias for "aarch64" + .Case("arm64_32", aarch64_32) + .Case("arm", arm) + .Case("armeb", armeb) + .Case("avr", avr) + .StartsWith("bpf", BPFArch) + .Case("m68k", m68k) + .Case("mips", mips) + .Case("mipsel", mipsel) + .Case("mips64", mips64) + .Case("mips64el", mips64el) + .Case("msp430", msp430) + .Case("ppc64", ppc64) + .Case("ppc32", ppc) + .Case("ppc", ppc) + .Case("ppc32le", ppcle) + .Case("ppcle", ppcle) + .Case("ppc64le", ppc64le) + .Case("r600", r600) + .Case("amdgcn", amdgcn) + .Case("riscv32", riscv32) + .Case("riscv64", riscv64) + .Case("riscv32be", riscv32be) + .Case("riscv64be", riscv64be) + .Case("hexagon", hexagon) + .Case("sparc", sparc) + .Case("sparcel", sparcel) + .Case("sparcv9", sparcv9) + .Case("s390x", systemz) + .Case("systemz", systemz) + .Case("tce", tce) + .Case("tcele", tcele) + .Case("thumb", thumb) + .Case("thumbeb", thumbeb) + .Case("x86", x86) + .Case("i386", x86) + .Case("x86-64", x86_64) + .Case("xcore", xcore) + .Case("nvptx", nvptx) + .Case("nvptx64", nvptx64) + .Case("amdil", amdil) + .Case("amdil64", amdil64) + .Case("hsail", hsail) + .Case("hsail64", hsail64) + .Case("spir", spir) + .Case("spir64", spir64) + .Case("spirv", spirv) + .Case("spirv32", spirv32) + .Case("spirv64", spirv64) + .Case("kalimba", kalimba) + .Case("lanai", lanai) + .Case("shave", shave) + .Case("wasm32", wasm32) + .Case("wasm64", wasm64) + .Case("renderscript32", renderscript32) + 
.Case("renderscript64", renderscript64) + .Case("ve", ve) + .Case("csky", csky) + .Case("loongarch32", loongarch32) + .Case("loongarch64", loongarch64) + .Case("dxil", dxil) + .Case("xtensa", xtensa) + .Default(UnknownArch); } static Triple::ArchType parseARMArch(StringRef ArchName) { @@ -559,84 +568,85 @@ static Triple::ArchType parseARMArch(StringRef ArchName) { } static Triple::ArchType parseArch(StringRef ArchName) { - auto AT = - StringSwitch(ArchName) - .Cases("i386", "i486", "i586", "i686", Triple::x86) - // FIXME: Do we need to support these? - .Cases("i786", "i886", "i986", Triple::x86) - .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64) - .Cases("powerpc", "powerpcspe", "ppc", "ppc32", Triple::ppc) - .Cases("powerpcle", "ppcle", "ppc32le", Triple::ppcle) - .Cases("powerpc64", "ppu", "ppc64", Triple::ppc64) - .Cases("powerpc64le", "ppc64le", Triple::ppc64le) - .Case("xscale", Triple::arm) - .Case("xscaleeb", Triple::armeb) - .Case("aarch64", Triple::aarch64) - .Case("aarch64_be", Triple::aarch64_be) - .Case("aarch64_32", Triple::aarch64_32) - .Case("arc", Triple::arc) - .Case("arm64", Triple::aarch64) - .Case("arm64_32", Triple::aarch64_32) - .Case("arm64e", Triple::aarch64) - .Case("arm64ec", Triple::aarch64) - .Case("arm", Triple::arm) - .Case("armeb", Triple::armeb) - .Case("thumb", Triple::thumb) - .Case("thumbeb", Triple::thumbeb) - .Case("avr", Triple::avr) - .Case("m68k", Triple::m68k) - .Case("msp430", Triple::msp430) - .Cases("mips", "mipseb", "mipsallegrex", "mipsisa32r6", "mipsr6", - Triple::mips) - .Cases("mipsel", "mipsallegrexel", "mipsisa32r6el", "mipsr6el", - Triple::mipsel) - .Cases("mips64", "mips64eb", "mipsn32", "mipsisa64r6", "mips64r6", - "mipsn32r6", Triple::mips64) - .Cases("mips64el", "mipsn32el", "mipsisa64r6el", "mips64r6el", - "mipsn32r6el", Triple::mips64el) - .Case("r600", Triple::r600) - .Case("amdgcn", Triple::amdgcn) - .Case("riscv32", Triple::riscv32) - .Case("riscv64", Triple::riscv64) - .Case("hexagon", 
Triple::hexagon) - .Cases("s390x", "systemz", Triple::systemz) - .Case("sparc", Triple::sparc) - .Case("sparcel", Triple::sparcel) - .Cases("sparcv9", "sparc64", Triple::sparcv9) - .Case("tce", Triple::tce) - .Case("tcele", Triple::tcele) - .Case("xcore", Triple::xcore) - .Case("nvptx", Triple::nvptx) - .Case("nvptx64", Triple::nvptx64) - .Case("amdil", Triple::amdil) - .Case("amdil64", Triple::amdil64) - .Case("hsail", Triple::hsail) - .Case("hsail64", Triple::hsail64) - .Case("spir", Triple::spir) - .Case("spir64", Triple::spir64) - .Cases("spirv", "spirv1.5", "spirv1.6", Triple::spirv) - .Cases("spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2", - "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", - "spirv32v1.6", Triple::spirv32) - .Cases("spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2", - "spirv64v1.3", "spirv64v1.4", "spirv64v1.5", - "spirv64v1.6", Triple::spirv64) - .StartsWith("kalimba", Triple::kalimba) - .Case("lanai", Triple::lanai) - .Case("renderscript32", Triple::renderscript32) - .Case("renderscript64", Triple::renderscript64) - .Case("shave", Triple::shave) - .Case("ve", Triple::ve) - .Case("wasm32", Triple::wasm32) - .Case("wasm64", Triple::wasm64) - .Case("csky", Triple::csky) - .Case("loongarch32", Triple::loongarch32) - .Case("loongarch64", Triple::loongarch64) - .Cases("dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3", - "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7", "dxilv1.8", - Triple::dxil) - .Case("xtensa", Triple::xtensa) - .Default(Triple::UnknownArch); + auto AT = StringSwitch(ArchName) + .Cases("i386", "i486", "i586", "i686", Triple::x86) + // FIXME: Do we need to support these? 
+ .Cases("i786", "i886", "i986", Triple::x86) + .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64) + .Cases("powerpc", "powerpcspe", "ppc", "ppc32", Triple::ppc) + .Cases("powerpcle", "ppcle", "ppc32le", Triple::ppcle) + .Cases("powerpc64", "ppu", "ppc64", Triple::ppc64) + .Cases("powerpc64le", "ppc64le", Triple::ppc64le) + .Case("xscale", Triple::arm) + .Case("xscaleeb", Triple::armeb) + .Case("aarch64", Triple::aarch64) + .Case("aarch64_be", Triple::aarch64_be) + .Case("aarch64_32", Triple::aarch64_32) + .Case("arc", Triple::arc) + .Case("arm64", Triple::aarch64) + .Case("arm64_32", Triple::aarch64_32) + .Case("arm64e", Triple::aarch64) + .Case("arm64ec", Triple::aarch64) + .Case("arm", Triple::arm) + .Case("armeb", Triple::armeb) + .Case("thumb", Triple::thumb) + .Case("thumbeb", Triple::thumbeb) + .Case("avr", Triple::avr) + .Case("m68k", Triple::m68k) + .Case("msp430", Triple::msp430) + .Cases("mips", "mipseb", "mipsallegrex", "mipsisa32r6", + "mipsr6", Triple::mips) + .Cases("mipsel", "mipsallegrexel", "mipsisa32r6el", "mipsr6el", + Triple::mipsel) + .Cases("mips64", "mips64eb", "mipsn32", "mipsisa64r6", + "mips64r6", "mipsn32r6", Triple::mips64) + .Cases("mips64el", "mipsn32el", "mipsisa64r6el", "mips64r6el", + "mipsn32r6el", Triple::mips64el) + .Case("r600", Triple::r600) + .Case("amdgcn", Triple::amdgcn) + .Case("riscv32", Triple::riscv32) + .Case("riscv64", Triple::riscv64) + .Case("riscv32be", Triple::riscv32be) + .Case("riscv64be", Triple::riscv64be) + .Case("hexagon", Triple::hexagon) + .Cases("s390x", "systemz", Triple::systemz) + .Case("sparc", Triple::sparc) + .Case("sparcel", Triple::sparcel) + .Cases("sparcv9", "sparc64", Triple::sparcv9) + .Case("tce", Triple::tce) + .Case("tcele", Triple::tcele) + .Case("xcore", Triple::xcore) + .Case("nvptx", Triple::nvptx) + .Case("nvptx64", Triple::nvptx64) + .Case("amdil", Triple::amdil) + .Case("amdil64", Triple::amdil64) + .Case("hsail", Triple::hsail) + .Case("hsail64", Triple::hsail64) + .Case("spir", 
Triple::spir) + .Case("spir64", Triple::spir64) + .Cases("spirv", "spirv1.5", "spirv1.6", Triple::spirv) + .Cases("spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2", + "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", + "spirv32v1.6", Triple::spirv32) + .Cases("spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2", + "spirv64v1.3", "spirv64v1.4", "spirv64v1.5", + "spirv64v1.6", Triple::spirv64) + .StartsWith("kalimba", Triple::kalimba) + .Case("lanai", Triple::lanai) + .Case("renderscript32", Triple::renderscript32) + .Case("renderscript64", Triple::renderscript64) + .Case("shave", Triple::shave) + .Case("ve", Triple::ve) + .Case("wasm32", Triple::wasm32) + .Case("wasm64", Triple::wasm64) + .Case("csky", Triple::csky) + .Case("loongarch32", Triple::loongarch32) + .Case("loongarch64", Triple::loongarch64) + .Cases("dxil", "dxilv1.0", "dxilv1.1", "dxilv1.2", "dxilv1.3", + "dxilv1.4", "dxilv1.5", "dxilv1.6", "dxilv1.7", + "dxilv1.8", Triple::dxil) + .Case("xtensa", Triple::xtensa) + .Default(Triple::UnknownArch); // Some architectures require special parsing logic just to compute the // ArchType result. 
@@ -966,6 +976,8 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::renderscript64: case Triple::riscv32: case Triple::riscv64: + case Triple::riscv32be: + case Triple::riscv64be: case Triple::shave: case Triple::sparc: case Triple::sparcel: @@ -1688,6 +1700,7 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::r600: case llvm::Triple::renderscript32: case llvm::Triple::riscv32: + case llvm::Triple::riscv32be: case llvm::Triple::shave: case llvm::Triple::sparc: case llvm::Triple::sparcel: @@ -1718,6 +1731,7 @@ unsigned Triple::getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::ppc64le: case llvm::Triple::renderscript64: case llvm::Triple::riscv64: + case llvm::Triple::riscv64be: case llvm::Triple::sparcv9: case llvm::Triple::spirv: case llvm::Triple::spir64: @@ -1796,6 +1810,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::r600: case Triple::renderscript32: case Triple::riscv32: + case Triple::riscv32be: case Triple::shave: case Triple::sparc: case Triple::sparcel: @@ -1828,6 +1843,9 @@ Triple Triple::get32BitArchVariant() const { case Triple::ppc64le: T.setArch(Triple::ppcle); break; case Triple::renderscript64: T.setArch(Triple::renderscript32); break; case Triple::riscv64: T.setArch(Triple::riscv32); break; + case Triple::riscv64be: + T.setArch(Triple::riscv32be); + break; case Triple::sparcv9: T.setArch(Triple::sparc); break; case Triple::spir64: T.setArch(Triple::spir); break; case Triple::spirv: @@ -1878,6 +1896,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::ppc64le: case Triple::renderscript64: case Triple::riscv64: + case Triple::riscv64be: case Triple::sparcv9: case Triple::spir64: case Triple::spirv64: @@ -1905,6 +1924,9 @@ Triple Triple::get64BitArchVariant() const { case Triple::ppcle: T.setArch(Triple::ppc64le); break; case Triple::renderscript32: T.setArch(Triple::renderscript64); break; case Triple::riscv32: 
T.setArch(Triple::riscv64); break; + case Triple::riscv32be: + T.setArch(Triple::riscv64be); + break; case Triple::sparc: T.setArch(Triple::sparcv9); break; case Triple::spir: T.setArch(Triple::spir64); break; case Triple::spirv: @@ -1943,8 +1965,6 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::r600: case Triple::renderscript32: case Triple::renderscript64: - case Triple::riscv32: - case Triple::riscv64: case Triple::shave: case Triple::spir64: case Triple::spir: @@ -1977,6 +1997,12 @@ Triple Triple::getBigEndianArchVariant() const { break; case Triple::ppcle: T.setArch(Triple::ppc); break; case Triple::ppc64le: T.setArch(Triple::ppc64); break; + case Triple::riscv32: + T.setArch(Triple::riscv32be); + break; + case Triple::riscv64: + T.setArch(Triple::riscv64be); + break; case Triple::sparcel: T.setArch(Triple::sparc); break; case Triple::tcele: T.setArch(Triple::tce); break; default: @@ -2014,6 +2040,12 @@ Triple Triple::getLittleEndianArchVariant() const { break; case Triple::ppc: T.setArch(Triple::ppcle); break; case Triple::ppc64: T.setArch(Triple::ppc64le); break; + case Triple::riscv32be: + T.setArch(Triple::riscv32); + break; + case Triple::riscv64be: + T.setArch(Triple::riscv64); + break; case Triple::sparc: T.setArch(Triple::sparcel); break; case Triple::tce: T.setArch(Triple::tcele); break; default: diff --git a/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test b/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test index f88b7575002a9..3547b728a426d 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test +++ b/llvm/test/tools/llvm-objcopy/ELF/binary-output-target.test @@ -33,6 +33,12 @@ # RUN: llvm-objcopy -I binary -O elf64-littleriscv %t.txt %t.rv64.o # RUN: llvm-readobj --file-headers %t.rv64.o | FileCheck %s --check-prefixes=CHECK,LE,RISCV64,64 +# RUN: llvm-objcopy -I binary -O elf32-bigriscv %t.txt %t.rv32.o +# RUN: llvm-readobj --file-headers %t.rv32.o | FileCheck %s 
--check-prefixes=CHECK,BE,RISCV32,32 + +# RUN: llvm-objcopy -I binary -O elf64-bigriscv %t.txt %t.rv64.o +# RUN: llvm-readobj --file-headers %t.rv64.o | FileCheck %s --check-prefixes=CHECK,BE,RISCV64,64 + # RUN: llvm-objcopy -I binary -O elf32-sparc %t.txt %t.sparc.o # RUN: llvm-readobj --file-headers %t.sparc.o | FileCheck %s --check-prefixes=CHECK,BE,SPARC,32 diff --git a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp index 0d209590655ef..175f77c894825 100644 --- a/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp +++ b/llvm/tools/llvm-objcopy/ObjcopyOptions.cpp @@ -308,6 +308,8 @@ static const StringMap TargetMap{ // RISC-V {"elf32-littleriscv", {ELF::EM_RISCV, false, true}}, {"elf64-littleriscv", {ELF::EM_RISCV, true, true}}, + {"elf32-bigriscv", {ELF::EM_RISCV, false, false}}, + {"elf64-bigriscv", {ELF::EM_RISCV, true, false}}, // PowerPC {"elf32-powerpc", {ELF::EM_PPC, false, false}}, {"elf32-powerpcle", {ELF::EM_PPC, false, true}}, diff --git a/llvm/unittests/Object/ELFObjectFileTest.cpp b/llvm/unittests/Object/ELFObjectFileTest.cpp index 21606fff1f32b..25b390758f172 100644 --- a/llvm/unittests/Object/ELFObjectFileTest.cpp +++ b/llvm/unittests/Object/ELFObjectFileTest.cpp @@ -177,10 +177,10 @@ TEST(ELFObjectFileTest, MachineTestForPPC) { } TEST(ELFObjectFileTest, MachineTestForRISCV) { - std::array Formats = {"elf32-littleriscv", "elf32-littleriscv", - "elf64-littleriscv", "elf64-littleriscv"}; - std::array Archs = {Triple::riscv32, Triple::riscv32, - Triple::riscv64, Triple::riscv64}; + std::array Formats = {"elf32-littleriscv", "elf32-bigriscv", + "elf64-littleriscv", "elf64-bigriscv"}; + std::array Archs = {Triple::riscv32, Triple::riscv32be, + Triple::riscv64, Triple::riscv64be}; for (auto [Idx, Data] : enumerate(generateData(ELF::EM_RISCV))) checkFormatAndArch(Data, Formats[Idx], Archs[Idx]); } From 3dc5d687b09af5568e9bd80160addb550a46e341 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 17 Jul 2025 10:55:29 
+0200 Subject: [PATCH 149/813] [libc++] Remove minimal_cxx11_configuration.pass.cpp (#149119) This test doesn't seem to be very useful. If it is the only test that fails we would just remove the failing parts of the test, and otherwise it doesn't provide any value either, since there will be another test that fails. --- .../minimal_cxx11_configuration.pass.cpp | 130 ------------------ 1 file changed, 130 deletions(-) delete mode 100644 libcxx/test/libcxx/minimal_cxx11_configuration.pass.cpp diff --git a/libcxx/test/libcxx/minimal_cxx11_configuration.pass.cpp b/libcxx/test/libcxx/minimal_cxx11_configuration.pass.cpp deleted file mode 100644 index e0811e02f5c13..0000000000000 --- a/libcxx/test/libcxx/minimal_cxx11_configuration.pass.cpp +++ /dev/null @@ -1,130 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// Test the set of C++11 features that Clang provides as an extension in C++03 mode. -// The language features we expect are: -// -// 1. rvalue references (and perfect forwarding) -// 2. variadic templates -// 3. alias templates -// 4. defaulted and deleted functions. -// 5. default values for non-type template parameters. -// -// Some features we don't get and can't be used in extended C++03 mode: -// -// 1. noexcept and constexpr -// 2. Two closing '>' without a space. - -#include -#include - -// Equals delete and default are allowed in minimal C++03 mode. 
-namespace test_eq_delete_and_default { -void t1() = delete; -struct T2 { - T2() = default; - T2(T2 const&) = delete; -}; -} - -namespace alias_templates { -template -using X = T; -static_assert((std::is_same, int>::value), ""); -} - -namespace variadics_templates { -template -int t1(Args...) { - return sizeof...(Args); -} -void test() { - assert(t1() == 0); - assert(t1(42) == 1); - assert(t1(1, 2, 3) == 3); -} -} - -namespace rvalue_references_move_semantics { -struct T { - T() : moved(0) {} - T(T const& other) : moved(other.moved) {} - T(T&& other) : moved(other.moved) { ++moved; other.moved = -1; } - int moved; -}; -void f(T o, int expect_moved) { assert(o.moved == expect_moved); } -void test() { - { - T t; - assert(t.moved == 0); - T t2(static_cast(t)); - assert(t2.moved == 1); - assert(t.moved == -1); - } - { - T t; - f(t, 0); - f(static_cast(t), 1); - } -} -} - -namespace rvalue_references_perfect_forwarding { -template -void f(T&&) { - static_assert((std::is_same::value), ""); -} -void test() { - int x = 42; - f(x); - f(42); - f(static_cast(x)); -} -} - -namespace default_values_for_nttp { -template -void f() { assert(I == 42); } -void test() { - f(); -} -} - -namespace reference_qualified_functions { -struct T { - T() : lvalue_called(0), rvalue_called(0) {} - void foo() const & { lvalue_called++; } - void foo() && { rvalue_called++; } - mutable int lvalue_called; - int rvalue_called; -}; - -void test() { - { - T t; - t.foo(); - assert(t.lvalue_called == 1); - assert(t.rvalue_called == 0); - } - { - T t; - static_cast(t).foo(); - assert(t.lvalue_called == 0); - assert(t.rvalue_called == 1); - } -} -} - -int main(int, char**) { - variadics_templates::test(); - rvalue_references_move_semantics::test(); - rvalue_references_perfect_forwarding::test(); - default_values_for_nttp::test(); - reference_qualified_functions::test(); - return 0; -} From b291d1a71f39eb14b89b6aeccfc10bcd3c92a1ef Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Thu, 17 Jul 2025 02:09:52 
-0700 Subject: [PATCH 150/813] [TII] Do not fold undef copies (#147392) RegallocBase::cleanupFailedVReg hacks up the state of the liveness in order to facilitate producing valid IR. During this process, we may end up producing undef copies. If the destination of these copies is a spill candidate, we will attempt to fold the source register when issuing the spill. The undef of the source is not propagated to storeRegToStackSlot , thus we end up dropping the undef, issuing a spill, and producing an illegal liveness state. This checks for undef copies, and, if found, inserts a kill instead of spill. --- llvm/lib/CodeGen/TargetInstrInfo.cpp | 16 ++-- .../AMDGPU/regalloc-undef-copy-fold.mir | 79 +++++++++++++++++++ 2 files changed, 90 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp index 518a9339d8d11..18d6bbc0ff2b0 100644 --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -792,12 +792,18 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI, const MachineOperand &MO = MI.getOperand(1 - Ops[0]); MachineBasicBlock::iterator Pos = MI; - - if (Flags == MachineMemOperand::MOStore) - storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, - Register()); - else + if (Flags == MachineMemOperand::MOStore) { + if (MO.isUndef()) { + // If this is an undef copy, we do not need to bother we inserting spill + // code. 
+ BuildMI(*MBB, Pos, MI.getDebugLoc(), get(TargetOpcode::KILL)).add(MO); + } else { + storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI, + Register()); + } + } else loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI, Register()); + return &*--Pos; } diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir b/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir new file mode 100644 index 0000000000000..b416c96d74d98 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir @@ -0,0 +1,79 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs --start-before=greedy,2 --stop-after=greedy,2 %s -o - | FileCheck %s + +# Make sure there's no machine verifier error + +# If RA is unable to find a register to allocate, then cleanupFailedVReg will do ad-hoc rewriting and will insert undefs to make the LiveRanges workable. +# %30:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 is an example of such a rewrite / undef. If we were to want to spill %30, we should not be inserting +# actual spill code, as the source operand is undef. +# Check that there are no verfier issues with the LiveRange of $vgpr0_vgpr1_vgpr2_vgpr3 / that we do not insert spill code for %30. + + +--- | + define void @foo() #0 { + ret void + } + + attributes #0 = { "amdgpu-waves-per-eu"="8,8" } + +... 
+ +--- +name: foo +tracksRegLiveness: true +stack: + - { id: 0, type: spill-slot, size: 32, alignment: 4 } +machineFunctionInfo: + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledVGPRs: true + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + bb.0: + ; CHECK-LABEL: name: foo + ; CHECK: INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 10 /* regdef */, def %10, 10 /* regdef */, def %1, 10 /* regdef */, def %2, 10 /* regdef */, def $vgpr0_vgpr1_vgpr2_vgpr3, 10 /* regdef */, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK-NEXT: KILL undef $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: SI_SPILL_AV160_SAVE %2, %stack.1, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV256_SAVE %1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY %10 + ; CHECK-NEXT: SI_SPILL_V512_SAVE [[COPY]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_512 = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK-NEXT: SI_SPILL_V512_SAVE [[COPY1]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def early-clobber 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + ; CHECK-NEXT: [[SI_SPILL_V512_RESTORE:%[0-9]+]]:vreg_512 = SI_SPILL_V512_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = COPY [[SI_SPILL_V512_RESTORE]] + ; CHECK-NEXT: [[SI_SPILL_V512_RESTORE1:%[0-9]+]]:vreg_512 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V512_SAVE [[SI_SPILL_V512_RESTORE1]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV256_RESTORE:%[0-9]+]]:vreg_256 = SI_SPILL_AV256_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V256_SAVE [[SI_SPILL_AV256_RESTORE]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV160_RESTORE:%[0-9]+]]:vreg_160 = SI_SPILL_AV160_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.1, align 4, addrspace 5) + ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV512_RESTORE:%[0-9]+]]:av_512 = SI_SPILL_AV512_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V256_RESTORE:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: INLINEASM &"; use $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 9 /* reguse */, 
[[SI_SPILL_AV512_RESTORE]], 9 /* reguse */, [[SI_SPILL_V256_RESTORE]], 9 /* reguse */, [[SI_SPILL_AV160_RESTORE]], 9 /* reguse */, undef $vgpr0_vgpr1_vgpr2_vgpr3, 9 /* reguse */, $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 + ; CHECK-NEXT: SI_RETURN + INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 10, def %22:vreg_512, 10, def %25:vreg_256, 10, def %28:vreg_160, 10, def $vgpr0_vgpr1_vgpr2_vgpr3, 10, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + %30:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 + %27:av_160 = COPY %28:vreg_160 + %24:av_256 = COPY %25:vreg_256 + SI_SPILL_V512_SAVE %22:vreg_512, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) + %18:vreg_512 = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 10, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 + $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = COPY %18:vreg_512 + %23:vreg_512 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) + %26:vreg_256 = COPY %24:av_256 + %29:vreg_160 = COPY %27:av_160 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %30:av_128 + INLINEASM &"; use $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 9, %23:vreg_512, 9, %26:vreg_256, 9, %29:vreg_160, 9, undef $vgpr0_vgpr1_vgpr2_vgpr3, 9, $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 + SI_RETURN + +... 
From 4993f5b12ce4c847bb76f9bd23c188a02b27f2d9 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 17 Jul 2025 11:11:30 +0200 Subject: [PATCH 151/813] [libc++][NFC] Use variable templates in (#149038) Variable templates are a bit lighter on the compiler than class templates. --- libcxx/include/string | 68 +++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index 514dd91c7c172..788af36d67c58 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -700,18 +700,18 @@ __concatenate_strings(const _Allocator& __alloc, __type_identity_t > __str2); template -struct __string_is_trivial_iterator : public false_type {}; +inline const bool __string_is_trivial_iterator_v = false; template -struct __string_is_trivial_iterator<_Tp*> : public is_arithmetic<_Tp> {}; +inline const bool __string_is_trivial_iterator_v<_Tp*> = is_arithmetic<_Tp>::value; template -struct __string_is_trivial_iterator<__wrap_iter<_Iter> > : public __string_is_trivial_iterator<_Iter> {}; +inline const bool __string_is_trivial_iterator_v<__wrap_iter<_Iter> > = __string_is_trivial_iterator_v<_Iter>; template -struct __can_be_converted_to_string_view - : public _BoolConstant< is_convertible >::value && - !is_convertible::value > {}; +inline const bool __can_be_converted_to_string_view_v = + is_convertible >::value && + !is_convertible::value; struct __uninitialized_size_tag {}; struct __init_with_sentinel_tag {}; @@ -1125,7 +1125,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 @@ -1137,7 +1137,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 
explicit basic_string(const _Tp& __t) { @@ -1146,7 +1146,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 explicit basic_string(const _Tp& __t, const allocator_type& __a) @@ -1205,7 +1205,7 @@ public: operator=(const basic_string& __str); template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator=(const _Tp& __t) { @@ -1342,7 +1342,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string >::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& operator+=(const _Tp& __t) { @@ -1371,7 +1371,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const _Tp& __t) { @@ -1382,7 +1382,7 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const basic_string& __str, size_type __pos, size_type __n = npos); template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& @@ -1415,7 +1415,7 @@ public: size_type __cap = capacity(); size_type __n = static_cast(std::distance(__first, __last)); if (__n) { - if (__string_is_trivial_iterator<_ForwardIterator>::value && !__addr_in_range(*__first)) { + if (__string_is_trivial_iterator_v<_ForwardIterator> && !__addr_in_range(*__first)) { if (__cap - __sz < __n) __grow_by_without_replace(__cap, __sz + __n - __cap, __sz, 
__sz, 0); __annotate_increase(__n); @@ -1467,7 +1467,7 @@ public: return *(data() + size() - 1); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const _Tp& __t) { __self_view __sv = __t; return assign(__sv.data(), __sv.size()); @@ -1509,7 +1509,7 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const basic_string& __str, size_type __pos, size_type __n = npos); template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& @@ -1535,7 +1535,7 @@ public: template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(_ForwardIterator __first, _ForwardIterator __last) { - if (__string_is_trivial_iterator<_ForwardIterator>::value) { + if (__string_is_trivial_iterator_v<_ForwardIterator>) { size_type __n = static_cast(std::distance(__first, __last)); __assign_trivial(__first, __last, __n); } else { @@ -1548,7 +1548,7 @@ public: # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_CharT> _Range> _LIBCPP_HIDE_FROM_ABI constexpr basic_string& assign_range(_Range&& __range) { - if constexpr (__string_is_trivial_iterator>::value && + if constexpr (__string_is_trivial_iterator_v> && (ranges::forward_range<_Range> || ranges::sized_range<_Range>)) { size_type __n = static_cast(ranges::distance(__range)); __assign_trivial(ranges::begin(__range), ranges::end(__range), __n); @@ -1572,14 +1572,14 @@ public: return insert(__pos1, __str.data(), __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos1, const _Tp& __t) { __self_view __sv = __t; return insert(__pos1, __sv.data(), __sv.size()); } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> 
&& !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& @@ -1649,7 +1649,7 @@ public: return replace(__pos1, __n1, __str.data(), __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(size_type __pos1, size_type __n1, const _Tp& __t) { __self_view __sv = __t; @@ -1660,7 +1660,7 @@ public: replace(size_type __pos1, size_type __n1, const basic_string& __str, size_type __pos2, size_type __n2 = npos); template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& @@ -1683,7 +1683,7 @@ public: static_cast(__i1 - begin()), static_cast(__i2 - __i1), __str.data(), __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(const_iterator __i1, const_iterator __i2, const _Tp& __t) { __self_view __sv = __t; @@ -1776,7 +1776,7 @@ public: return std::__str_find(data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; @@ -1807,7 +1807,7 @@ public: data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; @@ -1838,7 +1838,7 @@ public: data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv 
= __t; @@ -1872,7 +1872,7 @@ public: data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; @@ -1906,7 +1906,7 @@ public: data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_first_not_of(const _Tp& __t, size_type __pos = 0) const _NOEXCEPT { __self_view __sv = __t; @@ -1940,7 +1940,7 @@ public: data(), size(), __str.data(), __pos, __str.size()); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find_last_not_of(const _Tp& __t, size_type __pos = npos) const _NOEXCEPT { __self_view __sv = __t; @@ -1972,7 +1972,7 @@ public: return compare(__self_view(__str)); } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(const _Tp& __t) const _NOEXCEPT { __self_view __sv = __t; size_t __lhs_sz = size(); @@ -1987,7 +1987,7 @@ public: return 0; } - template ::value, int> = 0> + template , int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int compare(size_type __pos1, size_type __n1, const _Tp& __t) const { __self_view __sv = __t; @@ -2005,7 +2005,7 @@ public: } template ::value && + __enable_if_t<__can_be_converted_to_string_view_v<_CharT, _Traits, _Tp> && !is_same<__remove_cvref_t<_Tp>, basic_string>::value, int> = 0> inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 int @@ -2951,7 +2951,7 @@ template _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI void basic_string<_CharT, _Traits, _Allocator>::__assign_trivial(_Iterator __first, _Sentinel __last, size_type __n) { _LIBCPP_ASSERT_INTERNAL( - __string_is_trivial_iterator<_Iterator>::value, "The iterator type given to `__assign_trivial` must be 
trivial"); + __string_is_trivial_iterator_v<_Iterator>, "The iterator type given to `__assign_trivial` must be trivial"); size_type __old_size = size(); size_type __cap = capacity(); @@ -3166,7 +3166,7 @@ basic_string<_CharT, _Traits, _Allocator>::__insert_with_size( if (__n == 0) return begin() + __ip; - if (__string_is_trivial_iterator<_Iterator>::value && !__addr_in_range(*__first)) { + if (__string_is_trivial_iterator_v<_Iterator> && !__addr_in_range(*__first)) { return __insert_from_safe_copy(__n, __ip, std::move(__first), std::move(__last)); } else { const basic_string __temp(__init_with_sentinel_tag(), std::move(__first), std::move(__last), __alloc_); From 4695aea28e5cc6ba2841562992c83d3e16dda36a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 17 Jul 2025 11:12:01 +0200 Subject: [PATCH 152/813] [libc++] Move more tests into better places (#148419) --- libcxx/test/extensions/libcxx/localization/lit.local.cfg | 3 +++ .../conversions/conversions.string/ctor_move.pass.cpp | 0 .../numerics/c.math/fdelayed-template-parsing.pass.cpp | 0 .../rand/rand.req}/rand.req.urng/valid_int_type.verify.cpp | 0 .../rand/rand.req}/rand.req.urng/valid_real_type.verify.cpp | 0 .../util.smartptr.shared.create}/shared_ptr_array.pass.cpp | 0 6 files changed, 3 insertions(+) create mode 100644 libcxx/test/extensions/libcxx/localization/lit.local.cfg rename libcxx/test/{libcxx/localization/locales => extensions/libcxx/localization}/locale.convenience/conversions/conversions.string/ctor_move.pass.cpp (100%) rename libcxx/test/{libcxx => std}/numerics/c.math/fdelayed-template-parsing.pass.cpp (100%) rename libcxx/test/{libcxx/numerics/rand => std/numerics/rand/rand.req}/rand.req.urng/valid_int_type.verify.cpp (100%) rename libcxx/test/{libcxx/numerics/rand => std/numerics/rand/rand.req}/rand.req.urng/valid_real_type.verify.cpp (100%) rename libcxx/test/{libcxx/memory => 
std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create}/shared_ptr_array.pass.cpp (100%) diff --git a/libcxx/test/extensions/libcxx/localization/lit.local.cfg b/libcxx/test/extensions/libcxx/localization/lit.local.cfg new file mode 100644 index 0000000000000..d47f3e0fe4752 --- /dev/null +++ b/libcxx/test/extensions/libcxx/localization/lit.local.cfg @@ -0,0 +1,3 @@ +# tests are obviously not supported when localization support is disabled +if "no-localization" in config.available_features: + config.unsupported = True diff --git a/libcxx/test/libcxx/localization/locales/locale.convenience/conversions/conversions.string/ctor_move.pass.cpp b/libcxx/test/extensions/libcxx/localization/locale.convenience/conversions/conversions.string/ctor_move.pass.cpp similarity index 100% rename from libcxx/test/libcxx/localization/locales/locale.convenience/conversions/conversions.string/ctor_move.pass.cpp rename to libcxx/test/extensions/libcxx/localization/locale.convenience/conversions/conversions.string/ctor_move.pass.cpp diff --git a/libcxx/test/libcxx/numerics/c.math/fdelayed-template-parsing.pass.cpp b/libcxx/test/std/numerics/c.math/fdelayed-template-parsing.pass.cpp similarity index 100% rename from libcxx/test/libcxx/numerics/c.math/fdelayed-template-parsing.pass.cpp rename to libcxx/test/std/numerics/c.math/fdelayed-template-parsing.pass.cpp diff --git a/libcxx/test/libcxx/numerics/rand/rand.req.urng/valid_int_type.verify.cpp b/libcxx/test/std/numerics/rand/rand.req/rand.req.urng/valid_int_type.verify.cpp similarity index 100% rename from libcxx/test/libcxx/numerics/rand/rand.req.urng/valid_int_type.verify.cpp rename to libcxx/test/std/numerics/rand/rand.req/rand.req.urng/valid_int_type.verify.cpp diff --git a/libcxx/test/libcxx/numerics/rand/rand.req.urng/valid_real_type.verify.cpp b/libcxx/test/std/numerics/rand/rand.req/rand.req.urng/valid_real_type.verify.cpp similarity index 100% rename from 
libcxx/test/libcxx/numerics/rand/rand.req.urng/valid_real_type.verify.cpp rename to libcxx/test/std/numerics/rand/rand.req/rand.req.urng/valid_real_type.verify.cpp diff --git a/libcxx/test/libcxx/memory/shared_ptr_array.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp similarity index 100% rename from libcxx/test/libcxx/memory/shared_ptr_array.pass.cpp rename to libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp From d87170211dc780341e42f7719c7332416f9cc290 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 17 Jul 2025 09:17:25 +0000 Subject: [PATCH 153/813] [lldb][test] Remove XFAIL from some Windows tests These are now passing on Windows x86_64 and Arm64. --- .../basics/ArraySubscript/TestFrameVarDILArraySubscript.py | 1 - .../data-formatter-disabling/TestDataFormatterDisabling.py | 4 ---- lldb/test/API/lang/cpp/template/TestTemplateArgs.py | 1 - 3 files changed, 6 deletions(-) diff --git a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py index c0ef29fab8597..0f56057189395 100644 --- a/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py +++ b/lldb/test/API/commands/frame/var-dil/basics/ArraySubscript/TestFrameVarDILArraySubscript.py @@ -98,7 +98,6 @@ def test_subscript(self): substrs=["subscript of pointer to incomplete type 'void'"], ) - @expectedFailureAll(oslist=["windows"]) def test_subscript_synthetic(self): self.build() lldbutil.run_to_source_breakpoint( diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-disabling/TestDataFormatterDisabling.py b/lldb/test/API/functionalities/data-formatter/data-formatter-disabling/TestDataFormatterDisabling.py index 20f49e02adcea..e6aeef2bedff2 100644 --- 
a/lldb/test/API/functionalities/data-formatter/data-formatter-disabling/TestDataFormatterDisabling.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-disabling/TestDataFormatterDisabling.py @@ -16,10 +16,6 @@ def setUp(self): # Find the line number to break at. self.line = line_number("main.cpp", "// Set break point at this line.") - @expectedFailureAll( - oslist=["windows"], - bugnumber="llvm.org/pr24462, Data formatters have problems on Windows", - ) def test_with_run_command(self): """Check that we can properly disable all data formatter categories.""" self.build() diff --git a/lldb/test/API/lang/cpp/template/TestTemplateArgs.py b/lldb/test/API/lang/cpp/template/TestTemplateArgs.py index 9708e98a59fce..2fd05d6404650 100644 --- a/lldb/test/API/lang/cpp/template/TestTemplateArgs.py +++ b/lldb/test/API/lang/cpp/template/TestTemplateArgs.py @@ -82,7 +82,6 @@ def test_integer_args(self): 'expr_result.GetType().GetName() == "int"', ) - @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24489") def test_template_template_args(self): frame = self.prepareProcess() From e66eabee0022af60347e83ef2c0d4b6dbfb7f0a5 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 17 Jul 2025 10:28:56 +0100 Subject: [PATCH 154/813] [llvm][cmake] Add clang if not already present when building lldb (#149055) Fixes https://github.com/llvm/llvm-project/issues/54555 This follows flang's pattern, it adds clang if you don't have it in LLVM_ENABLE_PROJECTS. 
--- llvm/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 3f8201fa426fe..903461b39902e 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -179,6 +179,13 @@ if ("flang" IN_LIST LLVM_ENABLE_PROJECTS) endif () endif() +if ("lldb" IN_LIST LLVM_ENABLE_PROJECTS) + if (NOT "clang" IN_LIST LLVM_ENABLE_PROJECTS) + message(STATUS "Enabling clang as a dependency of lldb") + list(APPEND LLVM_ENABLE_PROJECTS "clang") + endif() +endif () + if ("libc" IN_LIST LLVM_ENABLE_PROJECTS) message(WARNING "Using LLVM_ENABLE_PROJECTS=libc is deprecated. Please use " "-DLLVM_ENABLE_RUNTIMES=libc or see the instructions at " From a78a0f8d204393a0cce367b63395bad90311c1b8 Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Thu, 17 Jul 2025 04:30:36 -0500 Subject: [PATCH 155/813] [X86] Align f128 and i128 to 16 bytes when passing on x86-32 (#138092) The i386 psABI specifies that `__float128` has 16 byte alignment and must be passed on the stack; however, LLVM currently stores it in a stack slot that has an offset of 4. Add a custom lowering to correct this alignment to 16-byte. i386 does not specify an `__int128`, but it seems reasonable to keep the same behavior as `__float128` so this is changed as well. There also isn't a good way to distinguish whether a set of four registers came from an integer or a float. The main test demonstrating this change is `store_perturbed` in `llvm/test/CodeGen/X86/i128-fp128-abi.ll`. 
Referenced ABI: https://gitlab.com/x86-psABIs/i386-ABI/-/wikis/uploads/14c05f1b1e156e0e46b61bfa7c1df1e2/intel386-psABI-2020-08-07.pdf Fixes: https://github.com/llvm/llvm-project/issues/77401 --- llvm/docs/ReleaseNotes.md | 2 + llvm/lib/Target/X86/X86CallingConv.cpp | 32 + llvm/lib/Target/X86/X86CallingConv.td | 5 + llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 15 +- llvm/test/CodeGen/X86/abds-neg.ll | 410 ++-- llvm/test/CodeGen/X86/abds.ll | 390 ++-- llvm/test/CodeGen/X86/abdu-neg.ll | 282 +-- llvm/test/CodeGen/X86/abdu.ll | 225 +- llvm/test/CodeGen/X86/abs.ll | 55 +- llvm/test/CodeGen/X86/add-sub-bool.ll | 25 +- llvm/test/CodeGen/X86/arg-copy-elide.ll | 8 +- llvm/test/CodeGen/X86/avx512fp16-cvt.ll | 42 +- llvm/test/CodeGen/X86/bitselect.ll | 55 +- llvm/test/CodeGen/X86/bsf.ll | 144 +- llvm/test/CodeGen/X86/bsr.ll | 158 +- llvm/test/CodeGen/X86/bswap-wide-int.ll | 30 +- .../X86/div-rem-pair-recomposition-signed.ll | 36 +- .../div-rem-pair-recomposition-unsigned.ll | 100 +- llvm/test/CodeGen/X86/fp128-cast-strict.ll | 92 +- llvm/test/CodeGen/X86/fp128-cast.ll | 125 +- .../test/CodeGen/X86/fp128-libcalls-strict.ll | 2060 ++++++++++------- llvm/test/CodeGen/X86/fp128-libcalls.ll | 1773 ++++++++------ llvm/test/CodeGen/X86/fshl.ll | 185 +- llvm/test/CodeGen/X86/fshr.ll | 170 +- llvm/test/CodeGen/X86/funnel-shift.ll | 74 +- llvm/test/CodeGen/X86/i128-add.ll | 23 +- llvm/test/CodeGen/X86/i128-fp128-abi.ll | 455 ++-- llvm/test/CodeGen/X86/i128-sdiv.ll | 61 +- llvm/test/CodeGen/X86/i128-udiv.ll | 12 +- llvm/test/CodeGen/X86/iabs.ll | 43 +- llvm/test/CodeGen/X86/icmp-shift-opt.ll | 102 +- llvm/test/CodeGen/X86/mul128.ll | 97 +- llvm/test/CodeGen/X86/neg-abs.ll | 55 +- llvm/test/CodeGen/X86/popcnt.ll | 485 ++-- llvm/test/CodeGen/X86/pr46004.ll | 19 + llvm/test/CodeGen/X86/scalar-fp-to-i32.ll | 76 +- llvm/test/CodeGen/X86/scalar-fp-to-i64.ll | 76 +- llvm/test/CodeGen/X86/scmp.ll | 39 +- llvm/test/CodeGen/X86/sdiv_fix.ll | 99 +- llvm/test/CodeGen/X86/sdiv_fix_sat.ll | 440 ++-- 
llvm/test/CodeGen/X86/shift-combine.ll | 14 +- llvm/test/CodeGen/X86/shift-i128.ll | 72 +- llvm/test/CodeGen/X86/smax.ll | 78 +- llvm/test/CodeGen/X86/smin.ll | 81 +- llvm/test/CodeGen/X86/ucmp.ll | 34 +- llvm/test/CodeGen/X86/udiv_fix.ll | 28 +- llvm/test/CodeGen/X86/udiv_fix_sat.ll | 28 +- llvm/test/CodeGen/X86/umax.ll | 135 +- llvm/test/CodeGen/X86/umin.ll | 81 +- .../X86/umulo-128-legalisation-lowering.ll | 6 +- llvm/test/CodeGen/X86/wide-integer-cmp.ll | 14 +- 51 files changed, 5364 insertions(+), 3782 deletions(-) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 68d653b9b53d6..5591ac619c399 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -233,6 +233,8 @@ Changes to the X86 Backend -------------------------- * `fp128` will now use `*f128` libcalls on 32-bit GNU targets as well. +* On x86-32, `fp128` and `i128` are now passed with the expected 16-byte stack + alignment. Changes to the OCaml bindings ----------------------------- diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index 0b4c63f7a81f7..eb39259f7166b 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -374,5 +374,37 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, return true; } +/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized +/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment. +/// Technically only fp128 has a specified ABI, but it makes sense to handle +/// i128 the same until we hear differently. 
+static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, + CCValAssign::LocInfo &LocInfo, + ISD::ArgFlagsTy &ArgFlags, CCState &State) { + assert(ValVT == MVT::i32 && "Should have i32 parts"); + SmallVectorImpl &PendingMembers = State.getPendingLocs(); + PendingMembers.push_back( + CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo)); + + if (!ArgFlags.isInConsecutiveRegsLast()) + return true; + + unsigned NumRegs = PendingMembers.size(); + assert(NumRegs == 4 && "Should have two parts"); + + int64_t Offset = State.AllocateStack(16, Align(16)); + PendingMembers[0].convertToMem(Offset); + PendingMembers[1].convertToMem(Offset + 4); + PendingMembers[2].convertToMem(Offset + 8); + PendingMembers[3].convertToMem(Offset + 12); + + State.addLoc(PendingMembers[0]); + State.addLoc(PendingMembers[1]); + State.addLoc(PendingMembers[2]); + State.addLoc(PendingMembers[3]); + PendingMembers.clear(); + return true; +} + // Provides entry points of CC_X86 and RetCC_X86. #include "X86GenCallingConv.inc" diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 823e0caa02262..f020e0b55141c 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[ // The 'nest' parameter, if any, is passed in ECX. CCIfNest>, + // i128 and fp128 need to be passed on the stack with a higher alignment than + // their legal types. Handle this with a custom function. + CCIfType<[i32], + CCIfConsecutiveRegs>>, + // On swifttailcc pass swiftself in ECX. 
CCIfCC<"CallingConv::SwiftTail", CCIfSwiftSelf>>>, diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 9ad355311527b..b4639ac2577e8 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const { - // i128 split into i64 needs to be allocated to two consecutive registers, - // or spilled to the stack as a whole. - return Ty->isIntegerTy(128); + // On x86-64 i128 is split into two i64s and needs to be allocated to two + // consecutive registers, or spilled to the stack as a whole. On x86-32 i128 + // is split to four i32s and never actually passed in registers, but we use + // the consecutive register mark to match it in TableGen. + if (Ty->isIntegerTy(128)) + return true; + + // On x86-32, fp128 acts the same as i128. 
+ if (Subtarget.is32Bit() && Ty->isFP128Ty()) + return true; + + return false; } /// Helper for getByValTypeAlignment to determine diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index f6d66ab47ce05..2911edfbfd409 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -367,44 +367,49 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll %eax, %esi -; X86-NEXT: cmovll %ebx, %edi -; X86-NEXT: cmovll %ebp, %edx -; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl 52(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %edi 
+; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %eax +; X86-NEXT: sbbl 32(%ebp), %edx +; X86-NEXT: sbbl 36(%ebp), %esi +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %ebx, %edx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: negl %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %eax, 8(%edx) +; X86-NEXT: movl %edi, 12(%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -438,44 +443,49 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl %edi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; 
X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll %eax, %esi -; X86-NEXT: cmovll %ebx, %edi -; X86-NEXT: cmovll %ebp, %edx -; X86-NEXT: cmovll (%esp), %ecx # 4-byte Folded Reload -; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %edx +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl 52(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %eax +; X86-NEXT: sbbl 32(%ebp), %edx +; X86-NEXT: sbbl 36(%ebp), %esi +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %ebx, %edx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: negl %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl $0, %edx -; X86-NEXT: sbbl %edi, %edx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %ebx, 4(%edx) +; X86-NEXT: movl %eax, 8(%edx) +; X86-NEXT: movl %edi, 12(%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: 
popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -639,55 +649,59 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_minmax_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmovll %edx, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: cmovll %esi, %edx -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %eax, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; 
X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: sbbl 44(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovll %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmovll 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: cmovll 28(%ebp), %eax +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: cmovll %edi, %ecx +; X86-NEXT: cmpl %edi, %esi +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: sbbl 28(%ebp), %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sbbl 32(%ebp), %edi +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sbbl 36(%ebp), %edi +; X86-NEXT: cmovll 36(%ebp), %ebx +; X86-NEXT: cmovll 32(%ebp), %edx +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: cmovll 28(%ebp), %edi +; X86-NEXT: cmovll 24(%ebp), %esi +; X86-NEXT: subl %esi, %ecx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %eax, 4(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -848,37 +862,41 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovgel (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovgel %ebx, %esi -; X86-NEXT: cmovgel %ebp, %ecx -; X86-NEXT: cmovgel %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: movl 44(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 48(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 52(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 40(%ebp), %ecx +; X86-NEXT: sbbl 44(%ebp), %edx +; X86-NEXT: sbbl 48(%ebp), %esi +; X86-NEXT: sbbl 52(%ebp), %ebx +; X86-NEXT: cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovgel %edi, %esi +; X86-NEXT: cmovgel %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; 
X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1118,35 +1136,39 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_subnsw_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: subl %edi, %ebp -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: subl 40(%ebp), %esi +; X86-NEXT: sbbl 44(%ebp), %edx +; X86-NEXT: sbbl 48(%ebp), %ecx +; X86-NEXT: sbbl 52(%ebp), %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: xorl %edi, %eax +; X86-NEXT: xorl %edi, %ecx +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl %edi, %esi ; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl %ebp, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 
%ebx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1175,35 +1197,39 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_subnsw_i128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: subl %edi, %ebp -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: subl 40(%ebp), %esi +; X86-NEXT: sbbl 44(%ebp), %edx +; X86-NEXT: sbbl 48(%ebp), %ecx +; X86-NEXT: sbbl 52(%ebp), %eax +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sarl $31, %edi +; X86-NEXT: xorl %edi, %eax +; X86-NEXT: xorl %edi, %ecx +; X86-NEXT: xorl %edi, %edx +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: subl %esi, %ebx +; X86-NEXT: movl %edi, %esi ; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %ecx, %ebx -; X86-NEXT: movl %ebp, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %edi, %edx +; X86-NEXT: sbbl %ecx, %edx +; X86-NEXT: sbbl 
%eax, %edi +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll index 0356c2702a419..a1a4ba81ae493 100644 --- a/llvm/test/CodeGen/X86/abds.ll +++ b/llvm/test/CodeGen/X86/abds.ll @@ -343,37 +343,41 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: sbbl 36(%ebp), %ebx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -404,37 +408,41 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi 
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: sbbl 36(%ebp), %ebx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -585,37 +593,41 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_minmax_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: sbbl 36(%ebp), %ebx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -768,37 +780,41 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: ; X86-NEXT: 
pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: sbbl 36(%ebp), %ebx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded 
Reload +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1027,35 +1043,38 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_subnsw_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 
8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_subnsw_i128: @@ -1079,35 +1098,38 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_subnsw_i128_undef: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: 
popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_subnsw_i128_undef: @@ -1282,37 +1304,41 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_select_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: subl %edx, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: cmovll %ebx, %esi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: cmovll %eax, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: subl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: subl 
24(%ebp), %ecx +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: sbbl 36(%ebp), %ebx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovll %edi, %esi +; X86-NEXT: cmovll %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll index 6bda99c89a37e..b7c34070f1af6 100644 --- a/llvm/test/CodeGen/X86/abdu-neg.ll +++ b/llvm/test/CodeGen/X86/abdu-neg.ll @@ -355,39 +355,43 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: xorl %ebp, %ecx -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: xorl %ebp, %ebx -; X86-NEXT: xorl %ebp, %edx -; X86-NEXT: subl %ebp, %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: negl %edx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebx, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: 
xorl %edx, %edx +; X86-NEXT: subl 40(%ebp), %ecx +; X86-NEXT: sbbl 44(%ebp), %edi +; X86-NEXT: sbbl 48(%ebp), %esi +; X86-NEXT: sbbl 52(%ebp), %eax ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: negl %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -423,39 +427,43 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: xorl %ebp, %ecx -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: xorl %ebp, %ebx -; X86-NEXT: xorl %ebp, %edx -; X86-NEXT: subl %ebp, %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: 
negl %edx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebx, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl 40(%ebp), %ecx +; X86-NEXT: sbbl 44(%ebp), %edi +; X86-NEXT: sbbl 48(%ebp), %esi +; X86-NEXT: sbbl 52(%ebp), %eax +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: negl %ecx ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -621,55 +629,59 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_minmax_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %eax, %esi -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: 
movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: sbbl %edi, %ecx -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: cmovbl %edx, %ecx -; X86-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: cmovbl %esi, %edx -; X86-NEXT: cmpl %esi, %eax -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %edi, %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: subl %eax, %edx -; X86-NEXT: sbbl %ebx, %ecx -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl (%esp), %esi # 4-byte Reload -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %ebp, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: sbbl 44(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: cmovbl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edx, %eax +; X86-NEXT: cmovbl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 44(%ebp), %eax +; X86-NEXT: cmovbl 28(%ebp), %eax +; X86-NEXT: movl %esi, %ecx +; X86-NEXT: cmovbl 
%edi, %ecx +; X86-NEXT: cmpl %edi, %esi +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: sbbl 28(%ebp), %edi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sbbl 32(%ebp), %edi +; X86-NEXT: movl %ebx, %edi +; X86-NEXT: sbbl 36(%ebp), %edi +; X86-NEXT: cmovbl 36(%ebp), %ebx +; X86-NEXT: cmovbl 32(%ebp), %edx +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: cmovbl 28(%ebp), %edi +; X86-NEXT: cmovbl 24(%ebp), %esi +; X86-NEXT: subl %esi, %ecx +; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: movl 8(%ebp), %edx +; X86-NEXT: movl %ecx, (%edx) +; X86-NEXT: movl %eax, 4(%edx) +; X86-NEXT: movl %edi, 8(%edx) +; X86-NEXT: movl %esi, 12(%edx) +; X86-NEXT: movl %edx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -827,39 +839,43 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: xorl %ebp, %ecx -; X86-NEXT: xorl %ebp, %esi -; X86-NEXT: xorl %ebp, %ebx -; X86-NEXT: xorl %ebp, %edx -; X86-NEXT: subl %ebp, %edx -; X86-NEXT: sbbl %ebp, %ebx -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl %ebp, %ecx -; X86-NEXT: negl %edx -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebx, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, 
%esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: subl 40(%ebp), %ecx +; X86-NEXT: sbbl 44(%ebp), %edi +; X86-NEXT: sbbl 48(%ebp), %esi +; X86-NEXT: sbbl 52(%ebp), %eax +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx +; X86-NEXT: xorl %ebx, %eax +; X86-NEXT: xorl %ebx, %esi +; X86-NEXT: xorl %ebx, %edi +; X86-NEXT: xorl %ebx, %ecx +; X86-NEXT: subl %ebx, %ecx +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebx, %esi +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: negl %ecx ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %ebx, 4(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll index 27acec32fd348..043c9155f52f9 100644 --- a/llvm/test/CodeGen/X86/abdu.ll +++ b/llvm/test/CodeGen/X86/abdu.ll @@ -326,35 +326,38 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind { define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi 
-; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_ext_i128: @@ -381,35 +384,38 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind { define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_ext_i128_undef: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; 
X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_ext_i128_undef: @@ -548,35 +554,38 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind { define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_minmax_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; 
X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_minmax_i128: @@ -717,35 +726,38 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind { define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_cmp_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl 
$-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_cmp_i128: @@ -887,35 +899,38 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind { define i128 @abd_select_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: abd_select_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: xorl %ebx, %ecx -; X86-NEXT: xorl %ebx, %edx -; X86-NEXT: xorl %ebx, %esi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: sbbl %ebx, %esi -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: 
movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: subl 40(%ebp), %edi +; X86-NEXT: sbbl 44(%ebp), %esi +; X86-NEXT: sbbl 48(%ebp), %edx +; X86-NEXT: sbbl 52(%ebp), %ecx +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: abd_select_i128: diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll index bae140abdf6b1..e252d5953e60e 100644 --- a/llvm/test/CodeGen/X86/abs.ll +++ b/llvm/test/CodeGen/X86/abs.ll @@ -144,31 +144,34 @@ define i128 @test_i128(i128 %a) nounwind { ; ; X86-LABEL: test_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: subl %edx, %ebx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: 
movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %r = call i128 @llvm.abs.i128(i128 %a, i1 false) ret i128 %r @@ -688,13 +691,17 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind { ; ; X86-LABEL: test_sextinreg_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 24(%ebp), %esi ; X86-NEXT: xorl %edx, %esi ; X86-NEXT: subl %edx, %esi ; X86-NEXT: sbbl %edx, %ecx @@ -702,7 +709,9 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind { ; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl $0, 12(%eax) ; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: leal -4(%ebp), %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %shl = shl i128 %a, 64 %ashr = ashr exact i128 %shl, 64 diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll index c2bfcf57185e3..1df284fb9fe2c 100644 --- a/llvm/test/CodeGen/X86/add-sub-bool.ll +++ b/llvm/test/CodeGen/X86/add-sub-bool.ll @@ -104,18 +104,21 @@ define i24 @test_i24_add_add_idx(i24 %x, i24 %y, i24 %z) nounwind { define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind { ; X86-LABEL: test_i128_add_add_idx: ; X86: # %bb.0: +; 
X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx -; X86-NEXT: btl $5, {{[0-9]+}}(%esp) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: addl 24(%ebp), %esi +; X86-NEXT: adcl 28(%ebp), %edi +; X86-NEXT: adcl 32(%ebp), %ecx +; X86-NEXT: adcl 36(%ebp), %edx +; X86-NEXT: btl $5, 64(%ebp) ; X86-NEXT: adcl $0, %esi ; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ecx @@ -124,8 +127,10 @@ define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind { ; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test_i128_add_add_idx: diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll index 0eb2c630e6818..f13627b55856f 100644 --- a/llvm/test/CodeGen/X86/arg-copy-elide.ll +++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll @@ -188,11 +188,11 @@ define void @split_i128(ptr %sret, i128 %x) { ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: andl $-16, %esp ; CHECK-NEXT: subl $48, %esp -; CHECK-NEXT: movl 12(%ebp), %eax +; CHECK-NEXT: movl 24(%ebp), %eax ; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; CHECK-NEXT: movl 16(%ebp), %ebx -; CHECK-NEXT: movl 20(%ebp), %esi -; CHECK-NEXT: movl 24(%ebp), %edi +; CHECK-NEXT: movl 28(%ebp), %ebx +; CHECK-NEXT: movl 32(%ebp), %esi +; CHECK-NEXT: movl 
36(%ebp), %edi ; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl %ebx, {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll index f66f0c0ceabc4..cc58bc1e44f37 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll @@ -628,13 +628,19 @@ define half @s128_to_half(i128 %x) { ; ; X86-LABEL: s128_to_half: ; X86: # %bb.0: -; X86-NEXT: subl $16, %esp -; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $32, %esp +; X86-NEXT: vmovups 8(%ebp), %xmm0 ; X86-NEXT: vmovups %xmm0, (%esp) ; X86-NEXT: calll __floattihf -; X86-NEXT: addl $16, %esp -; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl %a = sitofp i128 %x to half ret half %a @@ -713,13 +719,19 @@ define half @u128_to_half(i128 %x) { ; ; X86-LABEL: u128_to_half: ; X86: # %bb.0: -; X86-NEXT: subl $16, %esp -; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $32, %esp +; X86-NEXT: vmovups 8(%ebp), %xmm0 ; X86-NEXT: vmovups %xmm0, (%esp) ; X86-NEXT: calll __floatuntihf -; X86-NEXT: addl $16, %esp -; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl %a = uitofp i128 %x to half ret half %a @@ -1020,11 +1032,15 @@ define half @f128_to_half(fp128 %x) nounwind { ; ; X86-LABEL: f128_to_half: ; X86: # %bb.0: -; X86-NEXT: subl $16, %esp -; 
X86-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $32, %esp +; X86-NEXT: vmovups 8(%ebp), %xmm0 ; X86-NEXT: vmovups %xmm0, (%esp) ; X86-NEXT: calll __trunctfhf2 -; X86-NEXT: addl $16, %esp +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl %a = fptrunc fp128 %x to half ret half %a diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll index 4fc0827ac4dd6..33381313d3c19 100644 --- a/llvm/test/CodeGen/X86/bitselect.ll +++ b/llvm/test/CodeGen/X86/bitselect.ll @@ -146,37 +146,40 @@ define i64 @bitselect_i64(i64 %a, i64 %b, i64 %m) nounwind { define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind { ; X86-LABEL: bitselect_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edi, %ecx -; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorl %edi, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: andl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %ebx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %esi, %ebx -; X86-NEXT: andl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %esi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: andl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: xorl %esi, 
%ecx +; X86-NEXT: andl 56(%ebp), %ecx +; X86-NEXT: xorl %esi, %ecx +; X86-NEXT: movl 44(%ebp), %esi +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: andl 60(%ebp), %esi +; X86-NEXT: xorl %edi, %esi +; X86-NEXT: movl 48(%ebp), %edi +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: andl 64(%ebp), %edi +; X86-NEXT: xorl %edx, %edi +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: andl 68(%ebp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-NOBMI-LABEL: bitselect_i128: diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll index 312f94c041235..143e10e6909e4 100644 --- a/llvm/test/CodeGen/X86/bsf.ll +++ b/llvm/test/CodeGen/X86/bsf.ll @@ -263,70 +263,78 @@ define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind { ; X86-LABEL: cmov_bsf128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: orl %ebp, %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %edx, %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: orl %ebx, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl %eax, %edx ; X86-NEXT: je .LBB8_1 ; X86-NEXT: # %bb.2: # %cond.false ; X86-NEXT: testl %ecx, %ecx ; X86-NEXT: jne .LBB8_3 ; X86-NEXT: # %bb.4: # %cond.false -; X86-NEXT: 
rep bsfl %edi, %esi -; X86-NEXT: addl $32, %esi -; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB8_7 -; X86-NEXT: .LBB8_6: -; X86-NEXT: rep bsfl %eax, %edx -; X86-NEXT: jmp .LBB8_8 +; X86-NEXT: rep bsfl %esi, %eax +; X86-NEXT: addl $32, %eax +; X86-NEXT: jmp .LBB8_5 ; X86-NEXT: .LBB8_1: -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: movl $128, %esi +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: movl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NEXT: jmp .LBB8_11 ; X86-NEXT: .LBB8_3: -; X86-NEXT: rep bsfl %ecx, %esi -; X86-NEXT: testl %eax, %eax +; X86-NEXT: rep bsfl %ecx, %eax +; X86-NEXT: .LBB8_5: # %cond.false +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: testl %edi, %edi ; X86-NEXT: jne .LBB8_6 -; X86-NEXT: .LBB8_7: # %cond.false -; X86-NEXT: rep bsfl %ebp, %edx +; X86-NEXT: # %bb.7: # %cond.false +; X86-NEXT: rep bsfl %ebx, %edx ; X86-NEXT: addl $32, %edx +; X86-NEXT: jmp .LBB8_8 +; X86-NEXT: .LBB8_6: +; X86-NEXT: rep bsfl %edi, %edx ; X86-NEXT: .LBB8_8: # %cond.false -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: orl %esi, %edi ; X86-NEXT: jne .LBB8_10 ; X86-NEXT: # %bb.9: # %cond.false ; X86-NEXT: addl $64, %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: .LBB8_10: # %cond.false -; X86-NEXT: xorl %ebp, %ebp +; X86-NEXT: xorl %edi, %edi ; X86-NEXT: .LBB8_11: # %cond.end -; X86-NEXT: xorl %ebx, %ebx ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: jne .LBB8_13 -; X86-NEXT: # %bb.12: -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: .LBB8_13: # %cond.end -; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax -; X86-NEXT: movl %edx, 12(%eax) -; X86-NEXT: movl %ebx, 8(%eax) -; X86-NEXT: movl %ebp, 4(%eax) -; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: orl 32(%ebp), %ecx +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: je .LBB8_12 +; X86-NEXT: # %bb.13: # %cond.end +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: jmp .LBB8_14 +; X86-NEXT: .LBB8_12: +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: .LBB8_14: # %cond.end +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -361,46 +369,49 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X86-LABEL: cmov_bsf128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: orl %edi, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: orl %eax, %ebx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: je .LBB9_11 ; X86-NEXT: # %bb.1: # %select.true.sink ; X86-NEXT: testl %edx, %edx ; X86-NEXT: jne .LBB9_2 ; X86-NEXT: # %bb.3: # %select.true.sink -; X86-NEXT: rep bsfl %ecx, 
%edi -; X86-NEXT: addl $32, %edi -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: rep bsfl %ecx, %ebx +; X86-NEXT: addl $32, %ebx +; X86-NEXT: testl %edi, %edi ; X86-NEXT: je .LBB9_6 ; X86-NEXT: .LBB9_5: -; X86-NEXT: rep bsfl %ebx, %esi +; X86-NEXT: rep bsfl %edi, %esi ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: je .LBB9_8 ; X86-NEXT: jmp .LBB9_9 ; X86-NEXT: .LBB9_11: # %select.end -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 52(%ebp), %ecx +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 44(%ebp), %esi +; X86-NEXT: movl 40(%ebp), %edi ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) ; X86-NEXT: jmp .LBB9_10 ; X86-NEXT: .LBB9_2: -; X86-NEXT: rep bsfl %edx, %edi -; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: rep bsfl %edx, %ebx +; X86-NEXT: testl %edi, %edi ; X86-NEXT: jne .LBB9_5 ; X86-NEXT: .LBB9_6: # %select.true.sink ; X86-NEXT: rep bsfl %esi, %esi @@ -409,13 +420,14 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X86-NEXT: jne .LBB9_9 ; X86-NEXT: .LBB9_8: # %select.true.sink ; X86-NEXT: addl $64, %esi -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movl %esi, %ebx ; X86-NEXT: .LBB9_9: # %select.true.sink -; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %ebx, (%eax) ; X86-NEXT: movl $0, 12(%eax) ; X86-NEXT: movl $0, 8(%eax) ; X86-NEXT: movl $0, 4(%eax) ; X86-NEXT: .LBB9_10: # %select.true.sink +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll index fbca4af425eac..ab0478a4e944b 100644 --- a/llvm/test/CodeGen/X86/bsr.ll +++ b/llvm/test/CodeGen/X86/bsr.ll @@ -291,79 +291,80 @@ define i128 @cmov_bsr128(i128 %x, i128 %y) nounwind { ; X86-LABEL: cmov_bsr128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: 
pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: pushl %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: orl %ebp, %edx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: orl %edx, %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: orl %ebx, %edx +; X86-NEXT: orl %eax, %edx ; X86-NEXT: je .LBB8_1 ; X86-NEXT: # %bb.2: # %cond.false -; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: testl %esi, %esi ; X86-NEXT: jne .LBB8_3 ; X86-NEXT: # %bb.4: # %cond.false -; X86-NEXT: bsrl %ebx, %edx -; X86-NEXT: xorl $31, %edx -; X86-NEXT: orl $32, %edx +; X86-NEXT: bsrl %ebx, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi ; X86-NEXT: testl %edi, %edi ; X86-NEXT: je .LBB8_7 ; X86-NEXT: .LBB8_6: -; X86-NEXT: bsrl %edi, %esi -; X86-NEXT: xorl $31, %esi +; X86-NEXT: bsrl %edi, %eax +; X86-NEXT: xorl $31, %eax ; X86-NEXT: jmp .LBB8_8 ; X86-NEXT: .LBB8_1: -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl $128, %edx +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: movl $128, %esi ; X86-NEXT: jmp .LBB8_11 ; X86-NEXT: .LBB8_3: -; X86-NEXT: bsrl %ebp, %edx -; X86-NEXT: xorl $31, %edx +; X86-NEXT: bsrl %esi, %esi +; X86-NEXT: xorl $31, %esi ; X86-NEXT: testl %edi, %edi ; X86-NEXT: jne .LBB8_6 ; X86-NEXT: .LBB8_7: # %cond.false -; X86-NEXT: bsrl %ecx, %esi -; X86-NEXT: xorl $31, %esi -; X86-NEXT: orl $32, %esi +; X86-NEXT: bsrl %ecx, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: orl $32, %eax ; X86-NEXT: .LBB8_8: # %cond.false -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: orl %ebp, %ebx +; X86-NEXT: movl %ebx, %edx 
+; X86-NEXT: orl 36(%ebp), %edx ; X86-NEXT: jne .LBB8_10 ; X86-NEXT: # %bb.9: # %cond.false -; X86-NEXT: orl $64, %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: orl $64, %eax +; X86-NEXT: movl %eax, %esi ; X86-NEXT: .LBB8_10: # %cond.false -; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB8_11: # %cond.end -; X86-NEXT: xorl %esi, %esi ; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %ebp, %edi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: orl 32(%ebp), %ecx +; X86-NEXT: orl 36(%ebp), %edi ; X86-NEXT: orl %ecx, %edi ; X86-NEXT: je .LBB8_12 ; X86-NEXT: # %bb.13: # %cond.end -; X86-NEXT: xorl $127, %edx -; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: xorl $127, %esi +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: jmp .LBB8_14 ; X86-NEXT: .LBB8_12: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: movl 48(%ebp), %ebx +; X86-NEXT: movl 44(%ebp), %ecx +; X86-NEXT: movl 40(%ebp), %esi ; X86-NEXT: .LBB8_14: # %cond.end -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebx, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %ebx, 8(%eax) ; X86-NEXT: movl %ecx, 4(%eax) -; X86-NEXT: movl %edx, (%eax) -; X86-NEXT: addl $4, %esp +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -398,62 +399,67 @@ define i128 @cmov_bsr128_undef(i128 %x, i128 %y) nounwind { ; X86-LABEL: cmov_bsr128_undef: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: testl %edi, %edi +; X86-NEXT: 
andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: testl %eax, %eax ; X86-NEXT: jne .LBB9_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: bsrl %esi, %ecx -; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx +; X86-NEXT: bsrl %edi, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi ; X86-NEXT: jmp .LBB9_3 ; X86-NEXT: .LBB9_1: -; X86-NEXT: bsrl %edi, %ecx -; X86-NEXT: xorl $31, %ecx +; X86-NEXT: bsrl %eax, %esi +; X86-NEXT: xorl $31, %esi ; X86-NEXT: .LBB9_3: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl 24(%ebp), %ebx ; X86-NEXT: testl %edx, %edx ; X86-NEXT: jne .LBB9_4 ; X86-NEXT: # %bb.5: -; X86-NEXT: bsrl %ebx, %ebp -; X86-NEXT: xorl $31, %ebp -; X86-NEXT: orl $32, %ebp -; X86-NEXT: jmp .LBB9_6 +; X86-NEXT: bsrl %ebx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx +; X86-NEXT: orl %eax, %edi +; X86-NEXT: je .LBB9_7 +; X86-NEXT: jmp .LBB9_8 ; X86-NEXT: .LBB9_4: -; X86-NEXT: bsrl %edx, %ebp -; X86-NEXT: xorl $31, %ebp -; X86-NEXT: .LBB9_6: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl %edi, %esi +; X86-NEXT: bsrl %edx, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl %eax, %edi ; X86-NEXT: jne .LBB9_8 -; X86-NEXT: # %bb.7: -; X86-NEXT: orl $64, %ebp -; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: .LBB9_7: +; X86-NEXT: orl $64, %ecx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: .LBB9_8: -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: orl 32(%ebp), %ebx ; X86-NEXT: orl %edx, %ebx ; X86-NEXT: jne .LBB9_9 ; X86-NEXT: # %bb.10: -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl 52(%ebp), %edi +; X86-NEXT: movl 40(%ebp), %esi +; X86-NEXT: movl 44(%ebp), %ecx ; X86-NEXT: jmp .LBB9_11 ; X86-NEXT: .LBB9_9: -; X86-NEXT: xorl 
$127, %ecx +; X86-NEXT: xorl $127, %esi +; X86-NEXT: xorl %ecx, %ecx ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: xorl %esi, %esi ; X86-NEXT: xorl %edi, %edi ; X86-NEXT: .LBB9_11: +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %esi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/bswap-wide-int.ll b/llvm/test/CodeGen/X86/bswap-wide-int.ll index 6d5e995a6d574..673b7f16de75c 100644 --- a/llvm/test/CodeGen/X86/bswap-wide-int.ll +++ b/llvm/test/CodeGen/X86/bswap-wide-int.ll @@ -41,13 +41,16 @@ define i64 @bswap_i64(i64 %a0) nounwind { define i128 @bswap_i128(i128 %a0) nounwind { ; X86-LABEL: bswap_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %edx +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %edi ; X86-NEXT: bswapl %edi ; X86-NEXT: bswapl %esi ; X86-NEXT: bswapl %edx @@ -56,25 +59,32 @@ define i128 @bswap_i128(i128 %a0) nounwind { ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X86-MOVBE-LABEL: bswap_i128: ; X86-MOVBE: # %bb.0: +; X86-MOVBE-NEXT: pushl %ebp +; X86-MOVBE-NEXT: movl %esp, %ebp ; X86-MOVBE-NEXT: pushl %edi ; X86-MOVBE-NEXT: pushl %esi -; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-MOVBE-NEXT: 
movl {{[0-9]+}}(%esp), %ecx -; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-MOVBE-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-MOVBE-NEXT: andl $-16, %esp +; X86-MOVBE-NEXT: movl 8(%ebp), %eax +; X86-MOVBE-NEXT: movl 32(%ebp), %ecx +; X86-MOVBE-NEXT: movl 36(%ebp), %edx +; X86-MOVBE-NEXT: movl 24(%ebp), %esi +; X86-MOVBE-NEXT: movl 28(%ebp), %edi ; X86-MOVBE-NEXT: movbel %esi, 12(%eax) ; X86-MOVBE-NEXT: movbel %edi, 8(%eax) ; X86-MOVBE-NEXT: movbel %ecx, 4(%eax) ; X86-MOVBE-NEXT: movbel %edx, (%eax) +; X86-MOVBE-NEXT: leal -8(%ebp), %esp ; X86-MOVBE-NEXT: popl %esi ; X86-MOVBE-NEXT: popl %edi +; X86-MOVBE-NEXT: popl %ebp ; X86-MOVBE-NEXT: retl $4 ; ; X64-LABEL: bswap_i128: diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index d869f8ec01a5a..661e7bb19641c 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -152,17 +152,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $176, %esp -; X86-NEXT: movl 20(%ebp), %edx -; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: movl %ecx, %edi ; X86-NEXT: xorl %eax, %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %edx ; X86-NEXT: xorl %eax, %edx -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %ecx ; X86-NEXT: xorl %eax, %ecx ; X86-NEXT: subl %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -172,16 +172,16 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %ecx ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 48(%ebp), %ecx ; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: movl 44(%ebp), %ebx ; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl 40(%ebp), %edi ; X86-NEXT: xorl %edx, %edi ; X86-NEXT: subl %edx, %edi ; X86-NEXT: sbbl %edx, %ebx @@ -488,13 +488,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sbbl %ecx, %ebx ; X86-NEXT: sbbl %ecx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%ebp), %ecx +; X86-NEXT: movl 56(%ebp), %ecx ; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: movl %eax, 4(%ecx) ; X86-NEXT: movl %ebx, 8(%ecx) ; X86-NEXT: movl %esi, 12(%ecx) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: movl 40(%ebp), %ecx ; X86-NEXT: movl %ebx, %edi ; X86-NEXT: movl %edx, %esi ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -508,7 +508,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 44(%ebp), %esi ; X86-NEXT: mull %esi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -523,17 +523,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: adcl %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl 40(%ebp), %eax ; X86-NEXT: imull %eax, %ebx ; X86-NEXT: mull %edi ; X86-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: imull %esi, %edi ; X86-NEXT: addl %edx, %edi ; X86-NEXT: addl %ebx, %edi -; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 40(%ebp), %ebx +; X86-NEXT: movl 52(%ebp), %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: imull %edx, %ebx ; X86-NEXT: mull %edx @@ -543,13 +543,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl 12(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %edx ; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: movl 32(%ebp), %edi ; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %esi ; X86-NEXT: sbbl %ebx, %esi ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edx, (%eax) diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index db6136c4a2b28..370e1c608e44f 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -152,26 +152,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp ; X86-NEXT: subl $160, %esp -; X86-NEXT: movl 28(%ebp), %ebx -; X86-NEXT: movl 40(%ebp), %esi -; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 40(%ebp), %ebx +; X86-NEXT: movl 52(%ebp), %esi +; X86-NEXT: movl 44(%ebp), %edi ; X86-NEXT: movl %edi, %eax ; X86-NEXT: orl %esi, %eax ; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: orl 36(%ebp), 
%ecx +; X86-NEXT: orl 48(%ebp), %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: sete %cl -; X86-NEXT: movl 16(%ebp), %eax -; X86-NEXT: orl 24(%ebp), %eax -; X86-NEXT: movl 12(%ebp), %edx -; X86-NEXT: orl 20(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: orl 36(%ebp), %eax +; X86-NEXT: movl 24(%ebp), %edx +; X86-NEXT: orl 32(%ebp), %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: bsrl %esi, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl 36(%ebp), %ecx +; X86-NEXT: bsrl 48(%ebp), %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %esi, %esi @@ -184,28 +184,28 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %eax ; X86-NEXT: addl $64, %eax -; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: movl 48(%ebp), %edx ; X86-NEXT: orl %esi, %edx ; X86-NEXT: cmovnel %ecx, %eax -; X86-NEXT: movl 24(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %ebx ; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: movl 32(%ebp), %ecx ; X86-NEXT: bsrl %ecx, %ecx ; X86-NEXT: xorl $31, %ecx ; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx -; X86-NEXT: movl 16(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %edi ; X86-NEXT: bsrl %edi, %esi ; X86-NEXT: xorl $31, %esi -; X86-NEXT: bsrl 12(%ebp), %edx +; X86-NEXT: bsrl 24(%ebp), %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: addl $32, %edx ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %esi, %edx ; X86-NEXT: addl $64, %edx -; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %esi ; X86-NEXT: orl %ebx, %esi ; X86-NEXT: cmovnel %ecx, %edx ; X86-NEXT: xorl %edi, %edi @@ -230,15 +230,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sbbl %esi, %edx ; X86-NEXT: setb %dl ; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %dl # 1-byte Folded Reload 
-; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 36(%ebp), %eax ; X86-NEXT: cmovnel %edi, %eax -; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %esi ; X86-NEXT: cmovnel %edi, %esi -; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %edx ; X86-NEXT: cmovnel %edi, %edx -; X86-NEXT: movl 12(%ebp), %ebx +; X86-NEXT: movl 24(%ebp), %ebx ; X86-NEXT: cmovnel %edi, %ebx -; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: movl 56(%ebp), %edi ; X86-NEXT: jne .LBB4_8 ; X86-NEXT: # %bb.1: # %_udiv-special-cases ; X86-NEXT: movl %eax, %edi @@ -249,18 +249,18 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl 44(%ebp), %edi -; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 56(%ebp), %edi +; X86-NEXT: movl 24(%ebp), %ecx ; X86-NEXT: je .LBB4_8 ; X86-NEXT: # %bb.2: # %udiv-bb1 ; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: xorps %xmm0, %xmm0 ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 20(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 36(%ebp), %eax ; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: movl %eax, %ecx @@ -293,13 +293,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: jmp .LBB4_7 ; X86-NEXT: .LBB4_3: # %udiv-preheader ; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: movl 24(%ebp), %edi ; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl 16(%ebp), %edi +; X86-NEXT: movl 28(%ebp), %edi ; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: movl 32(%ebp), %edi ; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: movl 36(%ebp), %edi ; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -326,16 +326,16 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shrdl %cl, %eax, %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl 40(%ebp), %eax ; X86-NEXT: addl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl 44(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: movl 52(%ebp), %eax ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill @@ -378,12 +378,12 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: andl $1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %ecx, %esi -; X86-NEXT: andl 40(%ebp), %esi +; X86-NEXT: andl 52(%ebp), %esi ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: andl 36(%ebp), %eax +; X86-NEXT: andl 48(%ebp), %eax ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: andl 32(%ebp), %edx -; X86-NEXT: andl 28(%ebp), %ecx +; X86-NEXT: andl 44(%ebp), %edx +; X86-NEXT: andl 40(%ebp), %ecx ; X86-NEXT: subl %ecx, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %edx, %ebx @@ -413,7 +413,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ebx # 4-byte Reload ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: movl 56(%ebp), %edi ; X86-NEXT: .LBB4_7: # %udiv-loop-exit ; X86-NEXT: shldl $1, %esi, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -432,23 +432,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %esi, 8(%edi) ; X86-NEXT: movl %eax, 12(%edi) ; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %eax ; X86-NEXT: movl %eax, %esi ; X86-NEXT: imull %edx, %esi ; X86-NEXT: mull %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: addl %esi, %edx -; X86-NEXT: movl 40(%ebp), %edi +; X86-NEXT: movl 52(%ebp), %edi ; X86-NEXT: imull %ebx, %edi ; X86-NEXT: addl %edx, %edi -; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl 40(%ebp), %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: mull %ebx ; X86-NEXT: movl %eax, %esi -; X86-NEXT: imull 28(%ebp), %ecx +; X86-NEXT: imull 40(%ebp), %ecx ; X86-NEXT: addl %edx, %ecx -; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl 44(%ebp), %eax ; X86-NEXT: imull %eax, %ebx ; X86-NEXT: addl %ecx, %ebx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload @@ -457,7 +457,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: movl 40(%ebp), %ecx ; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -468,26 +468,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: addl %edi, %ecx ; X86-NEXT: adcl $0, %ebx ; X86-NEXT: movl %esi, %eax -; X86-NEXT: mull 32(%ebp) -; X86-NEXT: movl 16(%ebp), %esi +; X86-NEXT: mull 
44(%ebp) +; X86-NEXT: movl 28(%ebp), %esi ; X86-NEXT: movl %edx, %edi ; X86-NEXT: addl %ecx, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: adcl %ebx, %edi ; X86-NEXT: setb %cl ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: mull 32(%ebp) +; X86-NEXT: mull 44(%ebp) ; X86-NEXT: addl %edi, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 12(%ebp), %ebx +; X86-NEXT: movl 24(%ebp), %ebx ; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 20(%ebp), %edi +; X86-NEXT: movl 32(%ebp), %edi ; X86-NEXT: sbbl %eax, %edi -; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ebx, (%eax) diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll index 707b05f3478db..bb5640aeb66fa 100644 --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -481,18 +481,21 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixtfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -501,7 +504,7 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -620,18 +623,21 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixunstfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -640,7 +646,7 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, 
%esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -818,18 +824,21 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floattitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -838,7 +847,7 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 @@ -1016,18 +1025,21 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp { ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floatuntitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1036,7 +1048,7 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp { ; X86-NEXT: movl %eax, (%esi) ; X86-NEXT: movl %ecx, 4(%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: retl $4 diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll index 1de2484d47ba1..6d4ec063ccd46 100644 --- a/llvm/test/CodeGen/X86/fp128-cast.ll +++ b/llvm/test/CodeGen/X86/fp128-cast.ll @@ -415,16 +415,20 @@ define dso_local void @TestFPToSIF128_I128() nounwind { ; X86-LABEL: TestFPToSIF128_I128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vf128, %eax +; X86-NEXT: movl vf128+4, %ecx +; X86-NEXT: movl vf128+8, %edx +; X86-NEXT: movl vf128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixtfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -432,7 +436,7 @@ define dso_local void @TestFPToSIF128_I128() nounwind { ; X86-NEXT: movl %edx, vi128+8 ; X86-NEXT: movl %ecx, vi128+4 ; X86-NEXT: movl %eax, vi128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -466,16 +470,20 @@ define dso_local void @TestFPToUIF128_U128() nounwind { ; X86-LABEL: TestFPToUIF128_U128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vf128, %eax +; X86-NEXT: movl vf128+4, %ecx +; X86-NEXT: movl vf128+8, %edx +; X86-NEXT: movl vf128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __fixunstfti -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -483,7 +491,7 @@ define dso_local void @TestFPToUIF128_U128() nounwind { ; X86-NEXT: movl %edx, vu128+8 ; X86-NEXT: movl %ecx, vu128+4 ; X86-NEXT: movl %eax, vu128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -913,16 +921,20 @@ define dso_local void @TestSIToFPI128_F128() nounwind { ; X86-LABEL: TestSIToFPI128_F128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vi128, %eax +; X86-NEXT: movl vi128+4, %ecx +; X86-NEXT: movl vi128+8, %edx +; X86-NEXT: movl vi128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl 
%edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vi128+12 -; X86-NEXT: pushl vi128+8 -; X86-NEXT: pushl vi128+4 -; X86-NEXT: pushl vi128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floattitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -930,7 +942,7 @@ define dso_local void @TestSIToFPI128_F128() nounwind { ; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 ; X86-NEXT: movl %eax, vf128 -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -964,16 +976,20 @@ define dso_local void @TestUIToFPU128_F128() #2 { ; X86-LABEL: TestUIToFPU128_F128: ; X86: # %bb.0: # %entry ; X86-NEXT: pushl %esi -; X86-NEXT: subl $36, %esp +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl vu128, %eax +; X86-NEXT: movl vu128+4, %ecx +; X86-NEXT: movl vu128+8, %edx +; X86-NEXT: movl vu128+12, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl vu128+12 -; X86-NEXT: pushl vu128+8 -; X86-NEXT: pushl vu128+4 -; X86-NEXT: pushl vu128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __floatuntitf -; X86-NEXT: addl $28, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -981,7 +997,7 @@ define dso_local void @TestUIToFPU128_F128() #2 { ; X86-NEXT: movl %edx, vf128+8 ; X86-NEXT: movl %ecx, vf128+4 ; X86-NEXT: movl %eax, vf128 -; 
X86-NEXT: addl $24, %esp +; X86-NEXT: addl $56, %esp ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -1134,33 +1150,30 @@ define dso_local i32 @TestBits128(fp128 %ld) nounwind { ; ; X86-LABEL: TestBits128: ; X86: # %bb.0: # %entry -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $20, %esp +; X86-NEXT: subl $72, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl $12, %esp -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %edx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: orl (%esp), %ecx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: sete %al -; X86-NEXT: addl $20, %esp +; X86-NEXT: addl $72, %esp ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-AVX-LABEL: TestBits128: @@ -1359,12 +1372,14 @@ define i1 @PR34866(i128 %x) nounwind { ; ; X86-LABEL: PR34866: ; X86: # %bb.0: +; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al +; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; 
X64-AVX-LABEL: PR34866: @@ -1394,12 +1409,14 @@ define i1 @PR34866_commute(i128 %x) nounwind { ; ; X86-LABEL: PR34866_commute: ; X86: # %bb.0: +; X86-NEXT: subl $12, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al +; X86-NEXT: addl $12, %esp ; X86-NEXT: retl ; ; X64-AVX-LABEL: PR34866_commute: diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index a7eea04181f60..ad2d690fd7ed0 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -41,27 +41,40 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: add: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: add: @@ -81,24 +94,32 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___addtf3 -; WIN-X86-NEXT: 
addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -107,9 +128,10 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -141,27 +163,40 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: sub: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; 
X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: sub: @@ -181,24 +216,32 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___subtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: 
movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -207,9 +250,10 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -241,27 +285,40 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: mul: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, 
%esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: mul: @@ -281,24 +338,32 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___multf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl 
{{[0-9]+}}(%esp), %edi @@ -307,9 +372,10 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -341,27 +407,40 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: div: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: 
movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: div: @@ -381,24 +460,32 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___divtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -407,9 +494,10 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp { ; 
WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -434,31 +522,48 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; ; X86-LABEL: fma: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $92, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmaf128 -; X86-NEXT: addl $60, %esp -; X86-NEXT: movaps (%esp), %xmm0 -; X86-NEXT: movaps %xmm0, (%esi) -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%ebp) +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: addl $92, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: fma: @@ -481,28 +586,40 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $96, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 56(%ebp) -; WIN-X86-NEXT: pushl 52(%ebp) -; WIN-X86-NEXT: pushl 48(%ebp) -; WIN-X86-NEXT: pushl 44(%ebp) -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 52(%ebp), %ebx +; WIN-X86-NEXT: movl 56(%ebp), %edi +; WIN-X86-NEXT: movl 60(%ebp), %edx +; WIN-X86-NEXT: movl 64(%ebp), %ecx +; WIN-X86-NEXT: movl 68(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 48(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: 
movl 44(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 40(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmal -; WIN-X86-NEXT: addl $52, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -511,9 +628,10 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -538,27 +656,40 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: frem: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl 
%edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmodf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: frem: @@ -578,24 +709,32 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmodl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -604,9 +743,10 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -631,23 +771,28 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; ; X86-LABEL: ceil: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll 
ceilf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: ceil: @@ -667,17 +812,20 @@ define fp128 @ceil(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _ceill -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -713,23 +861,28 @@ define fp128 @acos(fp128 %x) nounwind strictfp { ; ; X86-LABEL: acos: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll acosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: acos: @@ -749,17 +902,20 @@ define fp128 @acos(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _acosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -795,23 +951,28 @@ define fp128 @cos(fp128 %x) nounwind strictfp { ; ; X86-LABEL: cos: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; 
X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll cosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: cos: @@ -831,17 +992,20 @@ define fp128 @cos(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _cosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; 
WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -877,23 +1041,28 @@ define fp128 @cosh(fp128 %x) nounwind strictfp { ; ; X86-LABEL: cosh: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll coshf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: cosh: @@ -913,17 +1082,20 @@ define fp128 @cosh(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; 
WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _coshl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -959,23 +1131,28 @@ define fp128 @exp(fp128 %x) nounwind strictfp { ; ; X86-LABEL: exp: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll expf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: exp: @@ -995,17 +1172,20 @@ define fp128 @exp(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), 
%esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _expl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1041,23 +1221,28 @@ define fp128 @exp2(fp128 %x) nounwind strictfp { ; ; X86-LABEL: exp2: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll exp2f128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl 
$52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: exp2: @@ -1077,17 +1262,20 @@ define fp128 @exp2(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _exp2l -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1123,23 +1311,28 @@ define fp128 @floor(fp128 %x) nounwind strictfp { ; ; X86-LABEL: floor: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; 
X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll floorf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: floor: @@ -1159,17 +1352,20 @@ define fp128 @floor(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _floorl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1205,23 +1401,28 @@ define fp128 @log(fp128 %x) nounwind strictfp { ; ; X86-LABEL: log: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll logf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: log: @@ -1241,17 +1442,20 @@ define fp128 @log(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _logl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1287,23 +1491,28 @@ define fp128 @log10(fp128 %x) nounwind strictfp 
{ ; ; X86-LABEL: log10: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll log10f128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: log10: @@ -1323,17 +1532,20 @@ define fp128 @log10(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl 
%eax, (%esp) ; WIN-X86-NEXT: calll _log10l -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1369,23 +1581,28 @@ define fp128 @log2(fp128 %x) nounwind strictfp { ; ; X86-LABEL: log2: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll log2f128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: log2: @@ -1405,17 +1622,20 @@ define fp128 @log2(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; 
WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _log2l -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1451,27 +1671,40 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: maxnum: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl 
%eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmaxf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: maxnum: @@ -1491,24 +1724,32 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmaxl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1517,9 +1758,10 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1544,27 +1786,40 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: minnum: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fminf128 -; X86-NEXT: addl $44, %esp 
-; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: minnum: @@ -1584,24 +1839,32 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fminl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: 
movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1610,9 +1873,10 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1637,23 +1901,28 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp { ; ; X86-LABEL: nearbyint: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll nearbyintf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: nearbyint: @@ -1673,17 +1942,20 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, 
%eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _nearbyintl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1719,27 +1991,40 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: pow: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; 
X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll powf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: pow: @@ -1759,24 +2044,32 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; 
WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _powl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1785,9 +2078,10 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1819,24 +2113,32 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp { ; ; X86-LABEL: powi: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $64, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __powitf2 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; 
X86-NEXT: addl $64, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl $4 ; ; WIN-LABEL: powi: @@ -1853,21 +2155,26 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___powitf2 -; WIN-X86-NEXT: addl $24, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1876,9 +2183,10 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1903,23 +2211,28 @@ define fp128 @rint(fp128 %x) nounwind strictfp { ; ; X86-LABEL: rint: ; X86: # 
%bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll rintf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: rint: @@ -1939,17 +2252,20 @@ define fp128 @rint(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll 
_rintl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1985,23 +2301,28 @@ define fp128 @round(fp128 %x) nounwind strictfp { ; ; X86-LABEL: round: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll roundf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: round: @@ -2021,17 +2342,20 @@ define fp128 @round(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; 
WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _roundl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2067,23 +2391,28 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp { ; ; X86-LABEL: roundeven: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll roundevenf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: roundeven: @@ -2103,17 +2432,20 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl 
$-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _roundevenl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2149,23 +2481,28 @@ define fp128 @asin(fp128 %x) nounwind strictfp { ; ; X86-LABEL: asin: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll asinf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), 
%xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: asin: @@ -2185,17 +2522,20 @@ define fp128 @asin(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _asinl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2231,23 +2571,28 @@ define fp128 @sin(fp128 %x) nounwind strictfp { ; ; X86-LABEL: sin: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; 
X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sinf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: sin: @@ -2267,17 +2612,20 @@ define fp128 @sin(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sinl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2313,23 +2661,28 @@ define fp128 @sinh(fp128 %x) nounwind strictfp { ; ; X86-LABEL: sinh: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sinhf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: sinh: @@ -2349,17 +2702,20 @@ define fp128 @sinh(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sinhl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; 
WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2395,23 +2751,28 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp { ; ; X86-LABEL: sqrt: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sqrtf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: sqrt: @@ -2431,17 +2792,20 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sqrtl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2477,23 +2841,28 @@ define fp128 @atan(fp128 %x) nounwind strictfp { ; ; X86-LABEL: atan: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll atanf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: atan: @@ -2513,17 +2882,20 @@ define fp128 @atan(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 
16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _atanl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2559,27 +2931,40 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp { ; ; X86-LABEL: atan2: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll atan2f128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: atan2: @@ -2599,24 +2984,32 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; 
WIN-X86-NEXT: calll _atan2l -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2625,9 +3018,10 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp { ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -2652,23 +3046,28 @@ define fp128 @tan(fp128 %x) nounwind strictfp { ; ; X86-LABEL: tan: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll tanf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: tan: @@ -2688,17 +3087,20 @@ define fp128 @tan(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: 
pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _tanl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2734,23 +3136,28 @@ define fp128 @tanh(fp128 %x) nounwind strictfp { ; ; X86-LABEL: tanh: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll tanhf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: 
movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: tanh: @@ -2770,17 +3177,20 @@ define fp128 @tanh(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _tanhl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2816,23 +3226,28 @@ define fp128 @trunc(fp128 %x) nounwind strictfp { ; ; X86-LABEL: trunc: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll truncf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: trunc: @@ -2852,17 +3267,20 @@ define fp128 @trunc(fp128 %x) nounwind strictfp { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _truncl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2919,12 +3337,18 @@ define i32 @lrint(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: lrint: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl 
{{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _lrintl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %rint = call i32 @llvm.experimental.constrained.lrint.i32.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -2969,12 +3393,18 @@ define i64 @llrint(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: llrint: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _llrintl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %rint = call i64 @llvm.experimental.constrained.llrint.i64.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 @@ -3019,12 +3449,18 @@ define i32 @lround(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: lround: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _lroundl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: 
movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %round = call i32 @llvm.experimental.constrained.lround.i32.f128(fp128 %x, metadata !"fpexcept.strict") #0 @@ -3069,12 +3505,18 @@ define i64 @llround(fp128 %x) nounwind strictfp { ; ; WIN-X86-LABEL: llround: ; WIN-X86: # %bb.0: # %entry -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 20(%ebp) +; WIN-X86-NEXT: pushl 16(%ebp) +; WIN-X86-NEXT: pushl 12(%ebp) +; WIN-X86-NEXT: pushl 8(%ebp) ; WIN-X86-NEXT: calll _llroundl ; WIN-X86-NEXT: addl $16, %esp +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: %round = call i64 @llvm.experimental.constrained.llround.i64.f128(fp128 %x, metadata !"fpexcept.strict") #0 @@ -3176,26 +3618,32 @@ define i64 @cmp(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; ; WIN-X86-LABEL: cmp: ; WIN-X86: # %bb.0: -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) +; WIN-X86-NEXT: pushl 36(%ebp) +; WIN-X86-NEXT: pushl 32(%ebp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: je LBB37_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx 
+; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB37_3 ; WIN-X86-NEXT: LBB37_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB37_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl %cond = call i1 @llvm.experimental.constrained.fcmp.f128( fp128 %x, fp128 %y, @@ -3300,26 +3748,32 @@ define i64 @cmps(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; ; WIN-X86-LABEL: cmps: ; WIN-X86: # %bb.0: -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) +; WIN-X86-NEXT: pushl 36(%ebp) +; WIN-X86-NEXT: pushl 32(%ebp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: je LBB38_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB38_3 ; WIN-X86-NEXT: LBB38_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB38_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: movl %ebp, %esp +; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl %cond = call i1 @llvm.experimental.constrained.fcmps.f128( fp128 %x, fp128 %y, @@ -3496,44 +3950,47 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; WIN-X86-LABEL: cmp_ueq_q: ; WIN-X86: # 
%bb.0: ; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: movl 32(%ebp), %edi +; WIN-X86-NEXT: movl 36(%ebp), %esi +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: sete %bl -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___unordtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: orb %bl, %al ; WIN-X86-NEXT: jne LBB39_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB39_3 ; WIN-X86-NEXT: LBB39_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB39_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: leal -12(%ebp), %esp ; 
WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi ; WIN-X86-NEXT: popl %ebx @@ -3716,32 +4173,34 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; WIN-X86-LABEL: cmp_one_q: ; WIN-X86: # %bb.0: ; WIN-X86-NEXT: pushl %ebp +; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: andl $-16, %esp +; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: movl 32(%ebp), %edi +; WIN-X86-NEXT: movl 36(%ebp), %esi +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___eqtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax ; WIN-X86-NEXT: setne %bl -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 52(%ebp) +; WIN-X86-NEXT: pushl 48(%ebp) +; WIN-X86-NEXT: pushl 44(%ebp) +; WIN-X86-NEXT: pushl 40(%ebp) ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: pushl %edi -; WIN-X86-NEXT: pushl %ebp -; WIN-X86-NEXT: pushl {{[0-9]+}}(%esp) +; WIN-X86-NEXT: pushl 28(%ebp) +; WIN-X86-NEXT: pushl 24(%ebp) ; WIN-X86-NEXT: calll ___unordtf2 ; WIN-X86-NEXT: addl $32, %esp ; WIN-X86-NEXT: testl %eax, %eax @@ -3749,13 +4208,14 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 { ; WIN-X86-NEXT: testb %bl, %al ; WIN-X86-NEXT: jne LBB40_1 ; WIN-X86-NEXT: # %bb.2: -; WIN-X86-NEXT: leal 
{{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 16(%ebp), %ecx ; WIN-X86-NEXT: jmp LBB40_3 ; WIN-X86-NEXT: LBB40_1: -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx +; WIN-X86-NEXT: leal 8(%ebp), %ecx ; WIN-X86-NEXT: LBB40_3: ; WIN-X86-NEXT: movl (%ecx), %eax ; WIN-X86-NEXT: movl 4(%ecx), %edx +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi ; WIN-X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll index f727a79078627..4b0449fd7502e 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll @@ -42,22 +42,38 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Add: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; 
X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Add: @@ -78,22 +94,31 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___addtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; 
WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -101,8 +126,10 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -144,22 +171,38 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Add: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __addtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; 
X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Add: @@ -180,22 +223,31 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___addtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -203,8 +255,10 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; 
WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -241,22 +295,38 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Sub: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sub: @@ -277,22 +347,31 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # 
%entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___subtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -300,8 +379,10 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -343,22 +424,38 @@ 
define dso_local void @Test128_1Sub(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Sub: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __subtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Sub: @@ -379,22 +476,31 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; 
WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___subtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -402,8 +508,10 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -440,22 +548,38 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Mul: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Mul: @@ -476,22 +600,31 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; 
WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___multf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -499,8 +632,10 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -542,22 +677,38 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Mul: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, 
%ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __multf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Mul: @@ -578,22 +729,31 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; 
WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___multf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -601,8 +761,10 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -639,22 +801,38 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Div: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Div: @@ -675,22 +853,31 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 
%edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___divtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -698,8 +885,10 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -741,22 +930,38 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Div: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll __divtf3 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Div: @@ -777,22 +982,31 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; 
WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll ___divtf3 -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -800,8 +1014,10 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -830,22 +1046,38 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; ; X86-LABEL: Test128Rem: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; 
X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmodf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128Rem: @@ -866,22 +1098,31 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %edi +; WIN-X86-NEXT: movl 28(%ebp), %ebx +; WIN-X86-NEXT: movl 32(%ebp), %ecx +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmodl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -889,8 +1130,10 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+8 ; WIN-X86-NEXT: movl %ecx, _vf128+4 ; WIN-X86-NEXT: movl %eax, _vf128 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -922,22 +1165,38 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; ; X86-LABEL: Test128_1Rem: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: subl $76, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl vf128, %edi +; X86-NEXT: movl vf128+4, %ebx +; X86-NEXT: movl vf128+8, %ebp +; X86-NEXT: movl vf128+12, %eax +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl vf128+12 -; X86-NEXT: pushl vf128+8 -; X86-NEXT: pushl vf128+4 -; X86-NEXT: pushl vf128 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmodf128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, 
vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $76, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; WIN-LABEL: Test128_1Rem: @@ -958,22 +1217,31 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx +; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl _vf128+12 -; WIN-X86-NEXT: pushl _vf128+8 -; WIN-X86-NEXT: pushl _vf128+4 -; WIN-X86-NEXT: pushl _vf128 -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $80, %esp +; WIN-X86-NEXT: movl 16(%ebp), %esi +; WIN-X86-NEXT: movl 20(%ebp), %edi +; WIN-X86-NEXT: movl _vf128, %edx +; WIN-X86-NEXT: movl _vf128+4, %ebx +; WIN-X86-NEXT: movl _vf128+8, %ecx +; WIN-X86-NEXT: movl _vf128+12, %eax +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 12(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmodl -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -981,8 +1249,10 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %edx, _vf128+12 ; 
WIN-X86-NEXT: movl %eax, _vf128 ; WIN-X86-NEXT: movl %ecx, _vf128+4 -; WIN-X86-NEXT: leal -4(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi +; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1011,18 +1281,24 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Sqrt: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sqrtf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sqrt: @@ -1042,16 +1318,19 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; 
WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sqrtl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1089,18 +1368,24 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Sin: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sinf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Sin: @@ -1120,16 +1405,19 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, 
%esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sinl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1167,18 +1455,24 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Cos: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll cosf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Cos: @@ -1198,16 +1492,19 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: 
movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _cosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1245,18 +1542,24 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Ceil: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll ceilf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Ceil: @@ -1276,16 +1579,19 
@@ define dso_local void @Test128Ceil(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _ceill -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1323,18 +1629,24 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Floor: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll floorf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: 
movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Floor: @@ -1354,16 +1666,19 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _floorl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1401,18 +1716,24 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Trunc: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: 
pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll truncf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Trunc: @@ -1432,16 +1753,19 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _truncl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1479,18 +1803,24 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Nearbyint: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll nearbyintf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Nearbyint: @@ -1510,16 +1840,19 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _nearbyintl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1557,18 +1890,24 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Rint: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll rintf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Rint: @@ -1588,16 +1927,19 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _rintl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1635,18 +1977,24 @@ define dso_local void 
@Test128Round(fp128 %d1) nounwind { ; ; X86-LABEL: Test128Round: ; X86: # %bb.0: # %entry -; X86-NEXT: subl $40, %esp +; X86-NEXT: pushl %esi +; X86-NEXT: subl $56, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll roundf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, vf128 -; X86-NEXT: addl $28, %esp +; X86-NEXT: addl $56, %esp +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; WIN-LABEL: Test128Round: @@ -1666,16 +2014,19 @@ define dso_local void @Test128Round(fp128 %d1) nounwind { ; WIN-X86-NEXT: movl %esp, %ebp ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $32, %esp -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl 8(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: movl 8(%ebp), %eax +; WIN-X86-NEXT: movl 12(%ebp), %ecx +; WIN-X86-NEXT: movl 16(%ebp), %edx +; WIN-X86-NEXT: movl 20(%ebp), %esi +; WIN-X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _roundl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; 
WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %esi @@ -1705,31 +2056,48 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; ; X86-LABEL: Test128FMA: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $92, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; 
X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll fmaf128 -; X86-NEXT: addl $60, %esp -; X86-NEXT: movaps (%esp), %xmm0 -; X86-NEXT: movaps %xmm0, (%esi) -; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%ebp) +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: addl $92, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128FMA: @@ -1752,28 +2120,40 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; WIN-X86: # %bb.0: # %entry ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $96, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 56(%ebp) -; WIN-X86-NEXT: pushl 52(%ebp) -; WIN-X86-NEXT: pushl 48(%ebp) -; WIN-X86-NEXT: pushl 44(%ebp) -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 52(%ebp), %ebx +; WIN-X86-NEXT: movl 56(%ebp), %edi +; WIN-X86-NEXT: movl 60(%ebp), %edx +; WIN-X86-NEXT: movl 64(%ebp), %ecx +; WIN-X86-NEXT: movl 68(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 48(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 44(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 40(%ebp), %eax +; WIN-X86-NEXT: movl 
%eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 36(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _fmal -; WIN-X86-NEXT: addl $52, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1782,9 +2162,10 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind { ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx ; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl entry: @@ -1804,23 +2185,28 @@ define fp128 @Test128Acos(fp128 %a) nounwind { ; ; X86-LABEL: Test128Acos: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll acosf128 -; X86-NEXT: addl 
$28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Acos: @@ -1840,17 +2226,20 @@ define fp128 @Test128Acos(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _acosl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1879,23 +2268,28 @@ define fp128 @Test128Asin(fp128 %a) nounwind { ; ; X86-LABEL: Test128Asin: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: 
movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll asinf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Asin: @@ -1915,17 +2309,20 @@ define fp128 @Test128Asin(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _asinl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -1954,23 +2351,28 @@ define fp128 @Test128Atan(fp128 %a) nounwind { ; ; X86-LABEL: Test128Atan: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; 
X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll atanf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Atan: @@ -1990,17 +2392,20 @@ define fp128 @Test128Atan(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _atanl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2029,27 +2434,40 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind { ; ; X86-LABEL: Test128Atan2: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $76, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebp, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll atan2f128 -; X86-NEXT: addl $44, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $76, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Atan2: @@ -2069,24 +2487,32 @@ define fp128 
@Test128Atan2(fp128 %a, fp128 %b) nounwind { ; WIN-X86: # %bb.0: ; WIN-X86-NEXT: pushl %ebp ; WIN-X86-NEXT: movl %esp, %ebp +; WIN-X86-NEXT: pushl %ebx ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $80, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 40(%ebp) -; WIN-X86-NEXT: pushl 36(%ebp) -; WIN-X86-NEXT: pushl 32(%ebp) -; WIN-X86-NEXT: pushl 28(%ebp) -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl 40(%ebp), %ebx +; WIN-X86-NEXT: movl 44(%ebp), %edx +; WIN-X86-NEXT: movl 48(%ebp), %ecx +; WIN-X86-NEXT: movl 52(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 32(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 28(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _atan2l -; WIN-X86-NEXT: addl $36, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2095,9 +2521,10 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind { ; WIN-X86-NEXT: movl %ecx, 4(%esi) ; WIN-X86-NEXT: movl %eax, (%esi) ; WIN-X86-NEXT: movl %esi, %eax -; WIN-X86-NEXT: leal -8(%ebp), %esp +; WIN-X86-NEXT: leal -12(%ebp), %esp ; WIN-X86-NEXT: popl %esi ; WIN-X86-NEXT: popl %edi +; WIN-X86-NEXT: popl %ebx 
; WIN-X86-NEXT: popl %ebp ; WIN-X86-NEXT: retl %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b) @@ -2115,23 +2542,28 @@ define fp128 @Test128Cosh(fp128 %a) nounwind { ; ; X86-LABEL: Test128Cosh: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll coshf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Cosh: @@ -2151,17 +2583,20 @@ define fp128 @Test128Cosh(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, 
{{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _coshl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2190,23 +2625,28 @@ define fp128 @Test128Sinh(fp128 %a) nounwind { ; ; X86-LABEL: Test128Sinh: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll sinhf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Sinh: @@ -2226,17 +2666,20 @@ define fp128 @Test128Sinh(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 
24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _sinhl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2265,23 +2708,28 @@ define fp128 @Test128Tan(fp128 %a) nounwind { ; ; X86-LABEL: Test128Tan: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll tanf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: 
retl $4 ; ; WIN-LABEL: Test128Tan: @@ -2301,17 +2749,20 @@ define fp128 @Test128Tan(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _tanl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2340,23 +2791,28 @@ define fp128 @Test128Tanh(fp128 %a) nounwind { ; ; X86-LABEL: Test128Tanh: ; X86: # %bb.0: +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $24, %esp +; X86-NEXT: subl $52, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $12, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %eax +; X86-NEXT: movl 
%eax, (%esp) ; X86-NEXT: calll tanhf128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $24, %esp +; X86-NEXT: addl $52, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Tanh: @@ -2376,17 +2832,20 @@ define fp128 @Test128Tanh(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $16, %esp +; WIN-X86-NEXT: subl $48, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi -; WIN-X86-NEXT: movl %esp, %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %eax +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; WIN-X86-NEXT: movl %eax, (%esp) ; WIN-X86-NEXT: calll _tanhl -; WIN-X86-NEXT: addl $20, %esp -; WIN-X86-NEXT: movl (%esp), %eax +; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %edi @@ -2425,27 +2884,34 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind { ; ; X86-LABEL: Test128Modf: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $40, %esp +; X86-NEXT: subl $80, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl $8, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; 
X86-NEXT: leal {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: leal {{[0-9]+}}(%esp), %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl {{[0-9]+}}(%esp) -; X86-NEXT: pushl %ecx +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: calll modff128 -; X86-NEXT: addl $28, %esp -; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: subl $4, %esp +; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm0 ; X86-NEXT: movaps {{[0-9]+}}(%esp), %xmm1 ; X86-NEXT: movaps %xmm1, 16(%esi) ; X86-NEXT: movaps %xmm0, (%esi) ; X86-NEXT: movl %esi, %eax -; X86-NEXT: addl $40, %esp +; X86-NEXT: addl $80, %esp ; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx ; X86-NEXT: retl $4 ; ; WIN-LABEL: Test128Modf: @@ -2468,18 +2934,21 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind { ; WIN-X86-NEXT: pushl %edi ; WIN-X86-NEXT: pushl %esi ; WIN-X86-NEXT: andl $-16, %esp -; WIN-X86-NEXT: subl $64, %esp +; WIN-X86-NEXT: subl $112, %esp ; WIN-X86-NEXT: movl 8(%ebp), %esi +; WIN-X86-NEXT: movl 24(%ebp), %eax +; WIN-X86-NEXT: movl 28(%ebp), %ecx +; WIN-X86-NEXT: movl 32(%ebp), %edx +; WIN-X86-NEXT: movl 36(%ebp), %edi +; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ebx +; WIN-X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; WIN-X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; WIN-X86-NEXT: leal {{[0-9]+}}(%esp), %ecx -; WIN-X86-NEXT: pushl %eax -; WIN-X86-NEXT: pushl 24(%ebp) -; WIN-X86-NEXT: pushl 20(%ebp) -; WIN-X86-NEXT: pushl 16(%ebp) -; WIN-X86-NEXT: pushl 12(%ebp) -; WIN-X86-NEXT: pushl %ecx +; WIN-X86-NEXT: movl 
%eax, (%esp) ; WIN-X86-NEXT: calll _modfl -; WIN-X86-NEXT: addl $24, %esp ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; WIN-X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; WIN-X86-NEXT: movl {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index e8c8ccfa8d37f..ec1b8a3c8d6d9 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -264,53 +264,62 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-LABEL: var_shift_i128: ; X86-FAST: # %bb.0: ; X86-FAST-NEXT: pushl %ebp +; X86-FAST-NEXT: movl %esp, %ebp ; X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-NEXT: andl $-16, %esp +; X86-FAST-NEXT: subl $16, %esp +; X86-FAST-NEXT: movl 24(%ebp), %edi +; X86-FAST-NEXT: movl 28(%ebp), %edx +; X86-FAST-NEXT: movl 48(%ebp), %esi +; X86-FAST-NEXT: movl 56(%ebp), %ecx ; X86-FAST-NEXT: testb $64, %cl +; X86-FAST-NEXT: movl 52(%ebp), %eax ; X86-FAST-NEXT: jne .LBB6_1 ; X86-FAST-NEXT: # %bb.2: -; X86-FAST-NEXT: movl %ebx, %ebp ; X86-FAST-NEXT: movl %esi, %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl %edi, %eax -; X86-FAST-NEXT: movl %edx, %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-FAST-NEXT: movl %edi, %esi +; X86-FAST-NEXT: movl 32(%ebp), %edi +; X86-FAST-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl %edx, %eax +; X86-FAST-NEXT: movl 36(%ebp), %edx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %esi, %edx -; X86-FAST-NEXT: movl %edi, %esi -; X86-FAST-NEXT: movl %ebx, %edi -; X86-FAST-NEXT: movl %eax, %ebx +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
X86-FAST-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, %eax +; X86-FAST-NEXT: movl (%esp), %esi # 4-byte Reload ; X86-FAST-NEXT: jmp .LBB6_6 ; X86-FAST-NEXT: .LBB6_1: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-FAST-NEXT: movl 44(%ebp), %ebx +; X86-FAST-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-FAST-NEXT: movl 40(%ebp), %ebx ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: jne .LBB6_4 ; X86-FAST-NEXT: .LBB6_5: -; X86-FAST-NEXT: movl %eax, %ebp +; X86-FAST-NEXT: movl (%esp), %ebx # 4-byte Reload +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-FAST-NEXT: .LBB6_6: -; X86-FAST-NEXT: movl %ebx, %eax -; X86-FAST-NEXT: shldl %cl, %ebp, %eax -; X86-FAST-NEXT: movl %edi, %ebp -; X86-FAST-NEXT: shldl %cl, %ebx, %ebp -; X86-FAST-NEXT: movl %esi, %ebx -; X86-FAST-NEXT: shldl %cl, %edi, %ebx +; X86-FAST-NEXT: movl %esi, %edi +; X86-FAST-NEXT: shldl %cl, %ebx, %edi +; X86-FAST-NEXT: movl %eax, %edx +; X86-FAST-NEXT: movl %eax, %ebx +; X86-FAST-NEXT: shldl %cl, %esi, %ebx +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-FAST-NEXT: movl %eax, %esi +; X86-FAST-NEXT: shldl %cl, %edx, %esi ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-FAST-NEXT: shldl %cl, %esi, %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-FAST-NEXT: movl %edx, 12(%ecx) -; X86-FAST-NEXT: movl %ebx, 8(%ecx) -; X86-FAST-NEXT: movl %ebp, 4(%ecx) -; X86-FAST-NEXT: movl %eax, (%ecx) -; X86-FAST-NEXT: movl %ecx, %eax +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-FAST-NEXT: shldl %cl, %eax, %edx +; X86-FAST-NEXT: movl 8(%ebp), %eax +; X86-FAST-NEXT: movl %edx, 12(%eax) +; X86-FAST-NEXT: movl %esi, 8(%eax) +; X86-FAST-NEXT: movl %ebx, 4(%eax) +; X86-FAST-NEXT: movl %edi, (%eax) +; X86-FAST-NEXT: leal -12(%ebp), %esp ; X86-FAST-NEXT: popl %esi ; 
X86-FAST-NEXT: popl %edi ; X86-FAST-NEXT: popl %ebx @@ -320,77 +329,91 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-LABEL: var_shift_i128: ; X86-SLOW: # %bb.0: ; X86-SLOW-NEXT: pushl %ebp +; X86-SLOW-NEXT: movl %esp, %ebp ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: pushl %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: testb $64, %al +; X86-SLOW-NEXT: andl $-16, %esp +; X86-SLOW-NEXT: subl $32, %esp +; X86-SLOW-NEXT: movl 24(%ebp), %esi +; X86-SLOW-NEXT: movl 28(%ebp), %eax +; X86-SLOW-NEXT: movl 48(%ebp), %edx +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: testb $64, %cl +; X86-SLOW-NEXT: movl 52(%ebp), %edi ; X86-SLOW-NEXT: jne .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %ebx, %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %esi, %edx +; X86-SLOW-NEXT: movl 32(%ebp), %esi ; X86-SLOW-NEXT: movl %edi, %ecx -; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: testb $32, %al -; X86-SLOW-NEXT: je .LBB6_5 -; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %edi, %ebx -; X86-SLOW-NEXT: movl %edx, %edi -; X86-SLOW-NEXT: movl %ecx, %edx -; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: movl %eax, %edi +; X86-SLOW-NEXT: movl 36(%ebp), %eax +; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: testb $32, %al +; X86-SLOW-NEXT: movl 40(%ebp), %ecx +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: 
movl 44(%ebp), %ecx +; X86-SLOW-NEXT: .LBB6_3: +; X86-SLOW-NEXT: movl 56(%ebp), %ebx +; X86-SLOW-NEXT: testb $32, %bl ; X86-SLOW-NEXT: jne .LBB6_4 -; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: movl %ecx, %ebp -; X86-SLOW-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: movl %ecx, %ebx +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: .LBB6_4: +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-SLOW-NEXT: .LBB6_6: ; X86-SLOW-NEXT: movl %edx, %esi -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: shrl %ebp -; X86-SLOW-NEXT: movb %al, %ch -; X86-SLOW-NEXT: notb %ch -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: orl %esi, %ebp -; X86-SLOW-NEXT: movl %edi, %esi -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: shrl %edx -; X86-SLOW-NEXT: movb %ch, %cl -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: orl %esi, %edx -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: movb %al, %cl +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: shll %cl, %esi +; X86-SLOW-NEXT: movl %ebx, %edi ; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movl %ecx, %ebx +; X86-SLOW-NEXT: notb %bl +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: orl %esi, %edi -; X86-SLOW-NEXT: movb %al, %cl -; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SLOW-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-SLOW-NEXT: movl %esi, %eax +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: shrl %edx +; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %edx +; X86-SLOW-NEXT: orl %eax, %edx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-SLOW-NEXT: movl %ebx, %eax +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: shrl %esi +; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %ebx -; X86-SLOW-NEXT: movb %ch, %cl +; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-SLOW-NEXT: shrl %cl, %ebx ; X86-SLOW-NEXT: orl %eax, %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SLOW-NEXT: movl 8(%ebp), %eax ; X86-SLOW-NEXT: movl %ebx, 12(%eax) -; X86-SLOW-NEXT: movl %edi, 8(%eax) +; X86-SLOW-NEXT: movl %esi, 8(%eax) ; X86-SLOW-NEXT: movl %edx, 4(%eax) -; X86-SLOW-NEXT: movl %ebp, (%eax) -; X86-SLOW-NEXT: addl $4, %esp +; X86-SLOW-NEXT: movl %edi, (%eax) +; X86-SLOW-NEXT: leal -12(%ebp), %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll index 4340f8fd484ae..544ab7fc77374 100644 --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -258,51 +258,53 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-FAST-LABEL: var_shift_i128: ; X86-FAST: # %bb.0: ; X86-FAST-NEXT: pushl %ebp +; X86-FAST-NEXT: movl %esp, %ebp ; 
X86-FAST-NEXT: pushl %ebx ; X86-FAST-NEXT: pushl %edi ; X86-FAST-NEXT: pushl %esi -; X86-FAST-NEXT: pushl %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-FAST-NEXT: andl $-16, %esp +; X86-FAST-NEXT: subl $16, %esp +; X86-FAST-NEXT: movl 24(%ebp), %esi +; X86-FAST-NEXT: movl 28(%ebp), %eax +; X86-FAST-NEXT: movl 48(%ebp), %edx +; X86-FAST-NEXT: movl 56(%ebp), %ecx ; X86-FAST-NEXT: testb $64, %cl +; X86-FAST-NEXT: movl 52(%ebp), %ebx ; X86-FAST-NEXT: je .LBB6_1 ; X86-FAST-NEXT: # %bb.2: -; X86-FAST-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-FAST-NEXT: movl %edi, %edx -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-FAST-NEXT: movl %esi, %ebp -; X86-FAST-NEXT: movl %ebx, %esi -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-FAST-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl %esi, %edx +; X86-FAST-NEXT: movl 32(%ebp), %esi +; X86-FAST-NEXT: movl %ebx, %edi +; X86-FAST-NEXT: movl %eax, %ebx +; X86-FAST-NEXT: movl 36(%ebp), %eax ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: je .LBB6_4 ; X86-FAST-NEXT: jmp .LBB6_5 ; X86-FAST-NEXT: .LBB6_1: -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-FAST-NEXT: movl %ebp, (%esp) # 4-byte Spill -; X86-FAST-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-FAST-NEXT: movl 40(%ebp), %edi +; X86-FAST-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-FAST-NEXT: movl 44(%ebp), %edi ; X86-FAST-NEXT: testb $32, %cl ; X86-FAST-NEXT: jne .LBB6_5 ; X86-FAST-NEXT: .LBB6_4: -; X86-FAST-NEXT: movl %edi, %ebx -; X86-FAST-NEXT: movl %esi, %edi -; X86-FAST-NEXT: movl %edx, %esi -; X86-FAST-NEXT: movl %ebp, %edx -; X86-FAST-NEXT: movl (%esp), %ebp # 4-byte Reload +; X86-FAST-NEXT: movl %esi, %eax +; X86-FAST-NEXT: movl %ebx, %esi +; X86-FAST-NEXT: movl %edx, 
%ebx +; X86-FAST-NEXT: movl %edi, %edx +; X86-FAST-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-FAST-NEXT: .LBB6_5: -; X86-FAST-NEXT: shrdl %cl, %edx, %ebp -; X86-FAST-NEXT: shrdl %cl, %esi, %edx -; X86-FAST-NEXT: shrdl %cl, %edi, %esi +; X86-FAST-NEXT: shrdl %cl, %edx, %edi +; X86-FAST-NEXT: shrdl %cl, %ebx, %edx +; X86-FAST-NEXT: shrdl %cl, %esi, %ebx ; X86-FAST-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-FAST-NEXT: shrdl %cl, %ebx, %edi -; X86-FAST-NEXT: movl %edi, 12(%eax) -; X86-FAST-NEXT: movl %esi, 8(%eax) +; X86-FAST-NEXT: shrdl %cl, %eax, %esi +; X86-FAST-NEXT: movl 8(%ebp), %eax +; X86-FAST-NEXT: movl %esi, 12(%eax) +; X86-FAST-NEXT: movl %ebx, 8(%eax) ; X86-FAST-NEXT: movl %edx, 4(%eax) -; X86-FAST-NEXT: movl %ebp, (%eax) -; X86-FAST-NEXT: addl $4, %esp +; X86-FAST-NEXT: movl %edi, (%eax) +; X86-FAST-NEXT: leal -12(%ebp), %esp ; X86-FAST-NEXT: popl %esi ; X86-FAST-NEXT: popl %edi ; X86-FAST-NEXT: popl %ebx @@ -312,78 +314,88 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-LABEL: var_shift_i128: ; X86-SLOW: # %bb.0: ; X86-SLOW-NEXT: pushl %ebp +; X86-SLOW-NEXT: movl %esp, %ebp ; X86-SLOW-NEXT: pushl %ebx ; X86-SLOW-NEXT: pushl %edi ; X86-SLOW-NEXT: pushl %esi -; X86-SLOW-NEXT: subl $8, %esp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SLOW-NEXT: testb $64, %cl +; X86-SLOW-NEXT: andl $-16, %esp +; X86-SLOW-NEXT: subl $16, %esp +; X86-SLOW-NEXT: movl 24(%ebp), %edx +; X86-SLOW-NEXT: movl 28(%ebp), %esi +; X86-SLOW-NEXT: movl 48(%ebp), %ebx +; X86-SLOW-NEXT: movl 56(%ebp), %eax +; X86-SLOW-NEXT: testb $64, %al +; X86-SLOW-NEXT: movl 52(%ebp), %edi ; X86-SLOW-NEXT: je .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %ebp, %eax -; X86-SLOW-NEXT: movl %ebx, %ebp -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ebx 
-; X86-SLOW-NEXT: movl %edi, %edx +; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %ebx +; X86-SLOW-NEXT: movl 32(%ebp), %edx +; X86-SLOW-NEXT: movl %edi, %eax ; X86-SLOW-NEXT: movl %esi, %edi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SLOW-NEXT: testb $32, %cl -; X86-SLOW-NEXT: jne .LBB6_5 -; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %ebx, %esi -; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebp, %edi -; X86-SLOW-NEXT: movl %edx, %ebp -; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: movl 36(%ebp), %esi +; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-SLOW-NEXT: movl 40(%ebp), %eax +; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl 44(%ebp), %eax +; X86-SLOW-NEXT: .LBB6_3: +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: je .LBB6_4 -; X86-SLOW-NEXT: .LBB6_5: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: # %bb.5: +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: jmp .LBB6_6 +; X86-SLOW-NEXT: .LBB6_4: +; X86-SLOW-NEXT: movl %edx, %esi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, %ebx +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: movl %ecx, %ebx -; X86-SLOW-NEXT: notb %bl -; X86-SLOW-NEXT: leal (%ebp,%ebp), %eax -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: orl %edx, %eax -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: shrl %cl, %eax +; X86-SLOW-NEXT: movl %eax, %edx +; X86-SLOW-NEXT: movl %ecx, %eax +; X86-SLOW-NEXT: 
notb %al +; X86-SLOW-NEXT: movl %ebx, %edi +; X86-SLOW-NEXT: addl %ebx, %ebx +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %ebx +; X86-SLOW-NEXT: orl %edx, %ebx +; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %ebp -; X86-SLOW-NEXT: leal (%edi,%edi), %edx -; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-SLOW-NEXT: leal (%ebx,%ebx), %edx +; X86-SLOW-NEXT: movl %eax, %ecx ; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: orl %ebp, %edx -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: orl %edi, %edx +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl (%esp), %edi # 4-byte Reload -; X86-SLOW-NEXT: leal (%edi,%edi), %ebp -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shll %cl, %ebp -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: leal (%edi,%edi), %ebx +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: shll %cl, %ebx +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: shrl %cl, %edi ; X86-SLOW-NEXT: addl %esi, %esi -; X86-SLOW-NEXT: movl %ebx, %ecx +; X86-SLOW-NEXT: movl %eax, %ecx ; X86-SLOW-NEXT: shll %cl, %esi ; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SLOW-NEXT: movl 8(%ebp), %ecx ; X86-SLOW-NEXT: movl %esi, 12(%ecx) -; 
X86-SLOW-NEXT: movl %ebp, 8(%ecx) +; X86-SLOW-NEXT: movl %ebx, 8(%ecx) ; X86-SLOW-NEXT: movl %edx, 4(%ecx) +; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload ; X86-SLOW-NEXT: movl %eax, (%ecx) ; X86-SLOW-NEXT: movl %ecx, %eax -; X86-SLOW-NEXT: addl $8, %esp +; X86-SLOW-NEXT: leal -12(%ebp), %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi ; X86-SLOW-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index a464d78f9af38..df97f49440f74 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -74,43 +74,57 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SSE2-LABEL: fshl_i128: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: pushl %ebx ; X86-SSE2-NEXT: pushl %edi ; X86-SSE2-NEXT: pushl %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 48(%ebp), %edi +; X86-SSE2-NEXT: movl 52(%ebp), %eax +; X86-SSE2-NEXT: movl 24(%ebp), %edx +; X86-SSE2-NEXT: movl 56(%ebp), %ecx ; X86-SSE2-NEXT: testb $64, %cl -; X86-SSE2-NEXT: movl %esi, %eax -; X86-SSE2-NEXT: cmovnel %ebx, %eax -; X86-SSE2-NEXT: movl %edx, %ebp -; X86-SSE2-NEXT: cmovnel %edi, %ebp -; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %edi -; X86-SSE2-NEXT: cmovnel {{[0-9]+}}(%esp), %ebx -; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %edx -; X86-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %esi +; X86-SSE2-NEXT: movl %edx, %ecx +; X86-SSE2-NEXT: cmovnel %edi, %ecx +; X86-SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 28(%ebp), %esi +; X86-SSE2-NEXT: movl %esi, %ebx +; X86-SSE2-NEXT: cmovnel %eax, %ebx +; X86-SSE2-NEXT: cmovnel 44(%ebp), %eax +; 
X86-SSE2-NEXT: cmovnel 40(%ebp), %edi +; X86-SSE2-NEXT: cmovel 36(%ebp), %esi +; X86-SSE2-NEXT: cmovel 32(%ebp), %edx +; X86-SSE2-NEXT: movl 56(%ebp), %ecx ; X86-SSE2-NEXT: testb $32, %cl -; X86-SSE2-NEXT: cmovnel %esi, %edx -; X86-SSE2-NEXT: cmovnel %ebp, %esi -; X86-SSE2-NEXT: cmovnel %eax, %ebp -; X86-SSE2-NEXT: cmovel %edi, %ebx +; X86-SSE2-NEXT: cmovnel %edx, %esi +; X86-SSE2-NEXT: cmovnel %ebx, %edx +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: cmovnel %ecx, %ebx ; X86-SSE2-NEXT: cmovel %eax, %edi -; X86-SSE2-NEXT: movl %edi, %eax -; X86-SSE2-NEXT: shldl %cl, %ebx, %eax -; X86-SSE2-NEXT: movl %ebp, %ebx -; X86-SSE2-NEXT: shldl %cl, %edi, %ebx -; X86-SSE2-NEXT: movl %esi, %edi -; X86-SSE2-NEXT: shldl %cl, %ebp, %edi +; X86-SSE2-NEXT: cmovel %ecx, %eax +; X86-SSE2-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl 56(%ebp), %ecx ; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SSE2-NEXT: shldl %cl, %esi, %edx -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: movl %edx, 12(%ecx) -; X86-SSE2-NEXT: movl %edi, 8(%ecx) -; X86-SSE2-NEXT: movl %ebx, 4(%ecx) -; X86-SSE2-NEXT: movl %eax, (%ecx) -; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; X86-SSE2-NEXT: movl %ebx, %edi +; X86-SSE2-NEXT: movl 56(%ebp), %ecx +; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SSE2-NEXT: shldl %cl, %eax, %edi +; X86-SSE2-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SSE2-NEXT: movl %edx, %edi +; X86-SSE2-NEXT: movl 56(%ebp), %ecx +; X86-SSE2-NEXT: shldl %cl, %ebx, %edi +; X86-SSE2-NEXT: movl 8(%ebp), %eax +; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SSE2-NEXT: shldl %cl, %edx, %esi +; X86-SSE2-NEXT: movl %esi, 12(%eax) +; X86-SSE2-NEXT: movl %edi, 8(%eax) +; X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, 4(%eax) +; 
X86-SSE2-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SSE2-NEXT: movl %ecx, (%eax) +; X86-SSE2-NEXT: leal -12(%ebp), %esp ; X86-SSE2-NEXT: popl %esi ; X86-SSE2-NEXT: popl %edi ; X86-SSE2-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll index 2849e448a0534..b4546c1e983c4 100644 --- a/llvm/test/CodeGen/X86/i128-add.ll +++ b/llvm/test/CodeGen/X86/i128-add.ll @@ -5,17 +5,20 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind { ; X86-LABEL: add_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: addl 40(%ebp), %esi +; X86-NEXT: adcl 44(%ebp), %edi +; X86-NEXT: adcl 48(%ebp), %ecx +; X86-NEXT: adcl 52(%ebp), %edx ; X86-NEXT: addl $1, %esi ; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %ecx @@ -24,8 +27,10 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind { ; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: add_i128: diff --git a/llvm/test/CodeGen/X86/i128-fp128-abi.ll b/llvm/test/CodeGen/X86/i128-fp128-abi.ll index 4152dcf07f7e7..2174d5056e6ce 100644 --- a/llvm/test/CodeGen/X86/i128-fp128-abi.ll +++ b/llvm/test/CodeGen/X86/i128-fp128-abi.ll @@ -55,41 +55,47 @@ define void @store(PrimTy %x, ptr 
%p) nounwind { ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 12(%esp), %eax -; CHECK-X86-NEXT: movl 16(%esp), %ecx -; CHECK-X86-NEXT: movl 20(%esp), %edx -; CHECK-X86-NEXT: movl 24(%esp), %esi -; CHECK-X86-NEXT: movl 28(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 20(%esp), %ecx +; CHECK-X86-NEXT: movl 24(%esp), %edx +; CHECK-X86-NEXT: movl 28(%esp), %esi +; CHECK-X86-NEXT: movl 32(%esp), %edi ; CHECK-X86-NEXT: movl %esi, 12(%edi) ; CHECK-X86-NEXT: movl %edx, 8(%edi) ; CHECK-X86-NEXT: movl %ecx, 4(%edi) ; CHECK-X86-NEXT: movl %eax, (%edi) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: store: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 12(%esp), %eax -; CHECK-MSVC32-NEXT: movl 16(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 20(%esp), %edx -; CHECK-MSVC32-NEXT: movl 24(%esp), %esi -; CHECK-MSVC32-NEXT: movl 28(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 24(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %esi, 12(%edi) ; CHECK-MSVC32-NEXT: movl %edx, 8(%edi) ; CHECK-MSVC32-NEXT: movl %ecx, 4(%edi) ; CHECK-MSVC32-NEXT: movl %eax, (%edi) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl store PrimTy %x, ptr %p ret void } ; Illustrate stack alignment -; FIXME(#77401): alignment on x86-32 is ABI-incorrect. 
define void @store_perturbed(i8 %_0, PrimTy %x, ptr %p) nounwind { ; CHECK-X64-F128-LABEL: store_perturbed: ; CHECK-X64-F128: # %bb.0: @@ -130,34 +136,41 @@ define void @store_perturbed(i8 %_0, PrimTy %x, ptr %p) nounwind { ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 16(%esp), %eax -; CHECK-X86-NEXT: movl 20(%esp), %ecx -; CHECK-X86-NEXT: movl 24(%esp), %edx -; CHECK-X86-NEXT: movl 28(%esp), %esi -; CHECK-X86-NEXT: movl 32(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 32(%esp), %eax +; CHECK-X86-NEXT: movl 36(%esp), %ecx +; CHECK-X86-NEXT: movl 40(%esp), %edx +; CHECK-X86-NEXT: movl 44(%esp), %esi +; CHECK-X86-NEXT: movl 48(%esp), %edi ; CHECK-X86-NEXT: movl %esi, 12(%edi) ; CHECK-X86-NEXT: movl %edx, 8(%edi) ; CHECK-X86-NEXT: movl %ecx, 4(%edi) ; CHECK-X86-NEXT: movl %eax, (%edi) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: store_perturbed: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 16(%esp), %eax -; CHECK-MSVC32-NEXT: movl 20(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 24(%esp), %edx -; CHECK-MSVC32-NEXT: movl 28(%esp), %esi -; CHECK-MSVC32-NEXT: movl 32(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 24(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 28(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 32(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 36(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 40(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %esi, 12(%edi) ; CHECK-MSVC32-NEXT: movl %edx, 8(%edi) ; CHECK-MSVC32-NEXT: movl %ecx, 4(%edi) ; CHECK-MSVC32-NEXT: movl %eax, (%edi) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl store PrimTy %x, ptr %p ret void 
@@ -271,34 +284,41 @@ define PrimTy @first_arg(PrimTy %x) nounwind { ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 12(%esp), %eax -; CHECK-X86-NEXT: movl 16(%esp), %ecx -; CHECK-X86-NEXT: movl 20(%esp), %edx -; CHECK-X86-NEXT: movl 24(%esp), %esi -; CHECK-X86-NEXT: movl 28(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 32(%esp), %ecx +; CHECK-X86-NEXT: movl 36(%esp), %edx +; CHECK-X86-NEXT: movl 40(%esp), %esi +; CHECK-X86-NEXT: movl 44(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: first_arg: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 12(%esp), %eax -; CHECK-MSVC32-NEXT: movl 16(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 20(%esp), %edx -; CHECK-MSVC32-NEXT: movl 24(%esp), %esi -; CHECK-MSVC32-NEXT: movl 28(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 24(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 28(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 32(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 36(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } @@ -344,34 +364,41 @@ define PrimTy @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, PrimTy %x) nounw ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl 
%esi -; CHECK-X86-NEXT: movl 12(%esp), %eax -; CHECK-X86-NEXT: movl 48(%esp), %ecx -; CHECK-X86-NEXT: movl 52(%esp), %edx -; CHECK-X86-NEXT: movl 56(%esp), %esi -; CHECK-X86-NEXT: movl 60(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 64(%esp), %ecx +; CHECK-X86-NEXT: movl 68(%esp), %edx +; CHECK-X86-NEXT: movl 72(%esp), %esi +; CHECK-X86-NEXT: movl 76(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: leading_args: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 12(%esp), %eax -; CHECK-MSVC32-NEXT: movl 48(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 52(%esp), %edx -; CHECK-MSVC32-NEXT: movl 56(%esp), %esi -; CHECK-MSVC32-NEXT: movl 60(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 56(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 60(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 64(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 68(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } @@ -417,34 +444,41 @@ define PrimTy @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, Pr ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 12(%esp), %eax -; CHECK-X86-NEXT: movl 72(%esp), %ecx -; CHECK-X86-NEXT: movl 76(%esp), %edx -; CHECK-X86-NEXT: 
movl 80(%esp), %esi -; CHECK-X86-NEXT: movl 84(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; CHECK-X86-NEXT: movl 80(%esp), %ecx +; CHECK-X86-NEXT: movl 84(%esp), %edx +; CHECK-X86-NEXT: movl 88(%esp), %esi +; CHECK-X86-NEXT: movl 92(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: many_leading_args: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 12(%esp), %eax -; CHECK-MSVC32-NEXT: movl 72(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 76(%esp), %edx -; CHECK-MSVC32-NEXT: movl 80(%esp), %esi -; CHECK-MSVC32-NEXT: movl 84(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 72(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 76(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 80(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 84(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } @@ -488,34 +522,41 @@ define PrimTy @trailing_arg(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: pushl %edi ; CHECK-X86-NEXT: pushl %esi -; CHECK-X86-NEXT: movl 12(%esp), %eax -; CHECK-X86-NEXT: movl 56(%esp), %ecx -; CHECK-X86-NEXT: movl 60(%esp), %edx -; CHECK-X86-NEXT: movl 64(%esp), %esi -; CHECK-X86-NEXT: movl 68(%esp), %edi +; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: movl 16(%esp), %eax +; 
CHECK-X86-NEXT: movl 64(%esp), %ecx +; CHECK-X86-NEXT: movl 68(%esp), %edx +; CHECK-X86-NEXT: movl 72(%esp), %esi +; CHECK-X86-NEXT: movl 76(%esp), %edi ; CHECK-X86-NEXT: movl %edi, 12(%eax) ; CHECK-X86-NEXT: movl %esi, 8(%eax) ; CHECK-X86-NEXT: movl %edx, 4(%eax) ; CHECK-X86-NEXT: movl %ecx, (%eax) +; CHECK-X86-NEXT: addl $4, %esp ; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: popl %edi ; CHECK-X86-NEXT: retl $4 ; ; CHECK-MSVC32-LABEL: trailing_arg: ; CHECK-MSVC32: # %bb.0: +; CHECK-MSVC32-NEXT: pushl %ebp +; CHECK-MSVC32-NEXT: movl %esp, %ebp ; CHECK-MSVC32-NEXT: pushl %edi ; CHECK-MSVC32-NEXT: pushl %esi -; CHECK-MSVC32-NEXT: movl 12(%esp), %eax -; CHECK-MSVC32-NEXT: movl 56(%esp), %ecx -; CHECK-MSVC32-NEXT: movl 60(%esp), %edx -; CHECK-MSVC32-NEXT: movl 64(%esp), %esi -; CHECK-MSVC32-NEXT: movl 68(%esp), %edi +; CHECK-MSVC32-NEXT: andl $-16, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 56(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 60(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 64(%ebp), %esi +; CHECK-MSVC32-NEXT: movl 68(%ebp), %edi ; CHECK-MSVC32-NEXT: movl %edi, 12(%eax) ; CHECK-MSVC32-NEXT: movl %esi, 8(%eax) ; CHECK-MSVC32-NEXT: movl %edx, 4(%eax) ; CHECK-MSVC32-NEXT: movl %ecx, (%eax) +; CHECK-MSVC32-NEXT: leal -8(%ebp), %esp ; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %edi +; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl ret PrimTy %x } @@ -571,32 +612,43 @@ define void @call_first_arg(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_first_arg: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal 12(%esp), %eax -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $56, %esp +; CHECK-X86-NEXT: movl 64(%esp), %eax +; CHECK-X86-NEXT: movl 68(%esp), %ecx +; CHECK-X86-NEXT: movl 72(%esp), %edx +; CHECK-X86-NEXT: movl 76(%esp), %esi 
+; CHECK-X86-NEXT: movl %esi, 28(%esp) +; CHECK-X86-NEXT: movl %edx, 24(%esp) +; CHECK-X86-NEXT: movl %ecx, 20(%esp) +; CHECK-X86-NEXT: movl %eax, 16(%esp) +; CHECK-X86-NEXT: leal 32(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) ; CHECK-X86-NEXT: calll first_arg@PLT -; CHECK-X86-NEXT: addl $56, %esp +; CHECK-X86-NEXT: addl $52, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_first_arg: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $64, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 28(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 24(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 20(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 16(%esp) +; CHECK-MSVC32-NEXT: leal 32(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) ; CHECK-MSVC32-NEXT: calll _first_arg -; CHECK-MSVC32-NEXT: addl $20, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @first_arg(PrimTy %x) @@ -686,48 +738,59 @@ define void @call_leading_args(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_leading_args: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal 12(%esp), %eax -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; 
CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $88, %esp +; CHECK-X86-NEXT: movl 96(%esp), %eax +; CHECK-X86-NEXT: movl 100(%esp), %ecx +; CHECK-X86-NEXT: movl 104(%esp), %edx +; CHECK-X86-NEXT: movl 108(%esp), %esi +; CHECK-X86-NEXT: movl %esi, 60(%esp) +; CHECK-X86-NEXT: movl %edx, 56(%esp) +; CHECK-X86-NEXT: movl %ecx, 52(%esp) +; CHECK-X86-NEXT: movl %eax, 48(%esp) +; CHECK-X86-NEXT: leal 64(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) +; CHECK-X86-NEXT: movl $0, 32(%esp) +; CHECK-X86-NEXT: movl $0, 28(%esp) +; CHECK-X86-NEXT: movl $0, 24(%esp) +; CHECK-X86-NEXT: movl $0, 20(%esp) +; CHECK-X86-NEXT: movl $0, 16(%esp) +; CHECK-X86-NEXT: movl $0, 12(%esp) +; CHECK-X86-NEXT: movl $0, 8(%esp) +; CHECK-X86-NEXT: movl $0, 4(%esp) ; CHECK-X86-NEXT: calll leading_args@PLT -; CHECK-X86-NEXT: addl $88, %esp +; CHECK-X86-NEXT: addl $84, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_leading_args: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $96, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: 
movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 60(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 56(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 52(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 48(%esp) +; CHECK-MSVC32-NEXT: leal 64(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) +; CHECK-MSVC32-NEXT: movl $0, 32(%esp) +; CHECK-MSVC32-NEXT: movl $0, 28(%esp) +; CHECK-MSVC32-NEXT: movl $0, 24(%esp) +; CHECK-MSVC32-NEXT: movl $0, 20(%esp) +; CHECK-MSVC32-NEXT: movl $0, 16(%esp) +; CHECK-MSVC32-NEXT: movl $0, 12(%esp) +; CHECK-MSVC32-NEXT: movl $0, 8(%esp) +; CHECK-MSVC32-NEXT: movl $0, 4(%esp) ; CHECK-MSVC32-NEXT: calll _leading_args -; CHECK-MSVC32-NEXT: addl $52, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy %x) @@ -836,56 +899,67 @@ define void @call_many_leading_args(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_many_leading_args: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal 12(%esp), %eax -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $104, %esp +; CHECK-X86-NEXT: movl 112(%esp), %eax +; CHECK-X86-NEXT: movl 116(%esp), %ecx +; CHECK-X86-NEXT: movl 120(%esp), %edx +; CHECK-X86-NEXT: movl 124(%esp), %esi +; CHECK-X86-NEXT: movl %esi, 76(%esp) +; CHECK-X86-NEXT: movl %edx, 72(%esp) +; CHECK-X86-NEXT: movl %ecx, 68(%esp) +; CHECK-X86-NEXT: movl %eax, 
64(%esp) +; CHECK-X86-NEXT: leal 80(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) +; CHECK-X86-NEXT: movl $0, 60(%esp) +; CHECK-X86-NEXT: movl $0, 56(%esp) +; CHECK-X86-NEXT: movl $0, 52(%esp) +; CHECK-X86-NEXT: movl $0, 48(%esp) +; CHECK-X86-NEXT: movl $0, 32(%esp) +; CHECK-X86-NEXT: movl $0, 28(%esp) +; CHECK-X86-NEXT: movl $0, 24(%esp) +; CHECK-X86-NEXT: movl $0, 20(%esp) +; CHECK-X86-NEXT: movl $0, 16(%esp) +; CHECK-X86-NEXT: movl $0, 12(%esp) +; CHECK-X86-NEXT: movl $0, 8(%esp) +; CHECK-X86-NEXT: movl $0, 4(%esp) ; CHECK-X86-NEXT: calll many_leading_args@PLT -; CHECK-X86-NEXT: addl $104, %esp +; CHECK-X86-NEXT: addl $100, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_many_leading_args: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $112, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 76(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 72(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 68(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 64(%esp) +; CHECK-MSVC32-NEXT: leal 80(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) +; 
CHECK-MSVC32-NEXT: movl $0, 60(%esp) +; CHECK-MSVC32-NEXT: movl $0, 56(%esp) +; CHECK-MSVC32-NEXT: movl $0, 52(%esp) +; CHECK-MSVC32-NEXT: movl $0, 48(%esp) +; CHECK-MSVC32-NEXT: movl $0, 32(%esp) +; CHECK-MSVC32-NEXT: movl $0, 28(%esp) +; CHECK-MSVC32-NEXT: movl $0, 24(%esp) +; CHECK-MSVC32-NEXT: movl $0, 20(%esp) +; CHECK-MSVC32-NEXT: movl $0, 16(%esp) +; CHECK-MSVC32-NEXT: movl $0, 12(%esp) +; CHECK-MSVC32-NEXT: movl $0, 8(%esp) +; CHECK-MSVC32-NEXT: movl $0, 4(%esp) ; CHECK-MSVC32-NEXT: calll _many_leading_args -; CHECK-MSVC32-NEXT: addl $68, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @many_leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy Prim0, PrimTy %x) @@ -975,48 +1049,59 @@ define void @call_trailing_arg(PrimTy %x) nounwind { ; ; CHECK-X86-LABEL: call_trailing_arg: ; CHECK-X86: # %bb.0: -; CHECK-X86-NEXT: subl $40, %esp -; CHECK-X86-NEXT: leal 12(%esp), %eax -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl 56(%esp) -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl $0 -; CHECK-X86-NEXT: pushl %eax +; CHECK-X86-NEXT: pushl %esi +; CHECK-X86-NEXT: subl $88, %esp +; CHECK-X86-NEXT: movl 96(%esp), %eax +; CHECK-X86-NEXT: movl 100(%esp), %ecx +; CHECK-X86-NEXT: movl 104(%esp), %edx +; CHECK-X86-NEXT: movl 108(%esp), %esi +; CHECK-X86-NEXT: movl %esi, 60(%esp) +; CHECK-X86-NEXT: movl %edx, 56(%esp) +; CHECK-X86-NEXT: movl %ecx, 52(%esp) +; CHECK-X86-NEXT: movl %eax, 48(%esp) +; CHECK-X86-NEXT: leal 64(%esp), %eax +; CHECK-X86-NEXT: movl %eax, (%esp) +; CHECK-X86-NEXT: movl $0, 32(%esp) +; CHECK-X86-NEXT: movl $0, 28(%esp) +; CHECK-X86-NEXT: movl $0, 24(%esp) +; CHECK-X86-NEXT: 
movl $0, 20(%esp) +; CHECK-X86-NEXT: movl $0, 16(%esp) +; CHECK-X86-NEXT: movl $0, 12(%esp) +; CHECK-X86-NEXT: movl $0, 8(%esp) +; CHECK-X86-NEXT: movl $0, 4(%esp) ; CHECK-X86-NEXT: calll trailing_arg@PLT -; CHECK-X86-NEXT: addl $88, %esp +; CHECK-X86-NEXT: addl $84, %esp +; CHECK-X86-NEXT: popl %esi ; CHECK-X86-NEXT: retl ; ; CHECK-MSVC32-LABEL: call_trailing_arg: ; CHECK-MSVC32: # %bb.0: ; CHECK-MSVC32-NEXT: pushl %ebp ; CHECK-MSVC32-NEXT: movl %esp, %ebp +; CHECK-MSVC32-NEXT: pushl %esi ; CHECK-MSVC32-NEXT: andl $-16, %esp -; CHECK-MSVC32-NEXT: subl $32, %esp -; CHECK-MSVC32-NEXT: movl %esp, %eax -; CHECK-MSVC32-NEXT: pushl 20(%ebp) -; CHECK-MSVC32-NEXT: pushl 16(%ebp) -; CHECK-MSVC32-NEXT: pushl 12(%ebp) -; CHECK-MSVC32-NEXT: pushl 8(%ebp) -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl $0 -; CHECK-MSVC32-NEXT: pushl %eax +; CHECK-MSVC32-NEXT: subl $96, %esp +; CHECK-MSVC32-NEXT: movl 8(%ebp), %eax +; CHECK-MSVC32-NEXT: movl 12(%ebp), %ecx +; CHECK-MSVC32-NEXT: movl 16(%ebp), %edx +; CHECK-MSVC32-NEXT: movl 20(%ebp), %esi +; CHECK-MSVC32-NEXT: movl %esi, 60(%esp) +; CHECK-MSVC32-NEXT: movl %edx, 56(%esp) +; CHECK-MSVC32-NEXT: movl %ecx, 52(%esp) +; CHECK-MSVC32-NEXT: movl %eax, 48(%esp) +; CHECK-MSVC32-NEXT: leal 64(%esp), %eax +; CHECK-MSVC32-NEXT: movl %eax, (%esp) +; CHECK-MSVC32-NEXT: movl $0, 32(%esp) +; CHECK-MSVC32-NEXT: movl $0, 28(%esp) +; CHECK-MSVC32-NEXT: movl $0, 24(%esp) +; CHECK-MSVC32-NEXT: movl $0, 20(%esp) +; CHECK-MSVC32-NEXT: movl $0, 16(%esp) +; CHECK-MSVC32-NEXT: movl $0, 12(%esp) +; CHECK-MSVC32-NEXT: movl $0, 8(%esp) +; CHECK-MSVC32-NEXT: movl $0, 4(%esp) ; CHECK-MSVC32-NEXT: calll _trailing_arg -; CHECK-MSVC32-NEXT: addl $52, %esp -; CHECK-MSVC32-NEXT: movl %ebp, %esp +; CHECK-MSVC32-NEXT: leal -4(%ebp), %esp +; CHECK-MSVC32-NEXT: popl %esi ; 
CHECK-MSVC32-NEXT: popl %ebp ; CHECK-MSVC32-NEXT: retl call PrimTy @trailing_arg(i64 0, i64 0, i64 0, i64 0, PrimTy %x) diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll index 717f52f198ee8..7d5757392c982 100644 --- a/llvm/test/CodeGen/X86/i128-sdiv.ll +++ b/llvm/test/CodeGen/X86/i128-sdiv.ll @@ -8,18 +8,21 @@ define i128 @test1(i128 %x) nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: movl %ecx, %esi ; X86-NEXT: sarl $31, %esi ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl $30, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 24(%ebp), %edi ; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %edx +; X86-NEXT: adcl 28(%ebp), %esi +; X86-NEXT: adcl 32(%ebp), %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: shrdl $2, %ecx, %edx ; X86-NEXT: movl %ecx, %esi @@ -29,8 +32,10 @@ define i128 @test1(i128 %x) nounwind { ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl %esi, 4(%eax) ; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test1: @@ -52,38 +57,44 @@ define i128 @test1(i128 %x) nounwind { define i128 @test2(i128 %x) nounwind { ; X86-LABEL: test2: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: sarl 
$31, %edx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: shrl $30, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: addl %esi, %edi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %esi -; X86-NEXT: adcl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: adcl $0, %edx -; X86-NEXT: shrdl $2, %edx, %ecx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: addl %edx, %esi +; X86-NEXT: adcl 28(%ebp), %edx +; X86-NEXT: adcl 32(%ebp), %ecx +; X86-NEXT: adcl $0, %eax +; X86-NEXT: shrdl $2, %eax, %ecx +; X86-NEXT: movl %eax, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: sarl $2, %edx -; X86-NEXT: xorl %edi, %edi +; X86-NEXT: sarl $2, %eax +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: negl %ecx +; X86-NEXT: movl $0, %edi +; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl $0, %edx +; X86-NEXT: sbbl %esi, %ebx ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: sbbl %esi, %edi +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test2: diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll index 3f890b7f2443a..9011832421326 100644 --- a/llvm/test/CodeGen/X86/i128-udiv.ll +++ b/llvm/test/CodeGen/X86/i128-udiv.ll @@ -8,15 +8,21 @@ define i128 @test1(i128 %x) nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %edx ; X86-NEXT: shrdl $2, %edx, %ecx ; 
X86-NEXT: shrl $2, %edx ; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: movl $0, 12(%eax) ; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll index 55c318e87a5a0..bdceeefbcfaba 100644 --- a/llvm/test/CodeGen/X86/iabs.ll +++ b/llvm/test/CodeGen/X86/iabs.ll @@ -123,31 +123,34 @@ define i64 @test_i64(i64 %a) nounwind { define i128 @test_i128(i128 %a) nounwind { ; X86-LABEL: test_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: xorl %edx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %edx, %ebx -; X86-NEXT: subl %edx, %ebx -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: sbbl %edx, %esi -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl %ebx, (%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: xorl %eax, %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: xorl %eax, %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: xorl %eax, %esi +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: xorl %eax, %edi +; X86-NEXT: subl %eax, %edi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; 
X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: test_i128: diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll index c52b3ed6c926d..4a6c1d0ae5deb 100644 --- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll +++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll @@ -10,33 +10,39 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind { ; X86-LABEL: opt_setcc_lt_power_of_2: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %edi +; X86-NEXT: movl 24(%ebp), %esi ; X86-NEXT: .p2align 4 ; X86-NEXT: .LBB0_1: # %loop ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: addl $1, %edi -; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl $1, %esi +; X86-NEXT: adcl $0, %edi ; X86-NEXT: adcl $0, %edx ; X86-NEXT: adcl $0, %ecx ; X86-NEXT: movl %edx, %ebx ; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: orl %edx, %ebp -; X86-NEXT: orl %ecx, %ebp -; X86-NEXT: shrdl $28, %ebx, %ebp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: shrdl $28, %ebx, %esi +; X86-NEXT: movl %eax, %esi ; X86-NEXT: jne .LBB0_1 ; X86-NEXT: # %bb.2: # %exit -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -73,15 +79,21 @@ exit: define i1 @opt_setcc_srl_eq_zero(i128 %a) 
nounwind { ; X86-LABEL: opt_setcc_srl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: orl 20(%ebp), %ecx +; X86-NEXT: movl 8(%ebp), %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: shldl $15, %edx, %ecx ; X86-NEXT: sete %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_eq_zero: @@ -98,15 +110,21 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind { define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_srl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: orl 20(%ebp), %ecx +; X86-NEXT: movl 8(%ebp), %edx ; X86-NEXT: orl %eax, %edx ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: shldl $15, %edx, %ecx ; X86-NEXT: setne %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_srl_ne_zero: @@ -123,13 +141,19 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 20(%ebp), %ecx ; X86-NEXT: shll $17, %ecx 
-; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl 8(%ebp), %eax +; X86-NEXT: orl 12(%ebp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: sete %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero: @@ -146,13 +170,19 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_ne_zero: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl 20(%ebp), %ecx ; X86-NEXT: shll $17, %ecx -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl 8(%ebp), %eax +; X86-NEXT: orl 12(%ebp), %eax ; X86-NEXT: orl %ecx, %eax ; X86-NEXT: setne %al +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_ne_zero: @@ -170,13 +200,17 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind { define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind { ; X86-LABEL: opt_setcc_shl_eq_zero_multiple_shl_users: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl 20(%ebp), %esi ; X86-NEXT: shldl $17, %edx, %esi ; X86-NEXT: shldl $17, %ecx, %edx ; X86-NEXT: shldl $17, %eax, %ecx @@ -194,9 +228,11 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind { ; X86-NEXT: calll use@PLT ; X86-NEXT: addl $16, 
%esp ; X86-NEXT: movl %ebx, %eax +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: opt_setcc_shl_eq_zero_multiple_shl_users: diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll index fc1cc1f65627a..e10e48f9aea08 100644 --- a/llvm/test/CodeGen/X86/mul128.ll +++ b/llvm/test/CodeGen/X86/mul128.ll @@ -18,85 +18,80 @@ define i128 @foo(i128 %t, i128 %u) { ; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp ; X86-NEXT: pushl %ebx -; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: pushl %edi -; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi -; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $8, %esp -; X86-NEXT: .cfi_def_cfa_offset 28 +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 -; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: imull %ecx, %ebp -; X86-NEXT: movl %eax, %esi +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl 40(%ebp), %edi +; X86-NEXT: movl 44(%ebp), %esi +; X86-NEXT: imull %ecx, %esi +; X86-NEXT: movl %edi, %eax ; X86-NEXT: mull %ecx -; X86-NEXT: movl %eax, %edi +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: addl %ebp, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: imull %esi, %eax +; X86-NEXT: addl %esi, %ebx +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: imull %edi, %eax ; X86-NEXT: addl %eax, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: imull %ecx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: mull %ebp -; X86-NEXT: 
addl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: imull %ebp, %esi +; X86-NEXT: movl 48(%ebp), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: imull 28(%ebp), %ecx +; X86-NEXT: movl 24(%ebp), %edi +; X86-NEXT: mull %edi +; X86-NEXT: addl %ecx, %edx +; X86-NEXT: movl 52(%ebp), %esi +; X86-NEXT: imull %edi, %esi ; X86-NEXT: addl %edx, %esi -; X86-NEXT: addl %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload +; X86-NEXT: movl %eax, (%esp) # 4-byte Spill ; X86-NEXT: adcl %ebx, %esi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: mull %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %ebx -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: mull %edi +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: mull %ecx ; X86-NEXT: movl %edx, %edi ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addl %ebx, %ecx ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: addl %ecx, %ebx -; X86-NEXT: adcl %edi, %ebp +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: mull 44(%ebp) +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: addl %ecx, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: adcl %edi, %ebx ; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: mull 44(%ebp) +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: movzbl %cl, %ecx ; X86-NEXT: adcl %ecx, %edx -; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload ; X86-NEXT: adcl %esi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, 
4(%ecx) -; X86-NEXT: movl (%esp), %esi # 4-byte Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, 4(%ecx) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: movl %esi, (%ecx) ; X86-NEXT: movl %eax, 8(%ecx) ; X86-NEXT: movl %edx, 12(%ecx) ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: addl $8, %esp -; X86-NEXT: .cfi_def_cfa_offset 20 +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi -; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi -; X86-NEXT: .cfi_def_cfa_offset 12 ; X86-NEXT: popl %ebx -; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: popl %ebp -; X86-NEXT: .cfi_def_cfa_offset 4 +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl $4 %k = mul i128 %t, %u ret i128 %k diff --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll index 961205c50d976..724b2dc4c431a 100644 --- a/llvm/test/CodeGen/X86/neg-abs.ll +++ b/llvm/test/CodeGen/X86/neg-abs.ll @@ -105,31 +105,35 @@ define i128 @neg_abs_i128(i128 %x) nounwind { ; X86-LABEL: neg_abs_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edx, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sarl $31, %ecx +; X86-NEXT: xorl %ecx, %eax +; X86-NEXT: movl 32(%ebp), %edx ; X86-NEXT: xorl %ecx, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 28(%ebp), %esi ; X86-NEXT: xorl %ecx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 24(%ebp), %edi ; X86-NEXT: xorl %ecx, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: xorl %ecx, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: subl %ebx, %ebp ; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: subl %edi, %ebx ; X86-NEXT: movl 
%ecx, %edi ; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: sbbl %edx, %ecx -; X86-NEXT: movl %ebp, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -259,37 +263,42 @@ define i64 @sub_abs_i64(i64 %x, i64 %y) nounwind { define i128 @sub_abs_i128(i128 %x, i128 %y) nounwind { ; X86-LABEL: sub_abs_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 36(%ebp), %eax ; X86-NEXT: movl %eax, %edx ; X86-NEXT: sarl $31, %edx ; X86-NEXT: xorl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 32(%ebp), %ecx ; X86-NEXT: xorl %edx, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 28(%ebp), %esi ; X86-NEXT: xorl %edx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 24(%ebp), %edi ; X86-NEXT: xorl %edx, %edi ; X86-NEXT: subl %edx, %edi ; X86-NEXT: sbbl %edx, %esi ; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: sbbl %edx, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl 40(%ebp), %edx ; X86-NEXT: subl %edi, %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl 44(%ebp), %edi ; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl 48(%ebp), %esi ; X86-NEXT: sbbl %ecx, %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl 52(%ebp), %ecx ; X86-NEXT: sbbl %eax, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %esi, 8(%eax) ; X86-NEXT: movl %ecx, 12(%eax) +; 
X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 ; ; X64-LABEL: sub_abs_i128: diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll index 35c7c0e09f394..3004b8b72fcc5 100644 --- a/llvm/test/CodeGen/X86/popcnt.ll +++ b/llvm/test/CodeGen/X86/popcnt.ll @@ -340,84 +340,87 @@ define i64 @cnt64(i64 %x) nounwind readnone { define i128 @cnt128(i128 %x) nounwind readnone { ; X86-NOSSE-LABEL: cnt128: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %edi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NOSSE-NEXT: andl $-16, %esp +; X86-NOSSE-NEXT: movl 24(%ebp), %eax +; X86-NOSSE-NEXT: movl 32(%ebp), %ecx +; X86-NOSSE-NEXT: movl 36(%ebp), %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; 
X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %esi ; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %esi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %esi -; X86-NOSSE-NEXT: addl %edi, %esi -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %edx, %edi -; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %ecx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %ecx, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %ecx ; X86-NOSSE-NEXT: andl $858993459, 
%ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: addl %esi, %ecx ; X86-NOSSE-NEXT: movl %ecx, %edi ; X86-NOSSE-NEXT: shrl $4, %edi ; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: movl 28(%ebp), %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx ; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx ; X86-NOSSE-NEXT: addl %edx, %ecx -; X86-NOSSE-NEXT: addl %esi, %ecx -; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl $4, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %esi, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: addl 
%ecx, %edx +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: movl %edx, (%eax) ; X86-NOSSE-NEXT: movl $0, 12(%eax) ; X86-NOSSE-NEXT: movl $0, 8(%eax) ; X86-NOSSE-NEXT: movl $0, 4(%eax) +; X86-NOSSE-NEXT: leal -8(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl $4 ; ; X64-BASE-LABEL: cnt128: @@ -462,20 +465,26 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; ; X86-POPCNT-LABEL: cnt128: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %ebp +; X86-POPCNT-NEXT: movl %esp, %ebp ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: andl $-16, %esp +; X86-POPCNT-NEXT: subl $16, %esp +; X86-POPCNT-NEXT: movl 8(%ebp), %eax +; X86-POPCNT-NEXT: popcntl 36(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 32(%ebp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: popcntl 28(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 24(%ebp), %esi ; X86-POPCNT-NEXT: addl %ecx, %esi ; X86-POPCNT-NEXT: addl %edx, %esi ; X86-POPCNT-NEXT: movl %esi, (%eax) ; X86-POPCNT-NEXT: movl $0, 12(%eax) ; X86-POPCNT-NEXT: movl $0, 8(%eax) ; X86-POPCNT-NEXT: movl $0, 4(%eax) +; X86-POPCNT-NEXT: leal -4(%ebp), %esp ; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: popl %ebp ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128: @@ -522,7 +531,11 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; ; X86-SSE2-LABEL: cnt128: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: 
psrlw $1, %xmm0 @@ -564,11 +577,17 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X86-SSE2-NEXT: movl $0, 12(%eax) ; X86-SSE2-NEXT: movl $0, 8(%eax) ; X86-SSE2-NEXT: movl $0, 4(%eax) +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl $4 ; ; X86-SSSE3-LABEL: cnt128: ; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: pushl %ebp +; X86-SSSE3-NEXT: movl %esp, %ebp +; X86-SSSE3-NEXT: andl $-16, %esp +; X86-SSSE3-NEXT: subl $16, %esp +; X86-SSSE3-NEXT: movl 8(%ebp), %eax ; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 @@ -600,6 +619,8 @@ define i128 @cnt128(i128 %x) nounwind readnone { ; X86-SSSE3-NEXT: movl $0, 12(%eax) ; X86-SSSE3-NEXT: movl $0, 8(%eax) ; X86-SSSE3-NEXT: movl $0, 4(%eax) +; X86-SSSE3-NEXT: movl %ebp, %esp +; X86-SSSE3-NEXT: popl %ebp ; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt @@ -928,87 +949,92 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-NOSSE-LABEL: cnt128_optsize: ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %ebx ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NOSSE-NEXT: movl %ebx, %ecx -; X86-NOSSE-NEXT: shrl %ecx -; X86-NOSSE-NEXT: movl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %edi, %ecx -; X86-NOSSE-NEXT: subl %ecx, %ebx +; X86-NOSSE-NEXT: andl $-16, %esp +; X86-NOSSE-NEXT: subl $16, %esp +; X86-NOSSE-NEXT: movl 32(%ebp), %edx +; X86-NOSSE-NEXT: movl 36(%ebp), %esi +; X86-NOSSE-NEXT: movl %esi, %eax +; X86-NOSSE-NEXT: shrl %eax +; X86-NOSSE-NEXT: movl $1431655765, %ecx # imm = 
0x55555555 +; X86-NOSSE-NEXT: andl %ecx, %eax +; X86-NOSSE-NEXT: subl %eax, %esi ; X86-NOSSE-NEXT: movl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: movl %ebx, %ebp -; X86-NOSSE-NEXT: andl %ecx, %ebp +; X86-NOSSE-NEXT: movl %esi, %edi +; X86-NOSSE-NEXT: andl %ecx, %edi +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: addl %edi, %esi +; X86-NOSSE-NEXT: movl %esi, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %esi, %edi +; X86-NOSSE-NEXT: movl %edx, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: movl $1431655765, %eax # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %eax, %esi +; X86-NOSSE-NEXT: subl %esi, %edx +; X86-NOSSE-NEXT: movl %edx, %esi +; X86-NOSSE-NEXT: andl %ecx, %esi +; X86-NOSSE-NEXT: shrl $2, %edx +; X86-NOSSE-NEXT: andl %ecx, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %edx, %ebx +; X86-NOSSE-NEXT: shrl $4, %ebx +; X86-NOSSE-NEXT: addl %edx, %ebx +; X86-NOSSE-NEXT: movl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %edx, %edi +; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edi +; X86-NOSSE-NEXT: andl %edx, %ebx +; X86-NOSSE-NEXT: imull $16843009, %ebx, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %edi, %edx +; X86-NOSSE-NEXT: movl 28(%ebp), %ebx +; X86-NOSSE-NEXT: movl %ebx, %edi +; X86-NOSSE-NEXT: shrl %edi +; X86-NOSSE-NEXT: andl %eax, %edi +; X86-NOSSE-NEXT: subl %edi, %ebx +; X86-NOSSE-NEXT: movl %ebx, %edi +; X86-NOSSE-NEXT: andl %ecx, %edi ; X86-NOSSE-NEXT: shrl $2, %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx -; X86-NOSSE-NEXT: addl %ebp, %ebx -; X86-NOSSE-NEXT: movl %ebx, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %ebx, %ebp +; X86-NOSSE-NEXT: addl %edi, %ebx +; X86-NOSSE-NEXT: movl %ebx, %edi +; X86-NOSSE-NEXT: shrl $4, %edi +; X86-NOSSE-NEXT: addl %ebx, %edi +; X86-NOSSE-NEXT: movl 24(%ebp), %eax ; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: shrl 
%ebx -; X86-NOSSE-NEXT: andl %edi, %ebx +; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: andl %esi, %ebx ; X86-NOSSE-NEXT: subl %ebx, %eax ; X86-NOSSE-NEXT: movl %eax, %ebx ; X86-NOSSE-NEXT: andl %ecx, %ebx ; X86-NOSSE-NEXT: shrl $2, %eax ; X86-NOSSE-NEXT: andl %ecx, %eax ; X86-NOSSE-NEXT: addl %ebx, %eax -; X86-NOSSE-NEXT: movl %eax, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: imull $16843009, %ebp, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: movl %eax, %ecx +; X86-NOSSE-NEXT: shrl $4, %ecx +; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: movl $252645135, %eax # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: andl %eax, %edi +; X86-NOSSE-NEXT: andl %eax, %ecx +; X86-NOSSE-NEXT: imull $16843009, %edi, %eax # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %eax -; X86-NOSSE-NEXT: andl %ebx, %edi -; X86-NOSSE-NEXT: imull $16843009, %edi, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: addl %eax, %edi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %ebp # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %ebp, %eax -; X86-NOSSE-NEXT: subl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: shrl $2, %esi -; X86-NOSSE-NEXT: andl %ecx, %esi -; X86-NOSSE-NEXT: addl %eax, %esi -; X86-NOSSE-NEXT: movl %esi, %ebp -; X86-NOSSE-NEXT: shrl $4, %ebp -; X86-NOSSE-NEXT: addl %esi, %ebp -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl %eax -; X86-NOSSE-NEXT: movl $1431655765, %esi # imm = 0x55555555 -; X86-NOSSE-NEXT: andl %esi, %eax -; X86-NOSSE-NEXT: subl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: andl %ecx, %eax -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl %ecx, %edx -; X86-NOSSE-NEXT: addl %eax, %edx -; X86-NOSSE-NEXT: movl %edx, %eax -; X86-NOSSE-NEXT: shrl 
$4, %eax -; X86-NOSSE-NEXT: addl %edx, %eax -; X86-NOSSE-NEXT: andl %ebx, %ebp -; X86-NOSSE-NEXT: andl %ebx, %eax -; X86-NOSSE-NEXT: imull $16843009, %ebp, %ecx # imm = 0x1010101 +; X86-NOSSE-NEXT: imull $16843009, %ecx, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx -; X86-NOSSE-NEXT: imull $16843009, %eax, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: addl %ecx, %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: xorl %ecx, %ecx -; X86-NOSSE-NEXT: movl %ecx, 12(%eax) -; X86-NOSSE-NEXT: movl %ecx, 8(%eax) -; X86-NOSSE-NEXT: movl %ecx, 4(%eax) -; X86-NOSSE-NEXT: movl %edx, (%eax) +; X86-NOSSE-NEXT: addl %eax, %ecx +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: addl %edx, %ecx +; X86-NOSSE-NEXT: xorl %edx, %edx +; X86-NOSSE-NEXT: movl %edx, 12(%eax) +; X86-NOSSE-NEXT: movl %edx, 8(%eax) +; X86-NOSSE-NEXT: movl %edx, 4(%eax) +; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: leal -12(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi ; X86-NOSSE-NEXT: popl %ebx @@ -1057,13 +1083,17 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; ; X86-POPCNT-LABEL: cnt128_optsize: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %ebp +; X86-POPCNT-NEXT: movl %esp, %ebp ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: andl $-16, %esp +; X86-POPCNT-NEXT: subl $16, %esp +; X86-POPCNT-NEXT: movl 8(%ebp), %eax +; X86-POPCNT-NEXT: popcntl 36(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 32(%ebp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: popcntl 28(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 24(%ebp), %esi ; X86-POPCNT-NEXT: addl %ecx, %esi ; X86-POPCNT-NEXT: addl %edx, %esi ; 
X86-POPCNT-NEXT: xorl %ecx, %ecx @@ -1071,7 +1101,9 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-POPCNT-NEXT: movl %ecx, 8(%eax) ; X86-POPCNT-NEXT: movl %ecx, 4(%eax) ; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: leal -4(%ebp), %esp ; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: popl %ebp ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128_optsize: @@ -1118,7 +1150,11 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; ; X86-SSE2-LABEL: cnt128_optsize: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: psrlw $1, %xmm0 @@ -1161,11 +1197,17 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl $4 ; ; X86-SSSE3-LABEL: cnt128_optsize: ; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: pushl %ebp +; X86-SSSE3-NEXT: movl %esp, %ebp +; X86-SSSE3-NEXT: andl $-16, %esp +; X86-SSSE3-NEXT: subl $16, %esp +; X86-SSSE3-NEXT: movl 8(%ebp), %eax ; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 @@ -1198,6 +1240,8 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize { ; X86-SSSE3-NEXT: movl %ecx, 8(%eax) ; X86-SSSE3-NEXT: movl %ecx, 4(%eax) ; X86-SSSE3-NEXT: movl %edx, (%eax) +; X86-SSSE3-NEXT: movl %ebp, %esp +; X86-SSSE3-NEXT: popl %ebp ; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt @@ -1415,85 +1459,88 @@ define i64 
@cnt64_pgso(i64 %x) nounwind readnone !prof !14 { define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-NOSSE-LABEL: cnt128_pgso: ; X86-NOSSE: # %bb.0: -; X86-NOSSE-NEXT: pushl %ebx +; X86-NOSSE-NEXT: pushl %ebp +; X86-NOSSE-NEXT: movl %esp, %ebp ; X86-NOSSE-NEXT: pushl %edi ; X86-NOSSE-NEXT: pushl %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %edi -; X86-NOSSE-NEXT: movl %edi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %edi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %edi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl %ebx -; X86-NOSSE-NEXT: andl $1431655765, %ebx # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %ebx, %esi -; X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: andl $858993459, %ebx # imm = 0x33333333 +; X86-NOSSE-NEXT: andl $-16, %esp +; X86-NOSSE-NEXT: movl 24(%ebp), %eax +; X86-NOSSE-NEXT: movl 32(%ebp), %ecx +; X86-NOSSE-NEXT: movl 36(%ebp), %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %esi ; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %ebx, %esi -; 
X86-NOSSE-NEXT: movl %esi, %ebx -; X86-NOSSE-NEXT: shrl $4, %ebx -; X86-NOSSE-NEXT: addl %esi, %ebx -; X86-NOSSE-NEXT: andl $252645135, %ebx # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %ebx, %esi # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %esi -; X86-NOSSE-NEXT: addl %edi, %esi -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 -; X86-NOSSE-NEXT: shrl $2, %edx -; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %edx -; X86-NOSSE-NEXT: movl %edx, %edi -; X86-NOSSE-NEXT: shrl $4, %edi -; X86-NOSSE-NEXT: addl %edx, %edi -; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F -; X86-NOSSE-NEXT: imull $16843009, %edi, %edx # imm = 0x1010101 -; X86-NOSSE-NEXT: shrl $24, %edx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: shrl %edi -; X86-NOSSE-NEXT: andl $1431655765, %edi # imm = 0x55555555 -; X86-NOSSE-NEXT: subl %edi, %ecx -; X86-NOSSE-NEXT: movl %ecx, %edi -; X86-NOSSE-NEXT: andl $858993459, %edi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %ecx, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %ecx +; X86-NOSSE-NEXT: movl %ecx, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 ; X86-NOSSE-NEXT: shrl $2, %ecx ; X86-NOSSE-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X86-NOSSE-NEXT: addl %edi, %ecx +; X86-NOSSE-NEXT: addl %esi, %ecx ; X86-NOSSE-NEXT: movl %ecx, %edi ; X86-NOSSE-NEXT: shrl $4, %edi ; X86-NOSSE-NEXT: addl %ecx, %edi +; X86-NOSSE-NEXT: movl 28(%ebp), %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %edx # imm = 
0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx ; X86-NOSSE-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F ; X86-NOSSE-NEXT: imull $16843009, %edi, %ecx # imm = 0x1010101 ; X86-NOSSE-NEXT: shrl $24, %ecx ; X86-NOSSE-NEXT: addl %edx, %ecx -; X86-NOSSE-NEXT: addl %esi, %ecx -; X86-NOSSE-NEXT: xorl %edx, %edx -; X86-NOSSE-NEXT: movl %edx, 12(%eax) -; X86-NOSSE-NEXT: movl %edx, 8(%eax) -; X86-NOSSE-NEXT: movl %edx, 4(%eax) -; X86-NOSSE-NEXT: movl %ecx, (%eax) +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl %edx +; X86-NOSSE-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %edx, %esi +; X86-NOSSE-NEXT: movl %esi, %edx +; X86-NOSSE-NEXT: shrl $4, %edx +; X86-NOSSE-NEXT: addl %esi, %edx +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl %esi +; X86-NOSSE-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NOSSE-NEXT: subl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NOSSE-NEXT: shrl $2, %eax +; X86-NOSSE-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NOSSE-NEXT: addl %esi, %eax +; X86-NOSSE-NEXT: movl %eax, %esi +; X86-NOSSE-NEXT: shrl $4, %esi +; X86-NOSSE-NEXT: addl %eax, %esi +; X86-NOSSE-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %edx, %eax # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %eax +; X86-NOSSE-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X86-NOSSE-NEXT: imull $16843009, %esi, %edx # imm = 0x1010101 +; X86-NOSSE-NEXT: shrl $24, %edx +; X86-NOSSE-NEXT: addl %eax, %edx +; X86-NOSSE-NEXT: movl 8(%ebp), %eax +; X86-NOSSE-NEXT: addl %ecx, %edx +; X86-NOSSE-NEXT: xorl %ecx, %ecx +; X86-NOSSE-NEXT: movl %ecx, 12(%eax) +; X86-NOSSE-NEXT: movl %ecx, 8(%eax) +; X86-NOSSE-NEXT: movl %ecx, 
4(%eax) +; X86-NOSSE-NEXT: movl %edx, (%eax) +; X86-NOSSE-NEXT: leal -8(%ebp), %esp ; X86-NOSSE-NEXT: popl %esi ; X86-NOSSE-NEXT: popl %edi -; X86-NOSSE-NEXT: popl %ebx +; X86-NOSSE-NEXT: popl %ebp ; X86-NOSSE-NEXT: retl $4 ; ; X64-BASE-LABEL: cnt128_pgso: @@ -1538,13 +1585,17 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; ; X86-POPCNT-LABEL: cnt128_pgso: ; X86-POPCNT: # %bb.0: +; X86-POPCNT-NEXT: pushl %ebp +; X86-POPCNT-NEXT: movl %esp, %ebp ; X86-POPCNT-NEXT: pushl %esi -; X86-POPCNT-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %edx +; X86-POPCNT-NEXT: andl $-16, %esp +; X86-POPCNT-NEXT: subl $16, %esp +; X86-POPCNT-NEXT: movl 8(%ebp), %eax +; X86-POPCNT-NEXT: popcntl 36(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 32(%ebp), %edx ; X86-POPCNT-NEXT: addl %ecx, %edx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %ecx -; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %esi +; X86-POPCNT-NEXT: popcntl 28(%ebp), %ecx +; X86-POPCNT-NEXT: popcntl 24(%ebp), %esi ; X86-POPCNT-NEXT: addl %ecx, %esi ; X86-POPCNT-NEXT: addl %edx, %esi ; X86-POPCNT-NEXT: xorl %ecx, %ecx @@ -1552,7 +1603,9 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-POPCNT-NEXT: movl %ecx, 8(%eax) ; X86-POPCNT-NEXT: movl %ecx, 4(%eax) ; X86-POPCNT-NEXT: movl %esi, (%eax) +; X86-POPCNT-NEXT: leal -4(%ebp), %esp ; X86-POPCNT-NEXT: popl %esi +; X86-POPCNT-NEXT: popl %ebp ; X86-POPCNT-NEXT: retl $4 ; ; X64-POPCNT-LABEL: cnt128_pgso: @@ -1599,7 +1652,11 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; ; X86-SSE2-LABEL: cnt128_pgso: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE2-NEXT: pushl %ebp +; X86-SSE2-NEXT: movl %esp, %ebp +; X86-SSE2-NEXT: andl $-16, %esp +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movl 8(%ebp), %eax ; X86-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: psrlw 
$1, %xmm0 @@ -1642,11 +1699,17 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-SSE2-NEXT: movl %ecx, 8(%eax) ; X86-SSE2-NEXT: movl %ecx, 4(%eax) ; X86-SSE2-NEXT: movl %edx, (%eax) +; X86-SSE2-NEXT: movl %ebp, %esp +; X86-SSE2-NEXT: popl %ebp ; X86-SSE2-NEXT: retl $4 ; ; X86-SSSE3-LABEL: cnt128_pgso: ; X86-SSSE3: # %bb.0: -; X86-SSSE3-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSSE3-NEXT: pushl %ebp +; X86-SSSE3-NEXT: movl %esp, %ebp +; X86-SSSE3-NEXT: andl $-16, %esp +; X86-SSSE3-NEXT: subl $16, %esp +; X86-SSSE3-NEXT: movl 8(%ebp), %eax ; X86-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X86-SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; X86-SSSE3-NEXT: movdqa %xmm2, %xmm3 @@ -1679,6 +1742,8 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 { ; X86-SSSE3-NEXT: movl %ecx, 8(%eax) ; X86-SSSE3-NEXT: movl %ecx, 4(%eax) ; X86-SSSE3-NEXT: movl %edx, (%eax) +; X86-SSSE3-NEXT: movl %ebp, %esp +; X86-SSSE3-NEXT: popl %ebp ; X86-SSSE3-NEXT: retl $4 %cnt = tail call i128 @llvm.ctpop.i128(i128 %x) ret i128 %cnt diff --git a/llvm/test/CodeGen/X86/pr46004.ll b/llvm/test/CodeGen/X86/pr46004.ll index f7c7da089c365..829d6dfceba3d 100644 --- a/llvm/test/CodeGen/X86/pr46004.ll +++ b/llvm/test/CodeGen/X86/pr46004.ll @@ -6,7 +6,17 @@ define void @fuzz22357(i128 %a0) { ; X86-LABEL: fuzz22357: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp ; X86-NEXT: movb $0, (%eax) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: fuzz22357: @@ -24,6 +34,15 @@ define void @fuzz22357(i128 %a0) { define void @fuzz22723(i128 %a0) { ; X86-LABEL: fuzz22723: ; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: 
movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: fuzz22723: diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll index 50a967e1c2a1a..ce9723b3a84bc 100644 --- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll @@ -762,11 +762,15 @@ define i32 @x_to_s32(x86_fp80 %a) nounwind { define i32 @t_to_u32(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_u32: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixunstfsi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_u32: @@ -797,12 +801,18 @@ define i32 @t_to_u32(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_u32: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixunstfsi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_u32: @@ -835,12 +845,18 @@ define i32 
@t_to_u32(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_u32: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixunstfsi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_u32: @@ -860,11 +876,15 @@ define i32 @t_to_u32(fp128 %a) nounwind { define i32 @t_to_s32(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_s32: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixtfsi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_s32: @@ -895,12 +915,18 @@ define i32 @t_to_s32(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_s32: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: 
calll ___fixtfsi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_s32: @@ -933,12 +959,18 @@ define i32 @t_to_s32(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_s32: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixtfsi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_s32: diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll index f516db8b30ffe..3287869f2c601 100644 --- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll +++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll @@ -1417,11 +1417,15 @@ define i64 @x_to_s64(x86_fp80 %a) nounwind { define i64 @t_to_u64(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_u64: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixunstfdi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl %ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_u64: @@ -1452,12 +1456,18 @@ define i64 @t_to_u64(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_u64: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl 
{{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixunstfdi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_u64: @@ -1490,12 +1500,18 @@ define i64 @t_to_u64(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_u64: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixunstfdi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_u64: @@ -1515,11 +1531,15 @@ define i64 @t_to_u64(fp128 %a) nounwind { define i64 @t_to_s64(fp128 %a) nounwind { ; X86-AVX512-WIN-LABEL: t_to_s64: ; X86-AVX512-WIN: # %bb.0: -; X86-AVX512-WIN-NEXT: subl $16, %esp -; X86-AVX512-WIN-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; X86-AVX512-WIN-NEXT: pushl %ebp +; X86-AVX512-WIN-NEXT: movl %esp, %ebp +; X86-AVX512-WIN-NEXT: andl $-16, %esp +; X86-AVX512-WIN-NEXT: subl $32, %esp +; X86-AVX512-WIN-NEXT: vmovups 8(%ebp), %xmm0 ; X86-AVX512-WIN-NEXT: vmovups %xmm0, (%esp) ; X86-AVX512-WIN-NEXT: calll ___fixtfdi -; X86-AVX512-WIN-NEXT: addl $16, %esp +; X86-AVX512-WIN-NEXT: movl %ebp, %esp +; X86-AVX512-WIN-NEXT: popl 
%ebp ; X86-AVX512-WIN-NEXT: retl ; ; X86-AVX512-LIN-LABEL: t_to_s64: @@ -1550,12 +1570,18 @@ define i64 @t_to_s64(fp128 %a) nounwind { ; ; X86-SSE-WIN-LABEL: t_to_s64: ; X86-SSE-WIN: # %bb.0: -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X86-SSE-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X86-SSE-WIN-NEXT: pushl %ebp +; X86-SSE-WIN-NEXT: movl %esp, %ebp +; X86-SSE-WIN-NEXT: andl $-16, %esp +; X86-SSE-WIN-NEXT: subl $16, %esp +; X86-SSE-WIN-NEXT: pushl 20(%ebp) +; X86-SSE-WIN-NEXT: pushl 16(%ebp) +; X86-SSE-WIN-NEXT: pushl 12(%ebp) +; X86-SSE-WIN-NEXT: pushl 8(%ebp) ; X86-SSE-WIN-NEXT: calll ___fixtfdi ; X86-SSE-WIN-NEXT: addl $16, %esp +; X86-SSE-WIN-NEXT: movl %ebp, %esp +; X86-SSE-WIN-NEXT: popl %ebp ; X86-SSE-WIN-NEXT: retl ; ; X86-SSE-LIN-LABEL: t_to_s64: @@ -1588,12 +1614,18 @@ define i64 @t_to_s64(fp128 %a) nounwind { ; ; X87-WIN-LABEL: t_to_s64: ; X87-WIN: # %bb.0: -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) -; X87-WIN-NEXT: pushl {{[0-9]+}}(%esp) +; X87-WIN-NEXT: pushl %ebp +; X87-WIN-NEXT: movl %esp, %ebp +; X87-WIN-NEXT: andl $-16, %esp +; X87-WIN-NEXT: subl $16, %esp +; X87-WIN-NEXT: pushl 20(%ebp) +; X87-WIN-NEXT: pushl 16(%ebp) +; X87-WIN-NEXT: pushl 12(%ebp) +; X87-WIN-NEXT: pushl 8(%ebp) ; X87-WIN-NEXT: calll ___fixtfdi ; X87-WIN-NEXT: addl $16, %esp +; X87-WIN-NEXT: movl %ebp, %esp +; X87-WIN-NEXT: popl %ebp ; X87-WIN-NEXT: retl ; ; X87-LIN-LABEL: t_to_s64: diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll index 874913629e9e3..8a287229a1cb1 100644 --- a/llvm/test/CodeGen/X86/scmp.ll +++ b/llvm/test/CodeGen/X86/scmp.ll @@ -118,30 +118,33 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind { ; X86-LABEL: scmp.8.128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: sbbl %ebp, %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 24(%ebp), %ecx +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: cmpl %ecx, 8(%ebp) +; X86-NEXT: sbbl %eax, %edi +; X86-NEXT: movl 32(%ebp), %edi +; X86-NEXT: movl 16(%ebp), %ebx +; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl 36(%ebp), %ebx +; X86-NEXT: movl 20(%ebp), %edx +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: setl %cl -; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl %ebx, %edx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: movl 24(%ebp), %esi +; X86-NEXT: cmpl 8(%ebp), %esi +; X86-NEXT: sbbl 12(%ebp), %eax +; X86-NEXT: sbbl 16(%ebp), %edi +; X86-NEXT: sbbl %edx, %ebx ; X86-NEXT: setl %al ; X86-NEXT: subb %cl, %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll index 4925f8bc6c8b0..392bc83d9d5d8 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix.ll @@ -307,69 +307,70 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp +; X86-NEXT: subl $112, %esp ; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: movl 16(%ebp), %eax ; X86-NEXT: movl 20(%ebp), %edx -; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl 
%edx, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, %esi ; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %eax, %edi -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shldl $31, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $31, %ecx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: shldl $31, %edi, %esi +; X86-NEXT: shldl $31, %ecx, %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NEXT: shll $31, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %edx -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %eax +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl $1, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %ebx -; X86-NEXT: testl %esi, %esi -; X86-NEXT: sets %al -; X86-NEXT: testl %edi, %edi -; X86-NEXT: sets %cl -; X86-NEXT: xorb %al, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 20(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl %eax, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %esi +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: sets %al +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: sets %bl +; X86-NEXT: xorb %al, %bl ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: setne %al -; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: testb %bl, %al +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; 
X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %edx ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index e7727a0ab6178..7df490f984928 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -370,67 +370,68 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $96, %esp -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ebp), %eax -; X86-NEXT: movl 20(%ebp), %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl $31, %eax, %edi -; X86-NEXT: shldl $31, %ecx, %eax +; X86-NEXT: subl $128, %esp +; X86-NEXT: movl 8(%ebp), %esi +; X86-NEXT: movl 12(%ebp), %edi +; X86-NEXT: movl 16(%ebp), %ecx +; X86-NEXT: movl 20(%ebp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: sarl $31, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll $31, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: shldl $31, %edi, %ebx +; X86-NEXT: shldl $31, %esi, %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; 
X86-NEXT: shll $31, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 20(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 16(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: subl $1, %esi -; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $1, %edi ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: sets %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: sbbl $0, %ebx ; X86-NEXT: testl %ecx, %ecx -; X86-NEXT: sets %dl -; X86-NEXT: xorb %al, %dl -; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: 
leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl %eax +; X86-NEXT: sets %al +; X86-NEXT: testl %edx, %edx +; X86-NEXT: sets %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -438,41 +439,38 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: setne %al ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: cmovel %esi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: cmpl $-1, %esi -; X86-NEXT: movl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmpl $-1, %edi ; X86-NEXT: sbbl $2147483647, %ecx # imm = 0x7FFFFFFF -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: sbbl $0, %ecx -; X86-NEXT: movl %edi, %ecx +; X86-NEXT: movl %ebx, %ecx ; 
X86-NEXT: sbbl $0, %ecx ; X86-NEXT: movl $2147483647, %edx # imm = 0x7FFFFFFF -; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovgel %ecx, %edi -; X86-NEXT: movl %edi, %eax ; X86-NEXT: cmovgel %ecx, %ebx +; X86-NEXT: cmovgel %ecx, %eax ; X86-NEXT: movl $-1, %ecx -; X86-NEXT: cmovgel %ecx, %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: negl %edi -; X86-NEXT: movl $-2147483648, %edi # imm = 0x80000000 -; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: movl $-1, %edi -; X86-NEXT: sbbl %ebx, %edi -; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: cmovgel %ecx, %edi +; X86-NEXT: movl %edi, %esi +; X86-NEXT: negl %esi +; X86-NEXT: movl $-2147483648, %esi # imm = 0x80000000 +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl $-1, %esi +; X86-NEXT: sbbl %eax, %esi +; X86-NEXT: sbbl %ebx, %ecx ; X86-NEXT: movl $0, %eax -; X86-NEXT: cmovgel %eax, %esi +; X86-NEXT: cmovgel %eax, %edi ; X86-NEXT: movl $-2147483648, %eax # imm = 0x80000000 ; X86-NEXT: cmovgel %eax, %edx -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -805,137 +803,155 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $208, %esp -; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: subl $240, %esp +; X86-NEXT: movl 12(%ebp), %esi +; X86-NEXT: movl 20(%ebp), %edi ; X86-NEXT: movl 16(%ebp), %ebx -; X86-NEXT: movl 32(%ebp), %eax -; X86-NEXT: movl %eax, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: sarl $31, %edi -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: sarl $31, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 32(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, 
{{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) ; X86-NEXT: leal (%ebx,%ebx), %eax ; X86-NEXT: shrl $31, %ebx ; X86-NEXT: shldl $31, %eax, %ebx -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: leal (%ecx,%ecx), %edx -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %edx, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 36(%ebp) ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %edx +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: leal (%edi,%edi), %eax +; 
X86-NEXT: shrl $31, %edi +; X86-NEXT: shldl $31, %eax, %edi +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl 32(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 36(%ebp) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded 
Reload -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 28(%ebp), %edx -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: sarl $31, %ebx -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edi +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, %edi ; X86-NEXT: sarl $31, %edi -; X86-NEXT: leal (%ecx,%ecx), %eax -; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: sarl $31, %ebx +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: leal (%esi,%esi), %eax +; X86-NEXT: shrl $31, %esi +; X86-NEXT: shldl $31, %eax, %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl 40(%ebp), %esi -; X86-NEXT: sarl $31, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: sarl $31, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: movl 24(%ebp), %ecx ; X86-NEXT: movl %ecx, 
%eax ; X86-NEXT: sarl $31, %eax -; X86-NEXT: leal (%ecx,%ecx), %edx +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: leal (%ecx,%ecx), %eax ; X86-NEXT: shrl $31, %ecx -; X86-NEXT: shldl $31, %edx, %ecx +; X86-NEXT: shldl $31, %eax, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl %esi -; X86-NEXT: pushl 40(%ebp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %edx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl 28(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl 28(%ebp) -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %edi -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __divti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl 40(%ebp), %eax +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %eax @@ -949,18 +965,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %edx +; X86-NEXT: testl %edi, %edi +; X86-NEXT: sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: sets %bl -; X86-NEXT: testl %edi, %edi -; X86-NEXT: sets %bh -; X86-NEXT: xorb %bl, %bh +; X86-NEXT: xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: orl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl {{[0-9]+}}(%esp), %esi ; X86-NEXT: orl %edi, %esi -; X86-NEXT: setne %bl -; X86-NEXT: testb %bh, %bl +; X86-NEXT: setne %bh +; X86-NEXT: testb %bl, %bh ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload @@ -1107,36 +1123,24 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: subl $1, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %esi ; X86-NEXT: movl 
{{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: sbbl $0, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl $0, %edi +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NEXT: sets %al -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: testl %edx, %edx -; X86-NEXT: sets %ah -; X86-NEXT: xorb %al, %ah -; X86-NEXT: movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl 40(%ebp) -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %edx -; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %eax +; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload +; X86-NEXT: sets %cl +; X86-NEXT: xorb %al, %cl +; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: calll __modti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -1144,38 +1148,38 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X86-NEXT: orl %eax, %ecx ; X86-NEXT: setne %al ; X86-NEXT: testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded 
Reload -; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload ; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload ; X86-NEXT: cmpl $-1, %ebx -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl %esi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl $0, %eax -; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %eax ; X86-NEXT: sbbl $0, %eax ; X86-NEXT: movl $0, %eax -; X86-NEXT: cmovgel %eax, %esi -; X86-NEXT: cmovgel %eax, %ecx ; X86-NEXT: cmovgel %eax, %edi +; X86-NEXT: cmovgel %eax, %ecx +; X86-NEXT: cmovgel %eax, %esi ; X86-NEXT: movl $-1, %edx ; X86-NEXT: cmovgel %edx, %ebx ; X86-NEXT: movl %ebx, %eax ; X86-NEXT: negl %eax ; X86-NEXT: movl $-1, %eax -; X86-NEXT: sbbl %edi, %eax +; X86-NEXT: sbbl %esi, %eax ; X86-NEXT: movl $-1, %eax ; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: movl $-1, %eax -; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: sbbl %edi, %eax ; X86-NEXT: movl $0, %eax ; X86-NEXT: cmovgel %eax, %ebx -; X86-NEXT: cmovgel %edx, %edi -; X86-NEXT: shldl $31, %ebx, %edi +; X86-NEXT: cmovgel %edx, %esi +; X86-NEXT: shldl $31, %ebx, %esi ; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 12(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl %ecx, 8(%eax) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll index 76cb4e87bae18..dfeef48897e06 100644 --- a/llvm/test/CodeGen/X86/shift-combine.ll +++ b/llvm/test/CodeGen/X86/shift-combine.ll @@ -792,14 +792,24 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32 define void @combineShiftOfShiftedLogic(i128 %a1, i32 %a2, ptr %p) { ; X86-LABEL: combineShiftOfShiftedLogic: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: pushl 
%ebp +; X86-NEXT: .cfi_def_cfa_offset 8 +; X86-NEXT: .cfi_offset %ebp, -8 +; X86-NEXT: movl %esp, %ebp +; X86-NEXT: .cfi_def_cfa_register %ebp +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 24(%ebp), %eax +; X86-NEXT: movl 28(%ebp), %ecx ; X86-NEXT: movl %eax, 20(%ecx) ; X86-NEXT: movl $0, 16(%ecx) ; X86-NEXT: movl $0, 12(%ecx) ; X86-NEXT: movl $0, 8(%ecx) ; X86-NEXT: movl $0, 4(%ecx) ; X86-NEXT: movl $0, (%ecx) +; X86-NEXT: movl %ebp, %esp +; X86-NEXT: popl %ebp +; X86-NEXT: .cfi_def_cfa %esp, 4 ; X86-NEXT: retl ; ; X64-LABEL: combineShiftOfShiftedLogic: diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 767bd772ab7a3..9323cd5b1917f 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -212,9 +212,18 @@ entry: } define void @test_lshr_i128_outofrange(i128 %x, ptr nocapture %r) nounwind { -; ALL-LABEL: test_lshr_i128_outofrange: -; ALL: # %bb.0: # %entry -; ALL-NEXT: ret{{[l|q]}} +; i686-LABEL: test_lshr_i128_outofrange: +; i686: # %bb.0: # %entry +; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp +; i686-NEXT: andl $-16, %esp +; i686-NEXT: movl %ebp, %esp +; i686-NEXT: popl %ebp +; i686-NEXT: retl +; +; x86_64-LABEL: test_lshr_i128_outofrange: +; x86_64: # %bb.0: # %entry +; x86_64-NEXT: retq entry: %0 = lshr i128 %x, -1 store i128 %0, ptr %r, align 16 @@ -222,9 +231,18 @@ entry: } define void @test_ashr_i128_outofrange(i128 %x, ptr nocapture %r) nounwind { -; ALL-LABEL: test_ashr_i128_outofrange: -; ALL: # %bb.0: # %entry -; ALL-NEXT: ret{{[l|q]}} +; i686-LABEL: test_ashr_i128_outofrange: +; i686: # %bb.0: # %entry +; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp +; i686-NEXT: andl $-16, %esp +; i686-NEXT: movl %ebp, %esp +; i686-NEXT: popl %ebp +; i686-NEXT: retl +; +; x86_64-LABEL: test_ashr_i128_outofrange: +; x86_64: # %bb.0: # %entry +; x86_64-NEXT: retq entry: %0 = ashr i128 %x, -1 store i128 %0, ptr %r, align 16 @@ -232,9 +250,18 @@ 
entry: } define void @test_shl_i128_outofrange(i128 %x, ptr nocapture %r) nounwind { -; ALL-LABEL: test_shl_i128_outofrange: -; ALL: # %bb.0: # %entry -; ALL-NEXT: ret{{[l|q]}} +; i686-LABEL: test_shl_i128_outofrange: +; i686: # %bb.0: # %entry +; i686-NEXT: pushl %ebp +; i686-NEXT: movl %esp, %ebp +; i686-NEXT: andl $-16, %esp +; i686-NEXT: movl %ebp, %esp +; i686-NEXT: popl %ebp +; i686-NEXT: retl +; +; x86_64-LABEL: test_shl_i128_outofrange: +; x86_64: # %bb.0: # %entry +; x86_64-NEXT: retq entry: %0 = shl i128 %x, -1 store i128 %0, ptr %r, align 16 @@ -874,26 +901,31 @@ define <2 x i256> @shl_zext_lshr_outofrange(<2 x i128> %a0) { define i128 @lshr_shl_mask(i128 %a0) { ; i686-LABEL: lshr_shl_mask: ; i686: # %bb.0: -; i686-NEXT: pushl %edi +; i686-NEXT: pushl %ebp ; i686-NEXT: .cfi_def_cfa_offset 8 +; i686-NEXT: .cfi_offset %ebp, -8 +; i686-NEXT: movl %esp, %ebp +; i686-NEXT: .cfi_def_cfa_register %ebp +; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi -; i686-NEXT: .cfi_def_cfa_offset 12 -; i686-NEXT: .cfi_offset %esi, -12 -; i686-NEXT: .cfi_offset %edi, -8 -; i686-NEXT: movl {{[0-9]+}}(%esp), %eax -; i686-NEXT: movl {{[0-9]+}}(%esp), %ecx -; i686-NEXT: movl {{[0-9]+}}(%esp), %edx -; i686-NEXT: movl {{[0-9]+}}(%esp), %esi +; i686-NEXT: andl $-16, %esp +; i686-NEXT: .cfi_offset %esi, -16 +; i686-NEXT: .cfi_offset %edi, -12 +; i686-NEXT: movl 8(%ebp), %eax +; i686-NEXT: movl 24(%ebp), %ecx +; i686-NEXT: movl 28(%ebp), %edx +; i686-NEXT: movl 32(%ebp), %esi ; i686-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF -; i686-NEXT: andl {{[0-9]+}}(%esp), %edi +; i686-NEXT: andl 36(%ebp), %edi ; i686-NEXT: movl %edi, 12(%eax) ; i686-NEXT: movl %esi, 8(%eax) ; i686-NEXT: movl %edx, 4(%eax) ; i686-NEXT: movl %ecx, (%eax) +; i686-NEXT: leal -8(%ebp), %esp ; i686-NEXT: popl %esi -; i686-NEXT: .cfi_def_cfa_offset 8 ; i686-NEXT: popl %edi -; i686-NEXT: .cfi_def_cfa_offset 4 +; i686-NEXT: popl %ebp +; i686-NEXT: .cfi_def_cfa %esp, 4 ; i686-NEXT: retl $4 ; ; x86_64-LABEL: 
lshr_shl_mask: diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll index 86891e964d96d..509d4443e930a 100644 --- a/llvm/test/CodeGen/X86/smax.ll +++ b/llvm/test/CodeGen/X86/smax.ll @@ -151,31 +151,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: test_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ebx +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: cmpl 24(%ebp), %ebx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sbbl 28(%ebp), %esi +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: movl 52(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovll %ebx, %edx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovll %ebp, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: cmovll 24(%ebp), %ebx +; X86-NEXT: cmovll 28(%ebp), %edi +; X86-NEXT: cmovll 32(%ebp), %edx +; X86-NEXT: cmovll %esi, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx 
@@ -717,29 +720,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; ; X86-LABEL: test_signbits_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: shrdl $28, %edi, %ecx -; X86-NEXT: sarl $28, %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: shrdl $28, %edx, %ecx +; X86-NEXT: sarl $28, %edx ; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: cmovll %esi, %ecx -; X86-NEXT: cmovll %edx, %edi -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: sarl $31, %edi +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %ax = ashr i128 %a, 64 %bx = ashr i128 %b, 92 diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll index 8907f6c4cd598..5e9fe27b41d2c 100644 --- a/llvm/test/CodeGen/X86/smin.ll +++ b/llvm/test/CodeGen/X86/smin.ll @@ -151,32 +151,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: test_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %edx, %ebx -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: cmpl %ecx, 24(%ebp) +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %edi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovll %ebx, %edx -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovll {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovll %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: cmovll 24(%ebp), %ecx +; X86-NEXT: cmovll 28(%ebp), %edx +; X86-NEXT: cmovll 32(%ebp), %esi +; X86-NEXT: cmovll %edi, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -718,29 +720,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; ; X86-LABEL: test_signbits_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi 
-; X86-NEXT: shrdl $28, %edi, %ecx -; X86-NEXT: sarl $28, %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: shrdl $28, %edx, %ecx +; X86-NEXT: sarl $28, %edx ; X86-NEXT: cmpl %ecx, %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: cmovll %esi, %ecx -; X86-NEXT: cmovll %edx, %edi -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: sarl $31, %edi +; X86-NEXT: cmovll %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %ax = ashr i128 %a, 64 %bx = ashr i128 %b, 92 diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll index 6a52acfe2fb30..7f17299b39e33 100644 --- a/llvm/test/CodeGen/X86/ucmp.ll +++ b/llvm/test/CodeGen/X86/ucmp.ll @@ -107,29 +107,33 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind { ; X86-LABEL: ucmp.8.128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl 12(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %ecx +; X86-NEXT: cmpl %eax, 24(%ebp) ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: sbbl %edx, %eax -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax +; X86-NEXT: movl 16(%ebp), %ebx +; X86-NEXT: movl 32(%ebp), %eax +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: movl 20(%ebp), %ecx +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: movl %esi, %eax ; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: setb %al -; X86-NEXT: cmpl %ebx, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl %ebp, %esi -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: sbbl %edi, %ecx +; X86-NEXT: movl 8(%ebp), %edi +; X86-NEXT: cmpl 24(%ebp), %edi +; X86-NEXT: sbbl 28(%ebp), %edx +; X86-NEXT: sbbl 32(%ebp), %ebx +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: sbbb $0, %al +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll index 5b1e0545502b8..82dfeeee13293 100644 --- a/llvm/test/CodeGen/X86/udiv_fix.ll +++ b/llvm/test/CodeGen/X86/udiv_fix.ll @@ -153,26 +153,28 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $32, %esp +; X86-NEXT: subl $80, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: shrl %edx +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: shldl $31, %eax, %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: shll $31, %eax -; X86-NEXT: movl %esp, %esi -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %esi +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl 
$0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __udivti3 -; X86-NEXT: addl $32, %esp -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: subl $4, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: leal -4(%ebp), %esp ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll index 30a7f80b2315d..3da5973f9f903 100644 --- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll @@ -194,32 +194,34 @@ define i64 @func5(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $32, %esp +; X86-NEXT: subl $80, %esp ; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl 12(%ebp), %ecx +; X86-NEXT: movl 16(%ebp), %edx +; X86-NEXT: movl 20(%ebp), %esi +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: movl %ecx, %edx ; X86-NEXT: shrl %edx +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NEXT: shldl $31, %eax, %ecx +; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NEXT: shll $31, %eax -; X86-NEXT: movl %esp, %esi -; X86-NEXT: pushl $0 -; X86-NEXT: pushl $0 -; X86-NEXT: pushl 20(%ebp) -; X86-NEXT: pushl 16(%ebp) -; X86-NEXT: pushl $0 -; X86-NEXT: pushl %edx -; X86-NEXT: pushl %ecx -; X86-NEXT: pushl %eax -; X86-NEXT: pushl %esi +; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, (%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: calll __udivti3 -; X86-NEXT: addl $32, %esp +; X86-NEXT: subl $4, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: orl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl $-1, %eax ; X86-NEXT: movl $-1, %edx ; X86-NEXT: jne .LBB4_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl (%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: 
.LBB4_2: ; X86-NEXT: leal -4(%ebp), %esp diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll index f589d4a7b04a9..7ef859978cdbf 100644 --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -232,31 +232,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: test_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl %ebx, %edx -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ebx +; X86-NEXT: movl 44(%ebp), %edi +; X86-NEXT: cmpl 24(%ebp), %ebx +; X86-NEXT: movl %edi, %esi +; X86-NEXT: sbbl 28(%ebp), %esi +; X86-NEXT: movl 48(%ebp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: sbbl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %esi +; X86-NEXT: movl 52(%ebp), %ecx ; X86-NEXT: movl %ecx, %eax -; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovbl %ebx, %edx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %edi -; X86-NEXT: cmovbl %ebp, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbl %esi, %eax +; X86-NEXT: cmovbl 24(%ebp), %ebx +; X86-NEXT: cmovbl 28(%ebp), %edi +; X86-NEXT: cmovbl 32(%ebp), %edx +; X86-NEXT: cmovbl %esi, %ecx +; X86-NEXT: movl 8(%ebp), %eax ; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl 
%edi ; X86-NEXT: popl %ebx @@ -282,37 +285,40 @@ define i128 @test_i128_1(i128 %a) nounwind { ; X86-LABEL: test_i128_1: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 32(%ebp), %edx +; X86-NEXT: movl 24(%ebp), %eax ; X86-NEXT: cmpl $1, %eax -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: testl %edx, %edx -; X86-NEXT: movl $1, %edi -; X86-NEXT: cmovnel %eax, %edi -; X86-NEXT: cmovel %ebx, %edi -; X86-NEXT: xorl %ebx, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: negl %ebp -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl $1, %ebp -; X86-NEXT: cmovbl %eax, %ebp -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: cmovbl %edx, %ebx -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: cmovel %edi, %ebp -; X86-NEXT: cmovel %edx, %ebx -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: movl %ebp, (%eax) +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: cmpl $0, 28(%ebp) +; X86-NEXT: movl $1, %esi +; X86-NEXT: cmovnel %eax, %esi +; X86-NEXT: cmovel %ecx, %esi +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: movl %edx, %ecx +; X86-NEXT: negl %ecx +; X86-NEXT: movl 36(%ebp), %ecx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ecx, %ebx +; X86-NEXT: movl $1, %ebx +; X86-NEXT: cmovbl %eax, %ebx +; X86-NEXT: cmovbl 28(%ebp), %edi +; X86-NEXT: movl %edx, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: cmovel %esi, %ebx +; X86-NEXT: cmovel 28(%ebp), %edi +; X86-NEXT: movl %edi, 4(%eax) +; X86-NEXT: movl %ebx, (%eax) +; X86-NEXT: 
leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -1312,29 +1318,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; ; X86-LABEL: test_signbits_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: shrdl $28, %edi, %ecx -; X86-NEXT: sarl $28, %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: shrdl $28, %edx, %ecx +; X86-NEXT: sarl $28, %edx ; X86-NEXT: cmpl %esi, %ecx -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %edx, %ebx +; X86-NEXT: movl %edx, %edi +; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: cmovbl %esi, %ecx -; X86-NEXT: cmovbl %edx, %edi -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: sarl $31, %edi +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %ax = ashr i128 %a, 64 %bx = ashr i128 %b, 92 diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll index 7a5cdbb9ce758..c927abf3a4263 100644 --- a/llvm/test/CodeGen/X86/umin.ll +++ b/llvm/test/CodeGen/X86/umin.ll @@ -147,32 +147,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind { ; X86-LABEL: test_i128: ; X86: # %bb.0: ; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi 
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl %edx, %ebx -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: sbbl %ecx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: subl $16, %esp +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl 44(%ebp), %edx +; X86-NEXT: movl 28(%ebp), %esi +; X86-NEXT: cmpl %ecx, 24(%ebp) +; X86-NEXT: sbbl %edx, %esi +; X86-NEXT: movl 48(%ebp), %esi +; X86-NEXT: movl 32(%ebp), %ebx +; X86-NEXT: sbbl %esi, %ebx +; X86-NEXT: movl 52(%ebp), %ebx +; X86-NEXT: movl 36(%ebp), %edi ; X86-NEXT: movl %edi, %eax -; X86-NEXT: sbbl %ebp, %eax -; X86-NEXT: cmovbl %ebx, %edx -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmovbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmovbl %edi, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ebp, 12(%eax) -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %esi, 4(%eax) -; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: sbbl %ebx, %eax +; X86-NEXT: cmovbl 24(%ebp), %ecx +; X86-NEXT: cmovbl 28(%ebp), %edx +; X86-NEXT: cmovbl 32(%ebp), %esi +; X86-NEXT: cmovbl %edi, %ebx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -727,29 +729,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind { ; ; X86-LABEL: test_signbits_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %ebp +; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), 
%edx -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: shrdl $28, %edi, %ecx -; X86-NEXT: sarl $28, %edi +; X86-NEXT: andl $-16, %esp +; X86-NEXT: movl 32(%ebp), %esi +; X86-NEXT: movl 36(%ebp), %eax +; X86-NEXT: movl 48(%ebp), %ecx +; X86-NEXT: movl 52(%ebp), %edx +; X86-NEXT: shrdl $28, %edx, %ecx +; X86-NEXT: sarl $28, %edx ; X86-NEXT: cmpl %ecx, %esi -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: sbbl %edi, %ebx +; X86-NEXT: movl %eax, %edi +; X86-NEXT: sbbl %edx, %edi ; X86-NEXT: cmovbl %esi, %ecx -; X86-NEXT: cmovbl %edx, %edi -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: sarl $31, %edi +; X86-NEXT: cmovbl %eax, %edx +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl %edi, 8(%eax) +; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: leal -8(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp ; X86-NEXT: retl $4 %ax = ashr i128 %a, 64 %bx = ashr i128 %b, 92 diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll index 4c3170304b980..89afd1b00444b 100644 --- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll @@ -38,8 +38,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: pushl %esi ; X86-NEXT: .cfi_def_cfa_offset 20 -; X86-NEXT: subl $24, %esp -; X86-NEXT: .cfi_def_cfa_offset 44 +; X86-NEXT: subl $28, %esp +; X86-NEXT: .cfi_def_cfa_offset 48 ; X86-NEXT: .cfi_offset %esi, -20 ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 @@ -147,7 +147,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; X86-NEXT: andb $1, %al ; X86-NEXT: movb %al, 16(%ecx) ; X86-NEXT: movl %ecx, %eax -; 
X86-NEXT: addl $24, %esp +; X86-NEXT: addl $28, %esp ; X86-NEXT: .cfi_def_cfa_offset 20 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 diff --git a/llvm/test/CodeGen/X86/wide-integer-cmp.ll b/llvm/test/CodeGen/X86/wide-integer-cmp.ll index a15d633d85381..12dccca76eb19 100644 --- a/llvm/test/CodeGen/X86/wide-integer-cmp.ll +++ b/llvm/test/CodeGen/X86/wide-integer-cmp.ll @@ -92,6 +92,8 @@ define i32 @test_wide(i128 %a, i128 %b) { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %esi ; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: subl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset %esi, -8 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx @@ -101,15 +103,15 @@ define i32 @test_wide(i128 %a, i128 %b) { ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: sbbl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: jge .LBB4_2 +; CHECK-NEXT: jge .LBB4_3 ; CHECK-NEXT: # %bb.1: # %bb1 ; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: popl %esi -; CHECK-NEXT: .cfi_def_cfa_offset 4 -; CHECK-NEXT: retl -; CHECK-NEXT: .LBB4_2: # %bb2 -; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: jmp .LBB4_2 +; CHECK-NEXT: .LBB4_3: # %bb2 ; CHECK-NEXT: movl $2, %eax +; CHECK-NEXT: .LBB4_2: # %bb1 +; CHECK-NEXT: addl $8, %esp +; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: popl %esi ; CHECK-NEXT: .cfi_def_cfa_offset 4 ; CHECK-NEXT: retl From 28e1e7e1b4b059a2e42f68061475cddb4ad0a6a3 Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Thu, 17 Jul 2025 12:32:49 +0300 Subject: [PATCH 156/813] Revert "[Clang] Do not treat Foo -> const Foo conversion sequences as perfect" (#149272) Reverts llvm/llvm-project#148613 Considering object argument conversion qualifications perfect leads to situations where we prefer a non-template const qualified function over a non-qualified template function, which is very wrong indeed. 
I explored solutions to work around that, but instead, we might want to go the GCC road and prefer the friend overload in the #147374 example, as this seems a lot more consistent and reliable --- clang/include/clang/Sema/Overload.h | 11 ++------ clang/lib/Sema/SemaOverload.cpp | 14 +--------- ...overload-resolution-deferred-templates.cpp | 28 ------------------- 3 files changed, 4 insertions(+), 49 deletions(-) diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index 9135ff949eeab..a70335bef9dd4 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -350,11 +350,6 @@ class Sema; LLVM_PREFERRED_TYPE(bool) unsigned BindsToRvalue : 1; - /// Whether this was an identity conversion with qualification - /// conversion for the implicit object argument. - LLVM_PREFERRED_TYPE(bool) - unsigned IsImplicitObjectArgumentQualificationConversion : 1; - /// Whether this binds an implicit object argument to a /// non-static member function without a ref-qualifier. 
LLVM_PREFERRED_TYPE(bool) @@ -453,11 +448,11 @@ class Sema; #endif return true; } + if (!C.hasSameType(getFromType(), getToType(2))) + return false; if (BindsToRvalue && IsLvalueReference) return false; - if (IsImplicitObjectArgumentQualificationConversion) - return C.hasSameUnqualifiedType(getFromType(), getToType(2)); - return C.hasSameType(getFromType(), getToType(2)); + return true; } ImplicitConversionRank getRank() const; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index f3baf0c3ef3bc..1b54628c5e564 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -245,7 +245,6 @@ void StandardConversionSequence::setAsIdentityConversion() { IsLvalueReference = true; BindsToFunctionLvalue = false; BindsToRvalue = false; - IsImplicitObjectArgumentQualificationConversion = false; BindsImplicitObjectArgumentWithoutRefQualifier = false; ObjCLifetimeConversionBinding = false; FromBracedInitList = false; @@ -5318,7 +5317,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType, ICS.Standard.DirectBinding = BindsDirectly; ICS.Standard.IsLvalueReference = !isRValRef; ICS.Standard.BindsToFunctionLvalue = T2->isFunctionType(); - ICS.Standard.IsImplicitObjectArgumentQualificationConversion = false; ICS.Standard.BindsToRvalue = InitCategory.isRValue(); ICS.Standard.BindsImplicitObjectArgumentWithoutRefQualifier = false; ICS.Standard.ObjCLifetimeConversionBinding = @@ -5498,7 +5496,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType, ICS.Standard.IsLvalueReference = !isRValRef; ICS.Standard.BindsToFunctionLvalue = false; ICS.Standard.BindsToRvalue = true; - ICS.Standard.IsImplicitObjectArgumentQualificationConversion = false; ICS.Standard.BindsImplicitObjectArgumentWithoutRefQualifier = false; ICS.Standard.ObjCLifetimeConversionBinding = false; } else if (ICS.isUserDefined()) { @@ -5521,8 +5518,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType, ICS.UserDefined.After.IsLvalueReference = 
!isRValRef; ICS.UserDefined.After.BindsToFunctionLvalue = false; ICS.UserDefined.After.BindsToRvalue = !LValRefType; - ICS.UserDefined.After.IsImplicitObjectArgumentQualificationConversion = - false; ICS.UserDefined.After.BindsImplicitObjectArgumentWithoutRefQualifier = false; ICS.UserDefined.After.ObjCLifetimeConversionBinding = false; ICS.UserDefined.After.FromBracedInitList = false; @@ -5807,7 +5802,6 @@ TryListConversion(Sema &S, InitListExpr *From, QualType ToType, StandardConversionSequence &SCS = Result.isStandard() ? Result.Standard : Result.UserDefined.After; SCS.ReferenceBinding = true; - SCS.IsImplicitObjectArgumentQualificationConversion = false; SCS.IsLvalueReference = ToType->isLValueReferenceType(); SCS.BindsToRvalue = true; SCS.BindsToFunctionLvalue = false; @@ -6005,12 +5999,8 @@ static ImplicitConversionSequence TryObjectArgumentInitialization( // affects the conversion rank. QualType ClassTypeCanon = S.Context.getCanonicalType(ClassType); ImplicitConversionKind SecondKind; - bool IsQualificationConversion = false; - if (ImplicitParamType.getCanonicalType() == FromTypeCanon) { + if (ClassTypeCanon == FromTypeCanon.getLocalUnqualifiedType()) { SecondKind = ICK_Identity; - } else if (ClassTypeCanon == FromTypeCanon.getLocalUnqualifiedType()) { - SecondKind = ICK_Identity; - IsQualificationConversion = true; } else if (S.IsDerivedFrom(Loc, FromType, ClassType)) { SecondKind = ICK_Derived_To_Base; } else if (!Method->isExplicitObjectMemberFunction()) { @@ -6051,8 +6041,6 @@ static ImplicitConversionSequence TryObjectArgumentInitialization( ICS.Standard.setFromType(FromType); ICS.Standard.setAllToTypes(ImplicitParamType); ICS.Standard.ReferenceBinding = true; - ICS.Standard.IsImplicitObjectArgumentQualificationConversion = - IsQualificationConversion; ICS.Standard.DirectBinding = true; ICS.Standard.IsLvalueReference = Method->getRefQualifier() != RQ_RValue; ICS.Standard.BindsToFunctionLvalue = false; diff --git 
a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp index 135865c8450f5..46c3670848529 100644 --- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp +++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp @@ -283,31 +283,3 @@ void f() { } #endif - -namespace GH147374 { - -struct String {}; -template void operator+(T, String &&) = delete; - -struct Bar { - void operator+(String) const; // expected-note {{candidate function}} - friend void operator+(Bar, String) {}; // expected-note {{candidate function}} -}; - -struct Baz { - void operator+(String); // expected-note {{candidate function}} - friend void operator+(Baz, String) {}; // expected-note {{candidate function}} -}; - -void test() { - Bar a; - String b; - a + b; - //expected-error@-1 {{use of overloaded operator '+' is ambiguous (with operand types 'Bar' and 'String')}} - - Baz z; - z + b; - //expected-error@-1 {{use of overloaded operator '+' is ambiguous (with operand types 'Baz' and 'String')}} -} - -} From 2cdcc4f2c6a0d36a5b534e16d5892ff8b03f3c88 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 17 Jul 2025 10:42:34 +0100 Subject: [PATCH 157/813] [VPlan] Allow cloning of VPWidenRecipe without underlying instr (NFC). Update VPWidenRecipe::clone() to use the constructor w/o mandatory Instruction, to facilitate cloning VPWidenRecipe without underlying instructions. Split off from https://github.com/llvm/llvm-project/pull/148239. 
--- llvm/lib/Transforms/Vectorize/VPlan.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 703cfe969577d..204268e586b43 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1357,9 +1357,10 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, public: VPWidenRecipe(unsigned Opcode, ArrayRef Operands, - const VPIRFlags &Flags, DebugLoc DL) + const VPIRFlags &Flags, const VPIRMetadata &Metadata, + DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL), - Opcode(Opcode) {} + VPIRMetadata(Metadata), Opcode(Opcode) {} VPWidenRecipe(Instruction &I, ArrayRef Operands) : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPIRMetadata(I), @@ -1368,8 +1369,9 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, ~VPWidenRecipe() override = default; VPWidenRecipe *clone() override { - auto *R = new VPWidenRecipe(*getUnderlyingInstr(), operands()); - R->transferFlags(*this); + auto *R = + new VPWidenRecipe(getOpcode(), operands(), *this, *this, getDebugLoc()); + R->setUnderlyingValue(getUnderlyingValue()); return R; } From 4797a6c4e8244ab06829b2e462b1329e94286dbf Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 17 Jul 2025 10:37:41 +0100 Subject: [PATCH 158/813] [lldb][test] TestChildCountTruncation.test: add missing command --- lldb/test/Shell/Settings/TestChildCountTruncation.test | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test index 3b75498297b05..a96a0d8310eeb 100644 --- a/lldb/test/Shell/Settings/TestChildCountTruncation.test +++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test @@ -50,11 +50,12 @@ EXPR-NOT: *** Some of the displayed variables have more members run frame variable arr +dwim-print arr VAR: (lldb) frame 
variable arr VAR: *** Some of the displayed variables have more members VAR-SAME: use the --show-all-children option to frame variable -VAR: (lldb) frame variable arr +VAR: (lldb) dwim-print arr VAR-NOT: *** Some of the displayed variables have more members #--- with-setting-commands.input From efa5063ba7a7151056439b70901219311c531cec Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Thu, 17 Jul 2025 17:52:12 +0800 Subject: [PATCH 159/813] [LoongArch] Optimize inserting element to high part of 256bits vector (#146816) --- .../LoongArch/LoongArchISelLowering.cpp | 5 +- .../CodeGen/LoongArch/lasx/build-vector.ll | 154 ++++++++---------- .../lasx/ir-instruction/insertelement.ll | 6 +- 3 files changed, 69 insertions(+), 96 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 39a1d542dd309..2378664ca8155 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6000,10 +6000,9 @@ emitPseudoXVINSGR2VR(MachineInstr &MI, MachineBasicBlock *BB, Register ScratchReg1 = XSrc; if (Idx >= HalfSize) { ScratchReg1 = MRI.createVirtualRegister(RC); - BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_Q), ScratchReg1) + BuildMI(*BB, MI, DL, TII->get(LoongArch::XVPERMI_D), ScratchReg1) .addReg(XSrc) - .addReg(XSrc) - .addImm(1); + .addImm(14); } Register ScratchSubReg1 = MRI.createVirtualRegister(SubRC); diff --git a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll index b06f6523e977c..f25e988b52dc9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/build-vector.ll @@ -250,84 +250,68 @@ define void @buildvector_v32i8(ptr %dst, i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, ; CHECK-NEXT: vinsgr2vr.b $vr0, $a2, 14 ; CHECK-NEXT: ld.b $a1, $sp, 72 ; CHECK-NEXT: vinsgr2vr.b $vr0, $a3, 15 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, 
$xr0, 1 +; CHECK-NEXT: ld.b $a2, $sp, 80 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 0 -; CHECK-NEXT: ld.b $a1, $sp, 80 -; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1 ; CHECK-NEXT: ld.b $a1, $sp, 88 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2 -; CHECK-NEXT: ld.b $a1, $sp, 96 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 1 +; CHECK-NEXT: ld.b $a2, $sp, 96 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 3 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2 ; CHECK-NEXT: ld.b $a1, $sp, 104 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4 -; CHECK-NEXT: ld.b $a1, $sp, 112 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 3 +; CHECK-NEXT: ld.b $a2, $sp, 112 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 5 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4 ; CHECK-NEXT: ld.b $a1, $sp, 120 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6 -; CHECK-NEXT: ld.b $a1, $sp, 128 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 5 +; CHECK-NEXT: ld.b $a2, $sp, 128 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 7 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b 
$vr1, $a1, 6 ; CHECK-NEXT: ld.b $a1, $sp, 136 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8 -; CHECK-NEXT: ld.b $a1, $sp, 144 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 7 +; CHECK-NEXT: ld.b $a2, $sp, 144 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 9 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8 ; CHECK-NEXT: ld.b $a1, $sp, 152 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10 -; CHECK-NEXT: ld.b $a1, $sp, 160 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 9 +; CHECK-NEXT: ld.b $a2, $sp, 160 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 11 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10 ; CHECK-NEXT: ld.b $a1, $sp, 168 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12 -; CHECK-NEXT: ld.b $a1, $sp, 176 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 11 +; CHECK-NEXT: ld.b $a2, $sp, 176 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 13 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12 ; CHECK-NEXT: ld.b $a1, $sp, 184 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 13 +; CHECK-NEXT: ld.b $a2, $sp, 192 +; 
CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 14 -; CHECK-NEXT: ld.b $a1, $sp, 192 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 15 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 15 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret @@ -371,8 +355,15 @@ entry: define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind { ; CHECK-LABEL: buildvector_v16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: ld.h $t0, $sp, 8 -; CHECK-NEXT: ld.h $t1, $sp, 0 +; CHECK-NEXT: ld.h $t0, $sp, 64 +; CHECK-NEXT: ld.h $t1, $sp, 56 +; CHECK-NEXT: ld.h $t2, $sp, 48 +; CHECK-NEXT: ld.h $t3, $sp, 40 +; CHECK-NEXT: ld.h $t4, $sp, 32 +; CHECK-NEXT: ld.h $t5, $sp, 24 +; CHECK-NEXT: ld.h $t6, $sp, 16 +; CHECK-NEXT: ld.h $t7, $sp, 8 +; CHECK-NEXT: ld.h $t8, $sp, 0 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a1, 0 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a2, 1 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a3, 2 @@ -380,45 +371,30 @@ define void @buildvector_v16i16(ptr %dst, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i1 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a5, 4 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a6, 5 ; CHECK-NEXT: vinsgr2vr.h $vr0, $a7, 6 -; CHECK-NEXT: vinsgr2vr.h $vr0, $t1, 7 -; CHECK-NEXT: ld.h $a1, $sp, 16 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $t0, 0 +; CHECK-NEXT: vinsgr2vr.h $vr0, $t8, 7 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t7, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a2, $sp, 24 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 
+; CHECK-NEXT: vinsgr2vr.h $vr1, $t6, 1 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a1, $sp, 32 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 2 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t5, 2 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a2, $sp, 40 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t4, 3 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a1, $sp, 48 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 4 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t3, 4 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a2, $sp, 56 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t2, 5 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: ld.h $a1, $sp, 64 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 6 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t1, 6 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 -; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 +; CHECK-NEXT: vinsgr2vr.h $vr1, $t0, 7 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll index 25106b456d2f7..3a4f6efd2c893 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll +++ 
b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll @@ -18,8 +18,7 @@ define void @insert_32xi8_upper(ptr %src, ptr %dst, i8 %in) nounwind { ; CHECK-LABEL: insert_32xi8_upper: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.b $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a1, 0 @@ -47,8 +46,7 @@ define void @insert_16xi16_upper(ptr %src, ptr %dst, i16 %in) nounwind { ; CHECK-LABEL: insert_16xi16_upper: ; CHECK: # %bb.0: ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvori.b $xr1, $xr0, 0 -; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 14 ; CHECK-NEXT: vinsgr2vr.h $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 ; CHECK-NEXT: xvst $xr0, $a1, 0 From f04650bb799ce867f629a7d564e03057e8d9b4b0 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 17 Jul 2025 19:08:22 +0900 Subject: [PATCH 160/813] LoongArch: Add test for llvm.exp10 intrinsic (#148606) --- llvm/test/CodeGen/LoongArch/llvm.exp10.ll | 362 ++++++++++++++++++++++ 1 file changed, 362 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/llvm.exp10.ll diff --git a/llvm/test/CodeGen/LoongArch/llvm.exp10.ll b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll new file mode 100644 index 0000000000000..7a52531daa802 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/llvm.exp10.ll @@ -0,0 +1,362 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=loongarch32 -mattr=+d < %s | FileCheck -check-prefix=LA32 %s +; RUN: llc -mtriple=loongarch64 -mattr=+d < %s | FileCheck -check-prefix=LA64 %s + +define half @exp10_f16(half %x) #0 { +; LA32-LABEL: exp10_f16: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: bl __extendhfsf2 +; LA32-NEXT: bl exp10f +; LA32-NEXT: bl 
__truncsfhf2 +; LA32-NEXT: movfr2gr.s $a0, $fa0 +; LA32-NEXT: lu12i.w $a1, -16 +; LA32-NEXT: or $a0, $a0, $a1 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_f16: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(__extendhfsf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfhf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: lu12i.w $a1, -16 +; LA64-NEXT: or $a0, $a0, $a1 +; LA64-NEXT: movgr2fr.w $fa0, $a0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret + %r = call half @llvm.exp10.f16(half %x) + ret half %r +} + +define <2 x half> @exp10_v2f16(<2 x half> %x) #0 { +; LA32-LABEL: exp10_v2f16: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill +; LA32-NEXT: movgr2fr.w $fs0, $a1 +; LA32-NEXT: movgr2fr.w $fa0, $a0 +; LA32-NEXT: bl __extendhfsf2 +; LA32-NEXT: bl exp10f +; LA32-NEXT: bl __truncsfhf2 +; LA32-NEXT: movfr2gr.s $fp, $fa0 +; LA32-NEXT: fmov.s $fa0, $fs0 +; LA32-NEXT: bl __extendhfsf2 +; LA32-NEXT: bl exp10f +; LA32-NEXT: bl __truncsfhf2 +; LA32-NEXT: movfr2gr.s $a1, $fa0 +; LA32-NEXT: move $a0, $fp +; LA32-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_v2f16: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 16 # 8-byte Folded 
Spill +; LA64-NEXT: move $fp, $a0 +; LA64-NEXT: movgr2fr.w $fa0, $a1 +; LA64-NEXT: pcaddu18i $ra, %call36(__extendhfsf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfhf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: st.h $a0, $sp, 2 +; LA64-NEXT: movgr2fr.w $fa0, $fp +; LA64-NEXT: pcaddu18i $ra, %call36(__extendhfsf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: pcaddu18i $ra, %call36(__truncsfhf2) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.s $a0, $fa0 +; LA64-NEXT: st.h $a0, $sp, 0 +; LA64-NEXT: vld $vr0, $sp, 0 +; LA64-NEXT: vpickve2gr.h $a0, $vr0, 0 +; LA64-NEXT: vpickve2gr.h $a1, $vr0, 1 +; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret + %r = call <2 x half> @llvm.exp10.v2f16(<2 x half> %x) + ret <2 x half> %r +} + +define float @exp10_f32(float %x) #0 { +; LA32-LABEL: exp10_f32: +; LA32: # %bb.0: +; LA32-NEXT: b exp10f +; +; LA64-LABEL: exp10_f32: +; LA64: # %bb.0: +; LA64-NEXT: pcaddu18i $t8, %call36(exp10f) +; LA64-NEXT: jr $t8 + %r = call float @llvm.exp10.f32(float %x) + ret float %r +} + +define <2 x float> @exp10_v2f32(<2 x float> %x) #0 { +; LA32-LABEL: exp10_v2f32: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill +; LA32-NEXT: fst.d $fs1, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: fmov.s $fs0, $fa1 +; LA32-NEXT: bl exp10f +; LA32-NEXT: fmov.s $fs1, $fa0 +; LA32-NEXT: fmov.s $fa0, $fs0 +; LA32-NEXT: bl exp10f +; LA32-NEXT: fmov.s $fa1, $fa0 +; LA32-NEXT: fmov.s $fa0, $fs1 +; LA32-NEXT: fld.d $fs1, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload +; LA32-NEXT: ld.w 
$ra, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_v2f32: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vreplvei.w $vr0, $vr0, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vreplvei.w $vr0, $vr0, 1 +; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10f) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 +; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vpackev.w $vr0, $vr0, $vr1 +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret + %r = call <2 x float> @llvm.exp10.v2f32(<2 x float> %x) + ret <2 x float> %r +} + +define double @exp10_f64(double %x) #0 { +; LA32-LABEL: exp10_f64: +; LA32: # %bb.0: +; LA32-NEXT: b exp10 +; +; LA64-LABEL: exp10_f64: +; LA64: # %bb.0: +; LA64-NEXT: pcaddu18i $t8, %call36(exp10) +; LA64-NEXT: jr $t8 + %r = call double @llvm.exp10.f64(double %x) + ret double %r +} + +define <2 x double> @exp10_v2f64(<2 x double> %x) #0 { +; LA32-LABEL: exp10_v2f64: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill +; LA32-NEXT: fst.d $fs1, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: fmov.d $fs0, $fa1 +; LA32-NEXT: bl exp10 +; LA32-NEXT: fmov.d $fs1, $fa0 +; LA32-NEXT: fmov.d $fa0, $fs0 +; LA32-NEXT: bl exp10 +; LA32-NEXT: fmov.d $fa1, $fa0 +; LA32-NEXT: fmov.d $fa0, $fs1 +; LA32-NEXT: fld.d $fs1, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: fld.d $fs0, $sp, 16 
# 8-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_v2f64: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.d $a0, $fa0 +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vreplvei.d $vr0, $vr0, 1 +; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: movfr2gr.d $a0, $fa0 +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 1 +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; LA64-NEXT: ret + %r = call <2 x double> @llvm.exp10.v2f64(<2 x double> %x) + ret <2 x double> %r +} + +define fp128 @exp10_f128(fp128 %x) #0 { +; LA32-LABEL: exp10_f128: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -48 +; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32-NEXT: ld.w $a2, $a1, 0 +; LA32-NEXT: ld.w $a3, $a1, 4 +; LA32-NEXT: ld.w $a4, $a1, 8 +; LA32-NEXT: ld.w $a1, $a1, 12 +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: st.w $a1, $sp, 20 +; LA32-NEXT: st.w $a4, $sp, 16 +; LA32-NEXT: st.w $a3, $sp, 12 +; LA32-NEXT: addi.w $a0, $sp, 24 +; LA32-NEXT: addi.w $a1, $sp, 8 +; LA32-NEXT: st.w $a2, $sp, 8 +; LA32-NEXT: bl exp10l +; LA32-NEXT: ld.w $a0, $sp, 36 +; LA32-NEXT: ld.w $a1, $sp, 32 +; LA32-NEXT: ld.w $a2, $sp, 28 +; LA32-NEXT: ld.w $a3, $sp, 24 +; LA32-NEXT: st.w $a0, $fp, 12 +; LA32-NEXT: st.w $a1, $fp, 8 +; LA32-NEXT: st.w $a2, $fp, 4 
+; LA32-NEXT: st.w $a3, $fp, 0 +; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 48 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_f128: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: pcaddu18i $ra, %call36(exp10l) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret + %r = call fp128 @llvm.exp10.f128(fp128 %x) + ret fp128 %r +} + +define <2 x fp128> @exp10_v2f128(<2 x fp128> %x) #0 { +; LA32-LABEL: exp10_v2f128: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -96 +; LA32-NEXT: st.w $ra, $sp, 92 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 88 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 84 # 4-byte Folded Spill +; LA32-NEXT: st.w $s1, $sp, 80 # 4-byte Folded Spill +; LA32-NEXT: st.w $s2, $sp, 76 # 4-byte Folded Spill +; LA32-NEXT: st.w $s3, $sp, 72 # 4-byte Folded Spill +; LA32-NEXT: ld.w $s0, $a1, 16 +; LA32-NEXT: ld.w $s1, $a1, 20 +; LA32-NEXT: ld.w $s2, $a1, 24 +; LA32-NEXT: ld.w $s3, $a1, 28 +; LA32-NEXT: ld.w $a2, $a1, 0 +; LA32-NEXT: ld.w $a3, $a1, 4 +; LA32-NEXT: ld.w $a4, $a1, 8 +; LA32-NEXT: ld.w $a1, $a1, 12 +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: st.w $a1, $sp, 20 +; LA32-NEXT: st.w $a4, $sp, 16 +; LA32-NEXT: st.w $a3, $sp, 12 +; LA32-NEXT: addi.w $a0, $sp, 24 +; LA32-NEXT: addi.w $a1, $sp, 8 +; LA32-NEXT: st.w $a2, $sp, 8 +; LA32-NEXT: bl exp10l +; LA32-NEXT: st.w $s3, $sp, 52 +; LA32-NEXT: st.w $s2, $sp, 48 +; LA32-NEXT: st.w $s1, $sp, 44 +; LA32-NEXT: addi.w $a0, $sp, 56 +; LA32-NEXT: addi.w $a1, $sp, 40 +; LA32-NEXT: st.w $s0, $sp, 40 +; LA32-NEXT: bl exp10l +; LA32-NEXT: ld.w $a0, $sp, 24 +; LA32-NEXT: ld.w $a1, $sp, 28 +; LA32-NEXT: ld.w $a2, $sp, 32 +; LA32-NEXT: ld.w $a3, $sp, 36 +; LA32-NEXT: ld.w $a4, $sp, 68 +; LA32-NEXT: ld.w $a5, $sp, 64 +; LA32-NEXT: ld.w $a6, $sp, 60 +; LA32-NEXT: ld.w 
$a7, $sp, 56 +; LA32-NEXT: st.w $a4, $fp, 28 +; LA32-NEXT: st.w $a5, $fp, 24 +; LA32-NEXT: st.w $a6, $fp, 20 +; LA32-NEXT: st.w $a7, $fp, 16 +; LA32-NEXT: st.w $a3, $fp, 12 +; LA32-NEXT: st.w $a2, $fp, 8 +; LA32-NEXT: st.w $a1, $fp, 4 +; LA32-NEXT: st.w $a0, $fp, 0 +; LA32-NEXT: ld.w $s3, $sp, 72 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s2, $sp, 76 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s1, $sp, 80 # 4-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 84 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 88 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 92 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 96 +; LA32-NEXT: ret +; +; LA64-LABEL: exp10_v2f128: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -48 +; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $s3, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: ld.d $fp, $a1, 16 +; LA64-NEXT: ld.d $s0, $a1, 24 +; LA64-NEXT: ld.d $a2, $a1, 0 +; LA64-NEXT: ld.d $a1, $a1, 8 +; LA64-NEXT: move $s1, $a0 +; LA64-NEXT: move $a0, $a2 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10l) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: move $s2, $a0 +; LA64-NEXT: move $s3, $a1 +; LA64-NEXT: move $a0, $fp +; LA64-NEXT: move $a1, $s0 +; LA64-NEXT: pcaddu18i $ra, %call36(exp10l) +; LA64-NEXT: jirl $ra, $ra, 0 +; LA64-NEXT: st.d $a1, $s1, 24 +; LA64-NEXT: st.d $a0, $s1, 16 +; LA64-NEXT: st.d $s3, $s1, 8 +; LA64-NEXT: st.d $s2, $s1, 0 +; LA64-NEXT: ld.d $s3, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 48 +; 
LA64-NEXT: ret + %r = call <2 x fp128> @llvm.exp10.v2f128(<2 x fp128> %x) + ret <2 x fp128> %r +} + +attributes #0 = { nounwind } From b1fca543f7c34012697afd99c3dfe1306aa2acab Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 17 Jul 2025 11:20:47 +0100 Subject: [PATCH 161/813] [LLVM][AArch64ExpandPseudo] Preserve undef flags when expanding SVE 1/2/3-op pseudo instructions. (#149104) Fixes https://github.com/llvm/llvm-project/issues/149034 --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 18 +++-- .../AArch64/sve-pseudos-expand-undef.mir | 81 +++++++++++++++++++ 2 files changed, 93 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 7de66ccbf6f29..12fc976a70ea7 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -598,6 +598,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( llvm_unreachable("Unsupported ElementSize"); } + // Preserve undef state until DOP's reg is defined. + unsigned DOPRegState = MI.getOperand(DOPIdx).isUndef() ? RegState::Undef : 0; + // // Create the destructive operation (if required) // @@ -616,10 +619,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero)) .addReg(DstReg, RegState::Define) .addReg(MI.getOperand(PredIdx).getReg()) - .addReg(MI.getOperand(DOPIdx).getReg()); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState); // After the movprfx, the destructive operand is same as Dst DOPIdx = 0; + DOPRegState = 0; // Create the additional LSL to zero the lanes when the DstReg is not // unique. 
Zeros the lanes in z0 that aren't active in p0 with sequence @@ -638,8 +642,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( assert(DOPRegIsUnique && "The destructive operand should be unique"); PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx)) .addReg(DstReg, RegState::Define) - .addReg(MI.getOperand(DOPIdx).getReg()); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState); DOPIdx = 0; + DOPRegState = 0; } // @@ -647,10 +652,11 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( // DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); + DOPRegState = DOPRegState | RegState::Kill; switch (DType) { case AArch64::DestructiveUnaryPassthru: - DOP.addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + DOP.addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) .add(MI.getOperand(PredIdx)) .add(MI.getOperand(SrcIdx)); break; @@ -659,12 +665,12 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( case AArch64::DestructiveBinaryComm: case AArch64::DestructiveBinaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) - .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) - .add(MI.getOperand(SrcIdx)); + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) + .add(MI.getOperand(SrcIdx)); break; case AArch64::DestructiveTernaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) - .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill) + .addReg(MI.getOperand(DOPIdx).getReg(), DOPRegState) .add(MI.getOperand(SrcIdx)) .add(MI.getOperand(Src2Idx)); break; diff --git a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir index a1d615c910792..c3c39f4d9cee2 100644 --- a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir +++ b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir @@ -54,3 +54,84 @@ body: | renamable $z0 = FADD_ZPZZ_D_UNDEF killed $p0, killed $z1, killed $z2, implicit-def $z0_z1_z2_z3 RET_ReallyLR implicit $z0_z1_z2_z3 ... 
+ +--- +name: unary_undef_operand +body: | + bb.0: + liveins: $p0, $z0 + + ; CHECK: name: unary_undef_operand + ; CHECK: $z0 = MOVPRFX_ZZ undef $z1 + ; CHECK: $z0 = ABS_ZPmZ_S internal killed $z0, renamable $p0, killed undef renamable $z1 + ; NOTE: Unary _UNDEF psuedo instructions ignore the passthru operand. + renamable $z0 = ABS_ZPmZ_S_UNDEF renamable $z0, renamable $p0, killed undef renamable $z1 + RET_ReallyLR + +... + +--- +name: binop_undef_operand +body: | + bb.0: + liveins: $p0, $z1 + + ; CHECK: name: binop_undef_operand + ; CHECK-NOT: MOVPRFX + ; CHECK: $z0 = SMIN_ZPmZ_S renamable $p0, killed undef $z0, killed renamable $z1 + renamable $z0 = SMIN_ZPZZ_S_UNDEF renamable $p0, undef renamable $z0, killed renamable $z1 + RET_ReallyLR + +... + +--- +name: binop_undef_operand_requires_movpfrx +body: | + bb.0: + liveins: $p0, $z1 + + ; CHECK: name: binop_undef_operand_requires_movpfrx + ; CHECK: $z0 = MOVPRFX_ZZ undef $z2 + ; CHECK: $z0 = SMIN_ZPmZ_S renamable $p0, internal killed $z0, killed renamable $z1 + renamable $z0 = SMIN_ZPZZ_S_UNDEF renamable $p0, undef renamable $z2, killed renamable $z1 + RET_ReallyLR + +... + +--- +name: binop_undef_operand_requires_zeroing_movpfrx +body: | + bb.0: + liveins: $p0, $z1 + + ; CHECK: name: binop_undef_operand_requires_zeroing_movpfrx + ; CHECK: $z0 = MOVPRFX_ZPzZ_S $p0, undef $z2 + ; CHECK: $z0 = ADD_ZPmZ_S renamable $p0, internal killed $z0, killed renamable $z1 + renamable $z0 = ADD_ZPZZ_S_ZERO renamable $p0, undef renamable $z2, killed renamable $z1 + RET_ReallyLR + +... + +--- +name: ternaryop_undef_operand +body: | + bb.0: + liveins: $p0, $z1, $z2 + ; CHECK: name: ternaryop_undef_operand + ; CHECK-NOT: MOVPRFX + ; CHECK: $z0 = MLA_ZPmZZ_B killed renamable $p0, killed undef $z0, killed renamable $z1, killed renamable $z2 + renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed undef renamable $z0, killed renamable $z1, killed renamable $z2 + RET_ReallyLR implicit $z0 +... 
+ +--- +name: ternaryop_undef_operand_requires_movprfx +body: | + bb.0: + liveins: $p0, $z1, $z2 + ; CHECK: name: ternaryop_undef_operand_requires_movprfx + ; CHECK: $z0 = MOVPRFX_ZZ undef $z3 + ; CHECK: $z0 = MLA_ZPmZZ_B killed renamable $p0, internal killed $z0, killed renamable $z1, killed renamable $z2 + renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed undef renamable $z3, killed renamable $z1, killed renamable $z2 + RET_ReallyLR implicit $z0 +... From b7c14b6ded300b9190fe0b65881d04c54b2a9fbd Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Thu, 17 Jul 2025 11:40:43 +0100 Subject: [PATCH 162/813] [Debugify] Add 'acceptance-test' mode for the debugify report script (#147574) For the purposes of setting up CI that makes use of debugify, this patch adds an alternative mode for the llvm-original-di-preservation.py script, which produces terminal-friendly(-ish) YAML output instead of an HTML report, and sets the return code to 1 if the input file contains errors, or 0 if the input file contains no errors or does not exist, making it simple to use it in CI. This introduces a small change in existing usage, in that the path for the HTML report file is now passed with `--report-file ` rather than as a positional argument; I could make the argparse logic work without this change, but I believe that is simpler to understand this way, and to my knowledge debugify isn't currently being used in automated environments where changing this might cause issues. As a small change while passing by, I also changed `-compress` to `--compress`, for consistency. As a note for reviewers, the reason that we treat a non-existent input file as a pass is that this is actually the expected state: we use clang to compile numerous files, passing a filepath for debugify errors. Any errors found by debugify will be written to this file; if none are found, the file is untouched. This is also mentioned in a code comment, but I think it useful to state upfront. 
Finally, the justification for adding a new mode to this script instead of adding a separate script for the separate functionality is that this script understands debugify's output, and performs some deduplication that is useful for clarifying the resulting output. Writing a new script would require duplicating logic unnecessarily, and risks the scripts falling out-of-sync if changes are made to debugify's output. --- llvm/docs/HowToUpdateDebugInfo.rst | 2 +- .../acceptance-test.test | 70 ++++++++ .../llvm-original-di-preservation/basic.test | 12 +- llvm/utils/llvm-original-di-preservation.py | 154 ++++++++++++++---- 4 files changed, 198 insertions(+), 40 deletions(-) create mode 100644 llvm/test/tools/llvm-original-di-preservation/acceptance-test.test diff --git a/llvm/docs/HowToUpdateDebugInfo.rst b/llvm/docs/HowToUpdateDebugInfo.rst index abe21c6794a8a..915e2896023c5 100644 --- a/llvm/docs/HowToUpdateDebugInfo.rst +++ b/llvm/docs/HowToUpdateDebugInfo.rst @@ -504,7 +504,7 @@ as follows: .. 
code-block:: bash - $ llvm-original-di-preservation.py sample.json sample.html + $ llvm-original-di-preservation.py sample.json --report-file sample.html Testing of original debug info preservation can be invoked from front-end level as follows: diff --git a/llvm/test/tools/llvm-original-di-preservation/acceptance-test.test b/llvm/test/tools/llvm-original-di-preservation/acceptance-test.test new file mode 100644 index 0000000000000..0b8c33d24396a --- /dev/null +++ b/llvm/test/tools/llvm-original-di-preservation/acceptance-test.test @@ -0,0 +1,70 @@ +RUN: not %llvm-original-di-preservation %p/Inputs/sample.json --acceptance-test | FileCheck %s +CHECK: DILocation Bugs: +CHECK-NEXT: test.ll: +CHECK-NEXT: no-name: +CHECK-NEXT: - action: not-generate +CHECK-NEXT: bb_name: no-name +CHECK-NEXT: fn_name: fn +CHECK-NEXT: instr: extractvalue +CHECK-NEXT: - action: not-generate +CHECK-NEXT: bb_name: no-name +CHECK-NEXT: fn_name: fn +CHECK-NEXT: instr: insertvalue +CHECK-NEXT: - action: not-generate +CHECK-NEXT: bb_name: no-name +CHECK-NEXT: fn_name: fn1 +CHECK-NEXT: instr: insertvalue +CHECK-NEXT: - action: not-generate +CHECK-NEXT: bb_name: no-name +CHECK-NEXT: fn_name: fn1 +CHECK-NEXT: instr: extractvalue +CHECK: Errors detected for: + +RUN: not %llvm-original-di-preservation %p/Inputs/sample.json --acceptance-test --reduce | FileCheck %s --check-prefix=COMPRESS +COMPRESS: DILocation Bugs: +COMPRESS-NEXT: test.ll: +COMPRESS-NEXT: no-name: +COMPRESS-NEXT: - action: not-generate +COMPRESS-NEXT: bb_name: no-name +COMPRESS-NEXT: fn_name: fn +COMPRESS-NEXT: instr: extractvalue +COMPRESS-NEXT: - action: not-generate +COMPRESS-NEXT: bb_name: no-name +COMPRESS-NEXT: fn_name: fn +COMPRESS-NEXT: instr: insertvalue +COMPRESS: Errors detected for: + +RUN: not %llvm-original-di-preservation %p/Inputs/origin.json --acceptance-test --reduce | FileCheck %s --check-prefix=ORIGIN +ORIGIN: DILocation Bugs: +ORIGIN-NEXT: test.ll: +ORIGIN-NEXT: LoopVectorizePass: +ORIGIN-NEXT: - action: 
not-generate +ORIGIN-NEXT: bb_name: no-name +ORIGIN-NEXT: fn_name: fn +ORIGIN-NEXT: instr: add +ORIGIN-NEXT: origin: | +ORIGIN-NEXT: Stack Trace 0: +ORIGIN-NEXT: #0 0x00005895d035c935 llvm::DbgLocOrigin::DbgLocOrigin(bool) /tmp/llvm-project/llvm/lib/IR/DebugLoc.cpp:22:9 +ORIGIN-NEXT: #1 0x00005895d03af013 llvm::DILocAndCoverageTracking::DILocAndCoverageTracking() /tmp/llvm-project/llvm/include/llvm/IR/DebugLoc.h:90:11 +ORIGIN-NEXT: #2 0x00005895d03af013 llvm::DebugLoc::DebugLoc() /tmp/llvm-project/llvm/include/llvm/IR/DebugLoc.h:133:5 +ORIGIN-NEXT: #3 0x00005895d03af013 llvm::Instruction::Instruction(llvm::Type*, unsigned int, llvm::User::AllocInfo, llvm::InsertPosition) /tmp/llvm-project/llvm/lib/IR/Instruction.cpp:37:14 +ORIGIN-NEXT: #4 0x00005895d06862b5 llvm::PHINode::PHINode(llvm::Type*, unsigned int, llvm::Twine const&, llvm::InsertPosition) /tmp/llvm-project/llvm/include/llvm/IR/Instructions.h:0:9 +ORIGIN-NEXT: #5 0x00005895d06862b5 llvm::PHINode::Create(llvm::Type*, unsigned int, llvm::Twine const&, llvm::InsertPosition) /tmp/llvm-project/llvm/include/llvm/IR/Instructions.h:2651:9 +ORIGIN-NEXT: #6 0x00005895d06862b5 llvm::InstCombinerImpl::foldPHIArgGEPIntoPHI(llvm::PHINode&) /tmp/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp:617:9 +ORIGIN-NEXT: #7 0x00005895d0688fe0 llvm::InstCombinerImpl::visitPHINode(llvm::PHINode&) /tmp/llvm-project/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp:1456:22 +ORIGIN-NEXT: #8 0x00005895d05cd21f llvm::InstCombinerImpl::run() /tmp/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp:5327:22 +ORIGIN-NEXT: #9 0x00005895d05d067e combineInstructionsOverFunction(llvm::Function&, llvm::InstructionWorklist&, llvm::AAResults*, llvm::AssumptionCache&, llvm::TargetLibraryInfo&, llvm::TargetTransformInfo&, llvm::DominatorTree&, llvm::OptimizationRemarkEmitter&, llvm::BlockFrequencyInfo*, llvm::BranchProbabilityInfo*, llvm::ProfileSummaryInfo*, llvm::InstCombineOptions const&) 
/tmp/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp:5643:31 +ORIGIN-NEXT: #10 0x00005895d05cf9a9 llvm::InstCombinePass::run(llvm::Function&, llvm::AnalysisManager&) /tmp/llvm-project/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp:5706:8 +ORIGIN-NEXT: #11 0x00005895d107d07d llvm::detail::PassModel>::run(llvm::Function&, llvm::AnalysisManager&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:5 +ORIGIN-NEXT: #12 0x00005895d04204a7 llvm::PassManager>::run(llvm::Function&, llvm::AnalysisManager&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerImpl.h:85:8 +ORIGIN-NEXT: #13 0x00005895ce4cb09d llvm::detail::PassModel>, llvm::AnalysisManager>::run(llvm::Function&, llvm::AnalysisManager&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:5 +ORIGIN-NEXT: #14 0x00005895cfae2865 llvm::CGSCCToFunctionPassAdaptor::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/lib/Analysis/CGSCCPassManager.cpp:0:38 +ORIGIN-NEXT: #15 0x00005895ce4cad5d llvm::detail::PassModel, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:5 +ORIGIN-NEXT: #16 0x00005895cfade813 llvm::PassManager, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/lib/Analysis/CGSCCPassManager.cpp:93:12 +ORIGIN-NEXT: #17 0x00005895d1e3968d llvm::detail::PassModel, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>, llvm::AnalysisManager, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:5 +ORIGIN-NEXT: #18 0x00005895cfae1224 
llvm::DevirtSCCRepeatedPass::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/lib/Analysis/CGSCCPassManager.cpp:0:38 +ORIGIN-NEXT: #19 0x00005895d1e5067d llvm::detail::PassModel, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&>::run(llvm::LazyCallGraph::SCC&, llvm::AnalysisManager&, llvm::LazyCallGraph&, llvm::CGSCCUpdateResult&) /tmp/llvm-project/llvm/include/llvm/IR/PassManagerInternal.h:91:5 +ORIGIN: Errors detected for: + +RUN: %llvm-original-di-preservation %p/Inputs/non-existent.json --acceptance-test | FileCheck %s --check-prefix=EMPTY +EMPTY: No errors detected for: diff --git a/llvm/test/tools/llvm-original-di-preservation/basic.test b/llvm/test/tools/llvm-original-di-preservation/basic.test index 5ef670b42c667..df43fbb3b5b9f 100644 --- a/llvm/test/tools/llvm-original-di-preservation/basic.test +++ b/llvm/test/tools/llvm-original-di-preservation/basic.test @@ -1,17 +1,17 @@ -RUN: %llvm-original-di-preservation %p/Inputs/sample.json %t.html | FileCheck %s +RUN: %llvm-original-di-preservation %p/Inputs/sample.json --report-html-file %t.html | FileCheck %s RUN: diff -w %p/Inputs/expected-sample.html %t.html CHECK: The {{.+}}.html generated. CHECK-NOT: Skipped lines: -RUN: %llvm-original-di-preservation %p/Inputs/corrupted.json %t2.html | FileCheck %s -check-prefix=CORRUPTED +RUN: %llvm-original-di-preservation %p/Inputs/corrupted.json --report-html-file %t2.html | FileCheck %s -check-prefix=CORRUPTED RUN: diff -w %p/Inputs/expected-skipped.html %t2.html CORRUPTED: Skipped lines: 3 CORRUPTED: Skipped bugs: 1 -RUN: %llvm-original-di-preservation -compress %p/Inputs/sample.json %t3.html | FileCheck %s -check-prefix=COMPRESSED +RUN: %llvm-original-di-preservation --reduce %p/Inputs/sample.json --report-html-file %t3.html | FileCheck %s -check-prefix=REDUCE RUN: diff -w %p/Inputs/expected-compressed.html %t3.html -COMPRESSED: The {{.+}}.html generated. 
-COMPRESSED-NOT: Skipped lines: +REDUCE: The {{.+}}.html generated. +REDUCE-NOT: Skipped lines: -RUN: %llvm-original-di-preservation %p/Inputs/origin.json %t4.html | FileCheck %s +RUN: %llvm-original-di-preservation %p/Inputs/origin.json --report-html-file %t4.html | FileCheck %s RUN: diff -w %p/Inputs/expected-origin.html %t4.html diff --git a/llvm/utils/llvm-original-di-preservation.py b/llvm/utils/llvm-original-di-preservation.py index 03793b1136f8d..b5ccd7a3224f8 100755 --- a/llvm/utils/llvm-original-di-preservation.py +++ b/llvm/utils/llvm-original-di-preservation.py @@ -11,7 +11,6 @@ from collections import defaultdict from collections import OrderedDict - class DILocBug: def __init__(self, origin, action, bb_name, fn_name, instr): self.origin = origin @@ -20,18 +19,35 @@ def __init__(self, origin, action, bb_name, fn_name, instr): self.fn_name = fn_name self.instr = instr - def __str__(self): + def key(self): return self.action + self.bb_name + self.fn_name + self.instr + def to_dict(self): + result = { + "instr": self.instr, + "fn_name": self.fn_name, + "bb_name": self.bb_name, + "action": self.action, + } + if self.origin: + result["origin"] = self.origin + return result + class DISPBug: def __init__(self, action, fn_name): self.action = action self.fn_name = fn_name - def __str__(self): + def key(self): return self.action + self.fn_name + def to_dict(self): + return { + "fn_name": self.fn_name, + "action": self.action, + } + class DIVarBug: def __init__(self, action, name, fn_name): @@ -39,9 +55,41 @@ def __init__(self, action, name, fn_name): self.name = name self.fn_name = fn_name - def __str__(self): + def key(self): return self.action + self.name + self.fn_name + def to_dict(self): + return { + "fn_name": self.fn_name, + "name": self.name, + "action": self.action, + } + + +def print_bugs_yaml(name, bugs_dict, indent=2): + def get_bug_line(indent_level: int, text: str, margin_mark: bool = False): + if margin_mark: + return "- ".rjust(indent_level * 
indent) + text + return " " * indent * indent_level + text + + print(f"{name}:") + for bugs_file, bugs_pass_dict in sorted(iter(bugs_dict.items())): + print(get_bug_line(1, f"{bugs_file}:")) + for bugs_pass, bugs_list in sorted(iter(bugs_pass_dict.items())): + print(get_bug_line(2, f"{bugs_pass}:")) + for bug in bugs_list: + bug_dict = bug.to_dict() + first_line = True + # First item needs a '-' in the margin. + for key, val in sorted(iter(bug_dict.items())): + if "\n" in val: + # Output block text for any multiline string. + print(get_bug_line(3, f"{key}: |", first_line)) + for line in val.splitlines(): + print(get_bug_line(4, line)) + else: + print(get_bug_line(3, f"{key}: {val}", first_line)) + first_line = False # Report the bugs in form of html. def generate_html_report( @@ -430,9 +478,16 @@ def get_json_chunk(file, start, size): # Parse the program arguments. def parse_program_args(parser): parser.add_argument("file_name", type=str, help="json file to process") - parser.add_argument("html_file", type=str, help="html file to output data") - parser.add_argument( - "-compress", action="store_true", help="create reduced html report" + parser.add_argument("--reduce", action="store_true", help="create reduced report") + + report_type_group = parser.add_mutually_exclusive_group(required=True) + report_type_group.add_argument( + "--report-html-file", type=str, help="output HTML file for the generated report" + ) + report_type_group.add_argument( + "--acceptance-test", + action="store_true", + help="if set, produce terminal-friendly output and return 0 iff the input file is empty or does not exist", ) return parser.parse_args() @@ -442,10 +497,22 @@ def Main(): parser = argparse.ArgumentParser() opts = parse_program_args(parser) - if not opts.html_file.endswith(".html"): + if opts.report_html_file is not None and not opts.report_html_file.endswith( + ".html" + ): print("error: The output file must be '.html'.") sys.exit(1) + if opts.acceptance_test: + if 
os.path.isdir(opts.file_name): + print(f"error: Directory passed as input file: '{opts.file_name}'") + sys.exit(1) + if not os.path.exists(opts.file_name): + # We treat an empty input file as a success, as debugify will generate an output file iff any errors are + # found, meaning we expect 0 errors to mean that the expected file does not exist. + print(f"No errors detected for: {opts.file_name}") + sys.exit(0) + # Use the defaultdict in order to make multidim dicts. di_location_bugs = defaultdict(lambda: defaultdict(list)) di_subprogram_bugs = defaultdict(lambda: defaultdict(list)) @@ -489,9 +556,9 @@ def Main(): skipped_lines += 1 continue - di_loc_bugs = di_location_bugs[bugs_file][bugs_pass] - di_sp_bugs = di_subprogram_bugs[bugs_file][bugs_pass] - di_var_bugs = di_variable_bugs[bugs_file][bugs_pass] + di_loc_bugs = di_location_bugs.get("bugs_file", {}).get("bugs_pass", []) + di_sp_bugs = di_subprogram_bugs.get("bugs_file", {}).get("bugs_pass", []) + di_var_bugs = di_variable_bugs.get("bugs_file", {}).get("bugs_pass", []) # Omit duplicated bugs. 
di_loc_set = set() @@ -515,9 +582,9 @@ def Main(): skipped_bugs += 1 continue di_loc_bug = DILocBug(origin, action, bb_name, fn_name, instr) - if not str(di_loc_bug) in di_loc_set: - di_loc_set.add(str(di_loc_bug)) - if opts.compress: + if not di_loc_bug.key() in di_loc_set: + di_loc_set.add(di_loc_bug.key()) + if opts.reduce: pass_instr = bugs_pass + instr if not pass_instr in di_loc_pass_instr_set: di_loc_pass_instr_set.add(pass_instr) @@ -538,9 +605,9 @@ def Main(): skipped_bugs += 1 continue di_sp_bug = DISPBug(action, name) - if not str(di_sp_bug) in di_sp_set: - di_sp_set.add(str(di_sp_bug)) - if opts.compress: + if not di_sp_bug.key() in di_sp_set: + di_sp_set.add(di_sp_bug.key()) + if opts.reduce: pass_fn = bugs_pass + name if not pass_fn in di_sp_pass_fn_set: di_sp_pass_fn_set.add(pass_fn) @@ -562,9 +629,9 @@ def Main(): skipped_bugs += 1 continue di_var_bug = DIVarBug(action, name, fn_name) - if not str(di_var_bug) in di_var_set: - di_var_set.add(str(di_var_bug)) - if opts.compress: + if not di_var_bug.key() in di_var_set: + di_var_set.add(di_var_bug.key()) + if opts.reduce: pass_var = bugs_pass + name if not pass_var in di_var_pass_var_set: di_var_pass_var_set.add(pass_var) @@ -582,19 +649,40 @@ def Main(): skipped_bugs += 1 continue - di_location_bugs[bugs_file][bugs_pass] = di_loc_bugs - di_subprogram_bugs[bugs_file][bugs_pass] = di_sp_bugs - di_variable_bugs[bugs_file][bugs_pass] = di_var_bugs - - generate_html_report( - di_location_bugs, - di_subprogram_bugs, - di_variable_bugs, - di_location_bugs_summary, - di_sp_bugs_summary, - di_var_bugs_summary, - opts.html_file, - ) + if di_loc_bugs: + di_location_bugs[bugs_file][bugs_pass] = di_loc_bugs + if di_sp_bugs: + di_subprogram_bugs[bugs_file][bugs_pass] = di_sp_bugs + if di_var_bugs: + di_variable_bugs[bugs_file][bugs_pass] = di_var_bugs + + if opts.report_html_file is not None: + generate_html_report( + di_location_bugs, + di_subprogram_bugs, + di_variable_bugs, + di_location_bugs_summary, + 
di_sp_bugs_summary, + di_var_bugs_summary, + opts.report_html_file, + ) + else: + # Pretty(ish) print the detected bugs, but check if any exist first so that we don't print an empty dict. + if di_location_bugs: + print_bugs_yaml("DILocation Bugs", di_location_bugs) + if di_subprogram_bugs: + print_bugs_yaml("DISubprogram Bugs", di_subprogram_bugs) + if di_variable_bugs: + print_bugs_yaml("DIVariable Bugs", di_variable_bugs) + + if opts.acceptance_test: + if any((di_location_bugs, di_subprogram_bugs, di_variable_bugs)): + # Add a newline gap after printing at least one error. + print() + print(f"Errors detected for: {opts.file_name}") + sys.exit(1) + else: + print(f"No errors detected for: {opts.file_name}") if skipped_lines > 0: print("Skipped lines: " + str(skipped_lines)) From e74082703e224740e6281fb04f9a177c42c6467f Mon Sep 17 00:00:00 2001 From: ZhaoQi Date: Thu, 17 Jul 2025 19:21:24 +0800 Subject: [PATCH 163/813] [LoongArch] Optimize inserting bitcasted integer element or bitcasting extracted fp element (#147043) --- llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td | 2 +- llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td | 10 ++++++++++ llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td | 10 ++++++++++ .../lasx/ir-instruction/bitcast-extract-element.ll | 4 ---- .../lasx/ir-instruction/insert-bitcast-element.ll | 4 ---- .../lsx/ir-instruction/bitcast-extract-element.ll | 6 ++---- .../lsx/ir-instruction/insert-bitcast-element.ll | 4 ---- 7 files changed, 23 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index d5a5f17348e4b..36c3011be2b9e 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file describes the baisc single-precision floating-point instructions. 
+// This file describes the basic single-precision floating-point instructions. // //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 6a8c9fac840d9..a0107e44b421b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1651,6 +1651,10 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm), (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>; def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm), (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>; +def : Pat<(vector_insert v8f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm3:$imm), + (XVINSGR2VR_W $vd, $rj, uimm3:$imm)>; +def : Pat<(vector_insert v4f64:$vd, (f64 (bitconvert i64:$rj)), uimm2:$imm), + (XVINSGR2VR_D $vd, $rj, uimm2:$imm)>; def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2), (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>; def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2), @@ -1851,6 +1855,12 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in { def : RegRegStPat; } +// Bitcast float/double element extracted from vector to integer. +def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v8f32:$xj, uimm3:$imm))), + (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm)>; +def : Pat<(i64 (bitconvert (f64 (vector_extract v4f64:$xj, uimm2:$imm)))), + (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm)>; + // Vector extraction with constant index. 
foreach imm = 16...31 in { defvar Imm = !and(imm, 15); diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 9dd6006e3a9dc..962e7c21431b1 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1838,6 +1838,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm), (VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>; def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm), (VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>; +def : Pat<(vector_insert v4f32:$vd, (loongarch_movgr2fr_w_la64 GPR:$rj), uimm2:$imm), + (VINSGR2VR_W $vd, $rj, uimm2:$imm)>; +def : Pat<(vector_insert v2f64:$vd, (f64 (bitconvert i64:$rj)), uimm1:$imm), + (VINSGR2VR_D $vd, $rj, uimm1:$imm)>; def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm), (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>; def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm), @@ -2036,6 +2040,12 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in { def : RegRegStPat; } +// Bitcast float/double element extracted from vector to integer. +def : Pat<(loongarch_movfr2gr_s_la64 (f32 (vector_extract v4f32:$vj, uimm2:$imm))), + (VPICKVE2GR_W v4f32:$vj, uimm2:$imm)>; +def : Pat<(i64 (bitconvert (f64 (vector_extract v2f64:$vj, uimm1:$imm)))), + (VPICKVE2GR_D v2f64:$vj, uimm1:$imm)>; + // Vector extraction with constant index. 
def : Pat<(i64 (vector_extract v16i8:$vj, uimm4:$imm)), (VPICKVE2GR_B v16i8:$vj, uimm4:$imm)>; diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/bitcast-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/bitcast-extract-element.ll index 86808c7a8f014..09ce1a04d6c9d 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/bitcast-extract-element.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/bitcast-extract-element.ll @@ -5,8 +5,6 @@ define i32 @bitcast_extract_v8f32(<8 x float> %a) nounwind { ; CHECK-LABEL: bitcast_extract_v8f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7 -; CHECK-NEXT: movgr2fr.w $fa0, $a0 -; CHECK-NEXT: movfr2gr.s $a0, $fa0 ; CHECK-NEXT: ret entry: %b = extractelement <8 x float> %a, i32 7 @@ -18,8 +16,6 @@ define i64 @bitcast_extract_v4f64(<4 x double> %a) nounwind { ; CHECK-LABEL: bitcast_extract_v4f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 -; CHECK-NEXT: movgr2fr.d $fa0, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa0 ; CHECK-NEXT: ret entry: %b = extractelement <4 x double> %a, i32 3 diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-bitcast-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-bitcast-element.ll index 7b2461b11f12d..b37b525981fd9 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-bitcast-element.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-bitcast-element.ll @@ -4,8 +4,6 @@ define <8 x float> @insert_bitcast_v8f32(<8 x float> %a, i32 %b) nounwind { ; CHECK-LABEL: insert_bitcast_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movgr2fr.w $fa1, $a0 -; CHECK-NEXT: movfr2gr.s $a0, $fa1 ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 ; CHECK-NEXT: ret entry: @@ -17,8 +15,6 @@ entry: define <4 x double> @insert_bitcast_v4f64(<4 x double> %a, i64 %b) nounwind { ; CHECK-LABEL: insert_bitcast_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movgr2fr.d $fa1, $a0 -; CHECK-NEXT: movfr2gr.d 
$a0, $fa1 ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/bitcast-extract-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/bitcast-extract-element.ll index df4896d7ec936..9a40feb45671f 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/bitcast-extract-element.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/bitcast-extract-element.ll @@ -4,8 +4,7 @@ define i32 @bitcast_extract_v4f32(<4 x float> %a) nounwind { ; CHECK-LABEL: bitcast_extract_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT: movfr2gr.s $a0, $fa0 +; CHECK-NEXT: vpickve2gr.w $a0, $vr0, 3 ; CHECK-NEXT: ret entry: %b = extractelement <4 x float> %a, i32 3 @@ -16,8 +15,7 @@ entry: define i64 @bitcast_extract_v2f64(<2 x double> %a) nounwind { ; CHECK-LABEL: bitcast_extract_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vreplvei.d $vr0, $vr0, 1 -; CHECK-NEXT: movfr2gr.d $a0, $fa0 +; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1 ; CHECK-NEXT: ret entry: %b = extractelement <2 x double> %a, i32 1 diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-bitcast-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-bitcast-element.ll index a20d17efdfb11..c42e3013c1131 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-bitcast-element.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-bitcast-element.ll @@ -4,8 +4,6 @@ define <4 x float> @insert_bitcast_v4f32(<4 x float> %a, i32 %b) nounwind { ; CHECK-LABEL: insert_bitcast_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movgr2fr.w $fa1, $a0 -; CHECK-NEXT: movfr2gr.s $a0, $fa1 ; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 1 ; CHECK-NEXT: ret entry: @@ -17,8 +15,6 @@ entry: define <2 x double> @insert_bitcast_v2f64(<2 x double> %a, i64 %b) nounwind { ; CHECK-LABEL: insert_bitcast_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movgr2fr.d $fa1, $a0 -; CHECK-NEXT: movfr2gr.d $a0, $fa1 ; 
CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 1 ; CHECK-NEXT: ret entry: From fe1941967267e472f7eee15b43712bdfa2b63544 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Thu, 17 Jul 2025 19:22:24 +0800 Subject: [PATCH 164/813] [X86] Fix an unused-variable warning (NFC) /llvm-project/llvm/lib/Target/X86/X86CallingConv.cpp:392:12: error: unused variable 'NumRegs' [-Werror,-Wunused-variable] unsigned NumRegs = PendingMembers.size(); ^ 1 error generated. --- llvm/lib/Target/X86/X86CallingConv.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index eb39259f7166b..82e8ce4e0bd7c 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -389,8 +389,7 @@ static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, if (!ArgFlags.isInConsecutiveRegsLast()) return true; - unsigned NumRegs = PendingMembers.size(); - assert(NumRegs == 4 && "Should have two parts"); + assert(PendingMembers.size() == 4 && "Should have two parts"); int64_t Offset = State.AllocateStack(16, Align(16)); PendingMembers[0].convertToMem(Offset); From fcabb53f0c349885167ea3d0e53915e6c42271a7 Mon Sep 17 00:00:00 2001 From: Abinaya Saravanan Date: Thu, 17 Jul 2025 17:27:13 +0530 Subject: [PATCH 165/813] [HEXAGON] Add AssertSext in sign-extended mpy (#149061) The pattern i32xi32->i64, should be matched to the sign-extended multiply op, instead of explicit sign- extension of the operands followed by non-widening multiply (this takes 4 operations instead of one). Currently, if one of the operands of multiply inside a loop is a constant, the sign-extension of this constant is hoisted out of the loop by LICM pass and this pattern is not matched by the ISEL. This change handles multiply operand with Opcode of the type AssertSext which is seen when the sign-extension is hoisted out of the loop. Modifies the DetectUseSxtw() to check for this. 
--- .../Target/Hexagon/HexagonISelDAGToDAG.cpp | 9 +++++ .../test/CodeGen/Hexagon/mpy-operand-hoist.ll | 38 +++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index 53943de3bc597..e285e04543694 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -1640,6 +1640,15 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) { R = N; break; } + case ISD::AssertSext: { + EVT T = cast(N.getOperand(1))->getVT(); + if (T.getSizeInBits() == 32) + R = N.getOperand(0); + else + return false; + break; + } + default: return false; } diff --git a/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll b/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll new file mode 100644 index 0000000000000..ff50f1abe5897 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll @@ -0,0 +1,38 @@ +; RUN: llc -march=hexagon -verify-machineinstrs < %s | FileCheck %s + +; CHECK-NOT: r{{[0-9]+}} = asr(r{{[0-9]+}},#{{[0-9]+}}) +; CHECK-NOT: r{{[0-9]+}}:{{[0-9]+}} = mpyu(r{{[0-9]+}},r{{[0-9]+}}) +; CHECK-NOT: r{{[0-9]+}} += mpyi(r{{[0-9]+}},r{{[0-9]+}}) +; CHECK: r{{[0-9]+}}:{{[0-9]+}} = mpy(r{{[0-9]+}},r{{[0-9]+}}) + +; ModuleID = '39544.c' +source_filename = "39544.c" +target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" +target triple = "hexagon" + +define dso_local void @mul_n(i64* nocapture %p, i32* nocapture readonly %a, i32 %k, i32 %n) local_unnamed_addr { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %conv1 = sext i32 %k to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, 
%for.body.lr.ph + %arrayidx.phi = phi i32* [ %a, %for.body.lr.ph ], [ %arrayidx.inc, %for.body ] + %arrayidx2.phi = phi i64* [ %p, %for.body.lr.ph ], [ %arrayidx2.inc, %for.body ] + %i.08 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ] + %0 = load i32, i32* %arrayidx.phi, align 4 + %conv = sext i32 %0 to i64 + %mul = mul nsw i64 %conv, %conv1 + store i64 %mul, i64* %arrayidx2.phi, align 8 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, %n + %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1 + %arrayidx2.inc = getelementptr i64, i64* %arrayidx2.phi, i32 1 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} From 60ae9c9c632dec978e71d1d3ab3c3d18eca16c77 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 17 Jul 2025 08:00:02 -0400 Subject: [PATCH 166/813] [SLP]Do not consider non-profitable loads slices If all slices are small and end up with strided or even vectorization states, better to not consider these candidates for the vectorization and try to vectorize the whole bunch as gathered loads. Reviewers: hiraditya, RKSimon, HanKuanChen Reviewed By: RKSimon, HanKuanChen Pull Request: https://github.com/llvm/llvm-project/pull/149209 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 9 ++++ .../X86/matched-nodes-updated.ll | 44 ++++++++----------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index da6af353c709f..6ad5c60105a28 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -11693,6 +11693,7 @@ void BoUpSLP::transformNodes() { if (StartIdx + VF > End) continue; SmallVector> Slices; + bool AllStrided = true; for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { ArrayRef Slice = VL.slice(Cnt, VF); // If any instruction is vectorized already - do not try again. 
@@ -11743,6 +11744,9 @@ void BoUpSLP::transformNodes() { SmallVector PointerOps; LoadsState Res = canVectorizeLoads(Slice, Slice.front(), Order, PointerOps); + AllStrided &= Res == LoadsState::StridedVectorize || + Res == LoadsState::ScatterVectorize || + Res == LoadsState::Gather; // Do not vectorize gathers. if (Res == LoadsState::ScatterVectorize || Res == LoadsState::Gather) { @@ -11772,6 +11776,11 @@ void BoUpSLP::transformNodes() { } Slices.emplace_back(Cnt, Slice.size()); } + // Do not try to vectorize if all slices are strided or gathered with + // vector factor 2 and there are more than 2 slices. Better to handle + // them in gathered loads analysis, may result in better vectorization. + if (VF == 2 && AllStrided && Slices.size() > 2) + continue; auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) { E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt); if (StartIdx == Cnt) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll index f56af934f19f5..b1864b43512d8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-nodes-updated.ll @@ -14,50 +14,44 @@ define i32 @test(i32 %s.0) { ; CHECK: [[IF_END3:.*]]: ; CHECK-NEXT: br label %[[IF_END6:.*]] ; CHECK: [[IF_END6]]: -; CHECK-NEXT: [[J_4:%.*]] = phi i32 [ 0, %[[IF_END3]] ], [ [[TMP28:%.*]], %[[O]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ poison, %[[IF_END3]] ], [ zeroinitializer, %[[O]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ poison, %[[IF_END3]] ], [ zeroinitializer, %[[O]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ poison, %[[IF_END3]] ], [ [[TMP22:%.*]], %[[O]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ poison, %[[IF_END3]] ], [ [[TMP24:%.*]], %[[O]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ poison, %[[IF_END3]] ], [ zeroinitializer, %[[O]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ poison, 
%[[IF_END3]] ], [ [[TMP29:%.*]], %[[O]] ] ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP22]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> , <8 x i32> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> , i32 [[TMP22:%.*]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <8 x i32> [[TMP27]], <8 x i32> [[TMP30]], <8 x i32> ; CHECK-NEXT: br i1 false, label %[[IF_END24:.*]], label %[[IF_THEN11:.*]] ; CHECK: [[IF_THEN11]]: -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i32> poison, i32 [[J_4]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> [[TMP16]], <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = 
shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> , <8 x i32> [[TMP11]], <8 x i32> ; CHECK-NEXT: br label %[[IF_END24]] ; CHECK: [[IF_THEN18:.*]]: ; CHECK-NEXT: br label %[[T]] ; CHECK: [[T]]: -; CHECK-NEXT: [[TMP34:%.*]] = phi <8 x i32> [ [[TMP33:%.*]], %[[O]] ], [ poison, %[[IF_THEN18]] ] +; CHECK-NEXT: [[TMP13:%.*]] = phi <8 x i32> [ [[TMP33:%.*]], %[[O]] ], [ poison, %[[IF_THEN18]] ] ; CHECK-NEXT: [[TMP17]] = extractelement <4 x i32> [[TMP23:%.*]], i32 0 ; CHECK-NEXT: br i1 false, label %[[IF_END24]], label %[[K]] ; CHECK: [[IF_END24]]: -; CHECK-NEXT: [[TMP18:%.*]] = phi <8 x i32> [ [[TMP29]], %[[IF_THEN11]] ], [ [[TMP31]], %[[IF_END6]] ], [ [[TMP34]], %[[T]] ] -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = phi <8 x i32> [ [[TMP12]], %[[IF_THEN11]] ], [ [[TMP31]], %[[IF_END6]] ], [ [[TMP13]], %[[T]] ] ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <8 x i32> [[TMP18]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: br label %[[O]] ; CHECK: [[O]]: -; CHECK-NEXT: [[TMP22]] = phi <2 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP19]], %[[IF_END24]] ] ; CHECK-NEXT: [[TMP23]] = phi <4 x i32> [ [[TMP1]], %[[K]] ], [ [[TMP20]], %[[IF_END24]] ] -; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP21]], %[[IF_END24]] ] +; CHECK-NEXT: [[TMP24]] = phi <2 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP19]], %[[IF_END24]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i32> [ zeroinitializer, %[[K]] ], [ [[TMP34]], %[[IF_END24]] ] +; CHECK-NEXT: [[TMP22]] = extractelement <2 x i32> [[TMP24]], i32 1 ; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x 
i32> [[TMP23]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <8 x i32> [[TMP25]], <8 x i32> , <8 x i32> ; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP33]] = shufflevector <8 x i32> [[TMP26]], <8 x i32> [[TMP32]], <8 x i32> -; CHECK-NEXT: [[TMP28]] = extractelement <4 x i32> [[TMP24]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <2 x i32> [[TMP24]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i32> [[TMP21]], <4 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP29]] = shufflevector <2 x i32> [[TMP35]], <2 x i32> [[TMP28]], <2 x i32> ; CHECK-NEXT: br i1 false, label %[[T]], label %[[IF_END6]] ; entry: From 2b3a410f5bc8358a9e8594331d70c9c5d59633d8 Mon Sep 17 00:00:00 2001 From: Ryotaro Kasuga Date: Thu, 17 Jul 2025 21:11:37 +0900 Subject: [PATCH 167/813] [DA] Check element size when analyzing deps between same instruction (#148813) DependenceAnalysis checks whether the given addresses are divisible by the element size of corresponding load/store instructions. However, this check was only executed when the two instructions (Src and Dst) are different. We must also perform the same check when Src and Dst are the same instruction. Fix the test added in #147715. 
--- llvm/lib/Analysis/DependenceAnalysis.cpp | 14 ++++++-------- .../DependenceAnalysis/DifferentOffsets.ll | 6 +++--- .../Analysis/DependenceAnalysis/MIVCheckConst.ll | 3 +++ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp index 428342f51ad2e..dd9a44b9aecac 100644 --- a/llvm/lib/Analysis/DependenceAnalysis.cpp +++ b/llvm/lib/Analysis/DependenceAnalysis.cpp @@ -3670,14 +3670,12 @@ DependenceInfo::depends(Instruction *Src, Instruction *Dst, const SCEV *SrcEv = SE->getMinusSCEV(SrcSCEV, SrcBase); const SCEV *DstEv = SE->getMinusSCEV(DstSCEV, DstBase); - if (Src != Dst) { - // Check that memory access offsets are multiples of element sizes. - if (!SE->isKnownMultipleOf(SrcEv, EltSize, Assume) || - !SE->isKnownMultipleOf(DstEv, EltSize, Assume)) { - LLVM_DEBUG(dbgs() << "can't analyze SCEV with different offsets\n"); - return std::make_unique(Src, Dst, - SCEVUnionPredicate(Assume, *SE)); - } + // Check that memory access offsets are multiples of element sizes. + if (!SE->isKnownMultipleOf(SrcEv, EltSize, Assume) || + !SE->isKnownMultipleOf(DstEv, EltSize, Assume)) { + LLVM_DEBUG(dbgs() << "can't analyze SCEV with different offsets\n"); + return std::make_unique(Src, Dst, + SCEVUnionPredicate(Assume, *SE)); } if (!Assume.empty()) { diff --git a/llvm/test/Analysis/DependenceAnalysis/DifferentOffsets.ll b/llvm/test/Analysis/DependenceAnalysis/DifferentOffsets.ll index 4f95da4f79c57..d9ccea55dd478 100644 --- a/llvm/test/Analysis/DependenceAnalysis/DifferentOffsets.ll +++ b/llvm/test/Analysis/DependenceAnalysis/DifferentOffsets.ll @@ -11,7 +11,7 @@ define i32 @alias_with_different_offsets(ptr nocapture %A) { ; CHECK-LABEL: 'alias_with_different_offsets' ; CHECK-NEXT: Src: store i32 2, ptr %arrayidx, align 1 --> Dst: store i32 2, ptr %arrayidx, align 1 -; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: da analyze - confused! 
; CHECK-NEXT: Src: store i32 2, ptr %arrayidx, align 1 --> Dst: %0 = load i32, ptr %A, align 1 ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %0 = load i32, ptr %A, align 1 --> Dst: %0 = load i32, ptr %A, align 1 @@ -207,11 +207,11 @@ end: ; *((long long *)idx) = 1; ; } ; -; FIXME: There are loop-carried dependencies across iterations in the store. +; There are loop-carried dependencies across iterations in the store. define void @multidim_accesses2(ptr %A) { ; CHECK-LABEL: 'multidim_accesses2' ; CHECK-NEXT: Src: store i64 1, ptr %idx, align 4 --> Dst: store i64 1, ptr %idx, align 4 -; CHECK-NEXT: da analyze - none! +; CHECK-NEXT: da analyze - confused! ; entry: br label %for.i diff --git a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll index c1f8c85f2bf0e..b498d70648bad 100644 --- a/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll +++ b/llvm/test/Analysis/DependenceAnalysis/MIVCheckConst.ll @@ -40,6 +40,9 @@ define void @test(ptr %A, ptr %B, i1 %arg, i32 %n, i32 %m) #0 align 2 { ; CHECK-NEXT: da analyze - confused! ; CHECK-NEXT: Src: %v27 = load <32 x i32>, ptr %v25, align 256 --> Dst: %v27 = load <32 x i32>, ptr %v25, align 256 ; CHECK-NEXT: da analyze - consistent input [0 S S]! +; CHECK-NEXT: Runtime Assumptions: +; CHECK-NEXT: Equal predicate: (zext i7 (4 * (trunc i32 %v1 to i7) * (1 + (trunc i32 %n to i7))) to i32) == 0 +; CHECK-NEXT: Equal predicate: (8 * (zext i4 (trunc i32 %v1 to i4) to i32)) == 0 ; CHECK-NEXT: Src: %v27 = load <32 x i32>, ptr %v25, align 256 --> Dst: %v32 = load <32 x i32>, ptr %v30, align 128 ; CHECK-NEXT: da analyze - input [* S S|<]! 
; CHECK-NEXT: Runtime Assumptions: From 145b6cdffaf6711a5b7ad191444ab3e5d97b8992 Mon Sep 17 00:00:00 2001 From: Kristof Beyls Date: Thu, 17 Jul 2025 14:38:17 +0200 Subject: [PATCH 168/813] Improve description of what is considered a security issue (#147035) This patch improves the description of what the LLVM project considers a security issue, and what not. This patch is based on the RFC discussion in https://discourse.llvm.org/t/improving-documentation-of-what-is-considered-a-security-issue-in-llvm/86714 --- llvm/docs/Security.rst | 76 ++++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 25 deletions(-) diff --git a/llvm/docs/Security.rst b/llvm/docs/Security.rst index 8f04b6594de79..5cb8d04c0da2a 100644 --- a/llvm/docs/Security.rst +++ b/llvm/docs/Security.rst @@ -157,6 +157,7 @@ Members of the LLVM Security Response Group are expected to: * Help write and review patches to address security issues. * Participate in the member nomination and removal processes. +.. _security-group-discussion-medium: Discussion Medium ================= @@ -204,6 +205,10 @@ The LLVM Security Policy may be changed by majority vote of the LLVM Security Re What is considered a security issue? ==================================== +We define "security-sensitive" to mean that a discovered bug or vulnerability +may require coordinated disclosure, and therefore should be reported to the LLVM +Security Response group rather than publishing in the public bug tracker. + The LLVM Project has a significant amount of code, and not all of it is considered security-sensitive. This is particularly true because LLVM is used in a wide variety of circumstances: there are different threat models, untrusted @@ -217,31 +222,52 @@ security-sensitive). This requires a rationale, and buy-in from the LLVM community as for any RFC. In some cases, parts of the codebase could be handled as security-sensitive but need significant work to get to the stage where that's manageable. 
The LLVM community will need to decide whether it wants to invest in -making these parts of the code securable, and maintain these security -properties over time. In all cases the LLVM Security Response Group should be consulted, -since they'll be responding to security issues filed against these parts of the -codebase. - -If you're not sure whether an issue is in-scope for this security process or -not, err towards assuming that it is. The Security Response Group might agree or disagree -and will explain its rationale in the report, as well as update this document -through the above process. - -The security-sensitive parts of the LLVM Project currently are the following. -Note that this list can change over time. - -* None are currently defined. Please don't let this stop you from reporting - issues to the LLVM Security Response Group that you believe are security-sensitive. - -The parts of the LLVM Project which are currently treated as non-security -sensitive are the following. Note that this list can change over time. - -* Language front-ends, such as clang, for which a malicious input file can cause - undesirable behavior. For example, a maliciously crafted C or Rust source file - can cause arbitrary code to execute in LLVM. These parts of LLVM haven't been - hardened, and compiling untrusted code usually also includes running utilities - such as `make` which can more readily perform malicious things. - +making these parts of the code securable, and maintain these security properties +over time. In all cases the LLVM Security Response Group +`should be consulted `__, since they'll be +responding to security issues filed against these parts of the codebase. + +The security-sensitive parts of the LLVM Project currently are the following: + +* Code generation: most miscompilations are not security sensitive. 
However, a + miscompilation where there are clear indications that it can result in the + produced binary becoming significantly easier to exploit could be considered + security sensitive, and should be reported to the security response group. +* Run-time libraries: only parts of the run-time libraries are considered + security-sensitive. The parts that are not considered security-sensitive are + documented below. + +The following parts of the LLVM Project are currently treated as non-security +sensitive: + +* LLVM's language frontends, analyzers, optimizers, and code generators for + which a malicious input can cause undesirable behavior. For example, a + maliciously crafted C, Rust or bitcode input file can cause arbitrary code to + execute in LLVM. These parts of LLVM haven't been hardened, and handling + untrusted code usually also includes running utilities such as make which can + more readily perform malicious things. For example, vulnerabilities in clang, + clangd, or the LLVM optimizer in a JIT caused by untrusted inputs are not + security-sensitive. +* The following parts of the run-time libraries are explicitly not considered + security-sensitive: + + * parts of the run-time libraries that are not meant to be included in + production binaries. For example, most sanitizers are not considered + security-sensitive as they are meant to be used during development only, not + in production. + * for libc and libc++: if a user calls library functionality in an undefined + or otherwise incorrect way, this will most likely not be considered a + security issue, unless the libc/libc++ documentation explicitly promises to + harden or catch that specific undefined behaviour or incorrect usage. + * unwinding and exception handling: the implementations are not hardened + against malformed or malicious unwind or exception handling data. This is + not considered security sensitive. 
+ +Note that both the explicit security-sensitive and explicit non-security +sensitive lists can change over time. If you're not sure whether an issue is +in-scope for this security process or not, err towards assuming that it is. The +Security Response Group might agree or disagree and will explain its rationale +in the report, as well as update this document through the above process. .. _CVE process: https://cve.mitre.org .. _report a vulnerability: https://github.com/llvm/llvm-security-repo/security/advisories/new From a6b5ece75e8289e93ed8233eae186c74c58e4355 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 17 Jul 2025 08:46:01 -0400 Subject: [PATCH 169/813] [AMDGPU] Add support for `v_exp_bf16` on gfx1250 (#149229) Co-authored-by: Mekhanoshin, Stanislav --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 + llvm/test/CodeGen/AMDGPU/bf16-math.ll | 23 + .../CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll | 33 + llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll | 1013 +++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 45 + llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 48 + .../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 56 + llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 60 + .../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 12 + llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 16 + .../gfx1250_asm_vop3_from_vop1-fake16.s | 45 + .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 48 + .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 56 + .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 60 + .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 16 + .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 20 + .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 63 + .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 59 + .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 15 + .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 64 ++ .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 60 
+ .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 20 + 25 files changed, 1855 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index eee0a94f6fc64..7eb5e2acc8b37 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -673,6 +673,7 @@ TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_exp2_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 0312205d4ff8d..f7450373d1309 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -439,6 +439,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_log_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log); case AMDGPU::BI__builtin_amdgcn_exp2f: + case AMDGPU::BI__builtin_amdgcn_exp2_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_exp2); case AMDGPU::BI__builtin_amdgcn_log_clampf: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index bdf169a1a97da..7b1fd8aefe5be 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -118,6 +118,25 @@ void test_log_bf16(global __bf16* 
out, __bf16 a) *out = __builtin_amdgcn_log_bf16(a); } +// CHECK-LABEL: @test_exp2_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.exp2.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_exp2_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_exp2_bf16(a); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index e1bc39302e126..d93f5e5b81454 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -533,6 +533,7 @@ defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; +defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; @@ -1145,6 +1146,7 @@ defm 
V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>; defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; +defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>; //===----------------------------------------------------------------------===// // GFX10. diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll index 05eee2d4d549d..029604c2933a9 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll @@ -25,4 +25,27 @@ define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src ret void } +define amdgpu_ps void @llvm_exp2_bf16_v(ptr addrspace(1) %out, bfloat %src) { +; GCN-LABEL: llvm_exp2_bf16_v: +; GCN: ; %bb.0: +; GCN-NEXT: v_exp_bf16_e32 v2, v2 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %exp = call bfloat @llvm.exp2.bf16(bfloat %src) + store bfloat %exp, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) { +; GCN-LABEL: llvm_exp2_bf16_s: +; GCN: ; %bb.0: +; GCN-NEXT: v_exp_bf16_e32 v2, s0 +; GCN-NEXT: global_store_b16 v[0:1], v2, off +; GCN-NEXT: s_endpgm + %exp = call bfloat @llvm.exp2.bf16(bfloat %src) + store bfloat %exp, ptr addrspace(1) %out, align 2 + ret void +} + declare bfloat @llvm.log2.bf16(bfloat) +declare bfloat @llvm.exp2.bf16(bfloat) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll new file mode 100644 index 0000000000000..6304923790ad5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll @@ -0,0 +1,33 @@ +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does 
not work with bf16 + +declare bfloat @llvm.amdgcn.exp2.bf16(bfloat) #0 + +; GCN-LABEL: {{^}}exp_bf16: +; GCN: v_exp_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} +define amdgpu_kernel void @exp_bf16(ptr addrspace(1) %out, bfloat %src) #1 { + %exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat %src) #0 + store bfloat %exp, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}exp_bf16_constant_4 +; GCN: v_exp_bf16_e32 v0, 4.0 +define amdgpu_kernel void @exp_bf16_constant_4(ptr addrspace(1) %out) #1 { + %exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat 4.0) #0 + store bfloat %exp, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}exp_bf16_constant_100 +; GCN: v_exp_bf16_e32 {{v[0-9]+}}, 0x42c8 +define amdgpu_kernel void @exp_bf16_constant_100(ptr addrspace(1) %out) #1 { + %exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat 100.0) #0 + store bfloat %exp, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll new file mode 100644 index 0000000000000..0f37639059169 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll @@ -0,0 +1,1013 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1200-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1200-SDAG-FAKE16 +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1200-GI-TRUE16 +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1200-GI-FAKE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 
-mattr=+real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1250-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1250-SDAG-FAKE16 +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1250-GI-TRUE16 +; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck %s -check-prefixes=GFX1250-GI-FAKE16 + +define bfloat @v_exp2_bf16(bfloat %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call bfloat @llvm.exp2.bf16(bfloat %in) + ret bfloat %result +} + +define bfloat @v_exp2_fabs_bf16(bfloat %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fabs_bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fabs_bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fabs_bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, |v0.l| +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fabs_bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, |v0| +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %in) + %result = call bfloat @llvm.exp2.bf16(bfloat %fabs) + ret bfloat %result +} + +define bfloat @v_exp2_fneg_fabs_bf16(bfloat %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fneg_fabs_bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fneg_fabs_bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -|v0.l| +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -|v0| +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call bfloat @llvm.fabs.bf16(bfloat %in) + %fneg.fabs = fneg bfloat %fabs + %result = call bfloat @llvm.exp2.bf16(bfloat %fneg.fabs) + ret bfloat %result +} + +define bfloat @v_exp2_fneg_bf16(bfloat %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fneg_bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 
0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 
v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fneg_bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fneg_bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -v0 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fneg = fneg bfloat %in + %result = call bfloat @llvm.exp2.bf16(bfloat %fneg) + ret bfloat %result +} + +define bfloat @v_exp2_bf16_fast(bfloat %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_bf16_fast: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_bf16_fast: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo +; 
GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_bf16_fast: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_bf16_fast: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call fast bfloat @llvm.exp2.bf16(bfloat %in) + ret bfloat %result +} + +define <2 x bfloat> @v_exp2_v2bf16(<2 x bfloat> %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_v2bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 
16, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_v2bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v3 
+; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_v2bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.h, v0.h +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_v2bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v1, v1 +; GFX1250-SDAG-FAKE16-NEXT: v_nop +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_exp2_fabs_v2bf16(<2 x bfloat> %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fabs_v2bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v0, v0, v3 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: 
v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fabs_v2bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: 
v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 
0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fabs_v2bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX1250-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 +; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v1.l +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.h, v2.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fabs_v2bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v1, v1 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_nop +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) + %result = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %fabs) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_exp2_fneg_fabs_v2bf16(<2 x bfloat> %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fneg_fabs_v2bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 15 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v2.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 
+; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff +; 
GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; 
GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fneg_fabs_v2bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX1250-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15 +; GFX1250-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -v1.l +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.h, -v2.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v1, -v1 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -v0 +; GFX1250-SDAG-FAKE16-NEXT: v_nop +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in) + %fneg.fabs = fneg <2 x bfloat> %fabs + %result = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %fneg.fabs) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_exp2_fneg_v2bf16(<2 x bfloat> %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_fneg_v2bf16: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; 
GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: 
v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_v2bf16: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1200-SDAG-FAKE16-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302 +; 
GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_fneg_v2bf16: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.h, -v0.h +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e64 v0.l, -v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_fneg_v2bf16: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v0, -v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e64 v1, -v1 +; GFX1250-SDAG-FAKE16-NEXT: v_nop +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %fneg = fneg <2 x bfloat> %in + %result = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %fneg) + ret <2 x bfloat> %result +} + +define <2 x bfloat> @v_exp2_v2bf16_fast(<2 x bfloat> %in) { +; GFX1200-SDAG-TRUE16-LABEL: v_exp2_v2bf16_fast: +; GFX1200-SDAG-TRUE16: ; %bb.0: +; GFX1200-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) 
| instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1200-SDAG-FAKE16-LABEL: v_exp2_v2bf16_fast: +; GFX1200-SDAG-FAKE16: ; %bb.0: +; GFX1200-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1200-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX1200-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, s0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, s0 +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v3, v3, 
v0, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-FAKE16-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX1200-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-FAKE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-FAKE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX1200-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX1200-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-SDAG-TRUE16-LABEL: v_exp2_v2bf16_fast: +; GFX1250-SDAG-TRUE16: ; %bb.0: +; GFX1250-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.h, v0.h +; GFX1250-SDAG-TRUE16-NEXT: v_exp_bf16_e32 v0.l, v0.l +; GFX1250-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-SDAG-FAKE16-LABEL: v_exp2_v2bf16_fast: +; GFX1250-SDAG-FAKE16: ; %bb.0: +; GFX1250-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v0, v0 +; GFX1250-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1) +; GFX1250-SDAG-FAKE16-NEXT: v_exp_bf16_e32 v1, v1 +; GFX1250-SDAG-FAKE16-NEXT: v_nop +; GFX1250-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX1250-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31] + %result = call fast <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> %in) + ret <2 x bfloat> %result +} + +declare bfloat @llvm.exp2.bf16(bfloat) #0 +declare <2 
x bfloat> @llvm.exp2.v2bf16(<2 x bfloat>) #0 +declare bfloat @llvm.fabs.bf16(bfloat) #0 +declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index 0f5ce56f1a2cf..426f480200e4b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -253,6 +253,51 @@ v_log_bf16 v5, src_scc v_log_bf16 v127, 0x8000 // GFX1250: v_log_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00] +v_exp_bf16 v5, v1 +// GFX1250: v_exp_bf16_e32 v5, v1 ; encoding: [0x01,0xfb,0x0a,0x7e] + +v_exp_bf16 v5, v127 +// GFX1250: v_exp_bf16_e32 v5, v127 ; encoding: [0x7f,0xfb,0x0a,0x7e] + +v_exp_bf16 v5, s1 +// GFX1250: v_exp_bf16_e32 v5, s1 ; encoding: [0x01,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, s105 +// GFX1250: v_exp_bf16_e32 v5, s105 ; encoding: [0x69,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, vcc_lo +// GFX1250: v_exp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, vcc_hi +// GFX1250: v_exp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, ttmp15 +// GFX1250: v_exp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, m0 +// GFX1250: v_exp_bf16_e32 v5, m0 ; encoding: [0x7d,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, exec_lo +// GFX1250: v_exp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, exec_hi +// GFX1250: v_exp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, null +// GFX1250: v_exp_bf16_e32 v5, null ; encoding: [0x7c,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, -1 +// GFX1250: v_exp_bf16_e32 v5, -1 ; encoding: [0xc1,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, 0.5 +// GFX1250: v_exp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, src_scc +// GFX1250: v_exp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfa,0x0a,0x7e] + +v_exp_bf16 v127, 
0x8000 +// GFX1250: v_exp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00] + v_cvt_f32_bf16 v5, v1 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index 9dd11e6249b27..93999043d0fb8 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -268,6 +268,54 @@ v_log_bf16 v127, 0x8000 v_log_bf16 v5.h, v1.h // GFX1250: v_log_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf9,0x0a,0x7f] +v_exp_bf16 v5, v1 +// GFX1250: v_exp_bf16_e32 v5, v1 ; encoding: [0x01,0xfb,0x0a,0x7e] + +v_exp_bf16 v5, v127 +// GFX1250: v_exp_bf16_e32 v5, v127 ; encoding: [0x7f,0xfb,0x0a,0x7e] + +v_exp_bf16 v5, s1 +// GFX1250: v_exp_bf16_e32 v5, s1 ; encoding: [0x01,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, s105 +// GFX1250: v_exp_bf16_e32 v5, s105 ; encoding: [0x69,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, vcc_lo +// GFX1250: v_exp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, vcc_hi +// GFX1250: v_exp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, ttmp15 +// GFX1250: v_exp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, m0 +// GFX1250: v_exp_bf16_e32 v5, m0 ; encoding: [0x7d,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, exec_lo +// GFX1250: v_exp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, exec_hi +// GFX1250: v_exp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, null +// GFX1250: v_exp_bf16_e32 v5, null ; encoding: [0x7c,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, -1 +// GFX1250: v_exp_bf16_e32 v5, -1 ; encoding: [0xc1,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, 0.5 +// GFX1250: v_exp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfa,0x0a,0x7e] + +v_exp_bf16 v5, src_scc +// GFX1250: v_exp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfa,0x0a,0x7e] + +v_exp_bf16 v127, 0x8000 +// GFX1250: v_exp_bf16_e32 v127, 0x8000 ; encoding: 
[0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00] + +v_exp_bf16 v5.h, v1.h +// GFX1250: v_exp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfb,0x0a,0x7f] + v_cvt_f32_bf16 v5, v1 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s index 3882e43b5daf4..459c2d3e7b751 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s @@ -282,6 +282,62 @@ v_log_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi // GFX1250: v_log_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf8,0xfe,0x7e,0x7f,0x6f,0x35,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_exp_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_exp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_mirror +// GFX1250: v_exp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_half_mirror +// GFX1250: v_exp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shl:1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shl:15 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shr:1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shr:15 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_ror:1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_ror:15 +// GFX1250: v_exp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_exp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_exp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_exp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s index 2f849b15edee9..30355596be48b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s @@ -302,6 +302,66 @@ v_log_bf16 v5.h, v1.h quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7f,0x81,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_exp_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_exp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_mirror +// GFX1250: v_exp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_half_mirror +// GFX1250: v_exp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shl:1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shl:15 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shr:1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_shr:15 +// GFX1250: v_exp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_ror:1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_ror:15 +// GFX1250: v_exp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_exp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_exp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_exp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_exp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s index 85cf08bdb3a31..50e3e0acae4d2 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -62,6 +62,18 @@ v_log_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_log_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf8,0xfe,0x7e,0x7f,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_exp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_exp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_exp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s index d9b320ac6c094..34a15116ebed4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -82,6 +82,22 @@ v_log_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7f,0x81,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_exp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_exp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: 
v_exp_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 0d4de4c8c877a..1d5df8d131228 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -307,6 +307,51 @@ v_log_bf16_e64 v5, src_scc mul:4 v_log_bf16_e64 v255, -|0x8000| clamp div:2 // GFX1250: v_log_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfc,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_exp_bf16_e64 v5, v1 +// GFX1250: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] + +v_exp_bf16_e64 v5, v255 +// GFX1250: v_exp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00] + +v_exp_bf16_e64 v5, s1 +// GFX1250: v_exp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, s105 +// GFX1250: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, vcc_lo +// GFX1250: v_exp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, vcc_hi +// GFX1250: v_exp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, ttmp15 +// GFX1250: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, m0 +// GFX1250: v_exp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, exec_lo +// GFX1250: v_exp_bf16_e64 v5, exec_lo ; encoding: 
[0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, exec_hi +// GFX1250: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, null +// GFX1250: v_exp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, -1 +// GFX1250: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] + +v_exp_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] + +v_exp_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 8bf5d242660b6..a461a4cfc8212 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -322,6 +322,54 @@ v_log_bf16_e64 v255, -|0x8000| clamp div:2 v_log_bf16 v5.h, v128.h // GFX1250: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] +v_exp_bf16_e64 v5, v1 +// GFX1250: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] + +v_exp_bf16_e64 v5, v255 +// GFX1250: v_exp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00] + +v_exp_bf16_e64 v5, s1 +// GFX1250: v_exp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, s105 +// GFX1250: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, vcc_lo +// GFX1250: v_exp_bf16_e64 v5, vcc_lo ; encoding: 
[0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, vcc_hi +// GFX1250: v_exp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, ttmp15 +// GFX1250: v_exp_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, m0 +// GFX1250: v_exp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, exec_lo +// GFX1250: v_exp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, exec_hi +// GFX1250: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, null +// GFX1250: v_exp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, -1 +// GFX1250: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] + +v_exp_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] + +v_exp_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] + +v_exp_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +v_exp_bf16 v5.h, v128.h +// GFX1250: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] + v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index 4231fcf7c5e92..182315f93b2b2 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -282,6 +282,62 @@ v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask // 
GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfc,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:0 fi:1 +// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index 1a094e285e730..da02b07191a62 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -302,6 +302,66 @@ v_log_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported 
on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_exp_bf16_e64_dpp v5, v1 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index f6a2103ed9077..744ea732ad95c 100644 --- 
a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -82,6 +82,22 @@ v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_log_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfc,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 
5a1b1414dda37..8bf45cb4345fe 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -102,6 +102,26 @@ v_log_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_exp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index c318dd7fc4ee0..c1ea84585a66f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -344,6 +344,69 @@ 0x81,0xf9,0x0a,0x7f # GFX1250-REAL16: v_log_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xf9,0x0a,0x7f] +0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00] + +0xc1,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, -1 ; encoding: [0xc1,0xfa,0x0a,0x7e] + +0xf0,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfa,0x0a,0x7e] + +0x7f,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfa,0x0a,0x7e] + +0x7e,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfa,0x0a,0x7e] + +0x7d,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, m0 ; encoding: [0x7d,0xfa,0x0a,0x7e] + +0x7c,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, null ; encoding: [0x7c,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, null ; encoding: [0x7c,0xfa,0x0a,0x7e] + +0x01,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, s1 ; encoding: 
[0x01,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, s1 ; encoding: [0x01,0xfa,0x0a,0x7e] + +0x69,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, s105 ; encoding: [0x69,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, s105 ; encoding: [0x69,0xfa,0x0a,0x7e] + +0xfd,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfa,0x0a,0x7e] + +0x7b,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfa,0x0a,0x7e] + +0x01,0xfb,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xfb,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, v1 ; encoding: [0x01,0xfb,0x0a,0x7e] + +0x7f,0xfb,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xfb,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, v127 ; encoding: [0x7f,0xfb,0x0a,0x7e] + +0x6b,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfa,0x0a,0x7e] + +0x6a,0xfa,0x0a,0x7e +# GFX1250-REAL16: v_exp_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xfa,0x0a,0x7e] +# GFX1250-FAKE16: v_exp_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfa,0x0a,0x7e] + +0x81,0xfb,0x0a,0x7f +# GFX1250-REAL16: v_exp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfb,0x0a,0x7f] + 0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00 # GFX1250: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt index 22ed09e957de7..bb5f1442920fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -297,6 +297,65 @@ 0xfa,0xf8,0x0a,0x7f,0x81,0x1b,0x00,0xff # 
GFX1250-REAL16: v_log_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf8,0x0a,0x7f,0x81,0x1b,0x00,0xff] +0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_exp_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_exp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 
row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x11,0x01,0xff] + 
+0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfa,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xfa,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_exp_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7f,0x81,0x1b,0x00,0xff] + 0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30 # GFX1250: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index d8458e8808b39..1b7da587d20fd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -80,6 +80,21 @@ 0xe9,0xf8,0x0a,0x7f,0x81,0x77,0x39,0x05 # GFX1250-REAL16: v_log_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf8,0x0a,0x7f,0x81,0x77,0x39,0x05] +0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: 
v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_exp_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_exp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfa,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xfa,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_exp_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7f,0x81,0x77,0x39,0x05] + 0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index d1a7158ce582e..43f6f5d66f25a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -322,6 +322,70 @@ # GFX1250-REAL16: v_log_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfc,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_log_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfc,0xd5,0x80,0x01,0x00,0x00] +0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfd,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] +# 
GFX1250-FAKE16: v_exp_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfd,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfd,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfd,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfd,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfd,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xfd,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfd,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfd,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfd,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfd,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfd,0xd5,0x80,0x01,0x00,0x00] + 0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index 56f65d0711664..016a669e9ae5c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -122,6 +122,66 @@ # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfc,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# 
GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + 0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index 9ff9e54c1b40c..cda17a850d9b6 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -42,6 +42,26 @@ # GFX1250-REAL16: v_log_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_log_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfc,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + 
+0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfd,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + 0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] From a102342990231f8558361da68e5df92c7b1c737d Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 17 Jul 2025 08:49:45 -0400 Subject: [PATCH 170/813] [AMDGPU] Add support for `v_sin_bf16` on gfx1250 (#149241) Co-authored-by: Mekhanoshin, Stanislav --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19 ++++++ llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 + .../CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll | 33 
++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 45 +++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 48 ++++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 56 ++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 60 +++++++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 12 ++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 16 +++++ .../gfx1250_asm_vop3_from_vop1-fake16.s | 45 +++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 48 ++++++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 56 ++++++++++++++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 60 +++++++++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 16 +++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 20 ++++++ .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 63 ++++++++++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 59 +++++++++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 15 +++++ .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 64 +++++++++++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 60 +++++++++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 20 ++++++ 23 files changed, 819 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 7eb5e2acc8b37..e7a45f0e4300d 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -674,6 +674,7 @@ TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_exp2_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_sin_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", 
"gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index f7450373d1309..32cf622f20605 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -429,6 +429,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Intrinsic::amdgcn_rsq_clamp); case AMDGPU::BI__builtin_amdgcn_sinf: case AMDGPU::BI__builtin_amdgcn_sinh: + case AMDGPU::BI__builtin_amdgcn_sin_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin); case AMDGPU::BI__builtin_amdgcn_cosf: case AMDGPU::BI__builtin_amdgcn_cosh: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 7b1fd8aefe5be..748b6455103ec 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -137,6 +137,25 @@ void test_exp2_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_exp2_bf16(a); } +// CHECK-LABEL: @test_sin_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sin.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_sin_bf16(global __bf16* out, __bf16 a) +{ + *out = 
__builtin_amdgcn_sin_bf16(a); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index d93f5e5b81454..c91319eae7218 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -534,6 +534,7 @@ defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>; defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; +defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; @@ -1147,6 +1148,7 @@ defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>; defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>; +defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>; //===----------------------------------------------------------------------===// // GFX10. 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll new file mode 100644 index 0000000000000..9c35a7eae0b8e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.bf16.ll @@ -0,0 +1,33 @@ +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.amdgcn.sin.bf16(bfloat) #0 + +; GCN-LABEL: {{^}}sin_bf16: +; GCN: v_sin_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} +define amdgpu_kernel void @sin_bf16(ptr addrspace(1) %out, bfloat %src) #1 { + %sin = call bfloat @llvm.amdgcn.sin.bf16(bfloat %src) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}sin_bf16_constant_4 +; GCN: v_sin_bf16_e32 v0, 4.0 +define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 { + %sin = call bfloat @llvm.amdgcn.sin.bf16(bfloat 4.0) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}sin_bf16_constant_100 +; GCN: v_sin_bf16_e32 {{v[0-9]+}}, 0x42c8 +define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 { + %sin = call bfloat @llvm.amdgcn.sin.bf16(bfloat 100.0) #0 + store bfloat %sin, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index 426f480200e4b..f51d709a594a0 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -298,6 +298,51 @@ v_exp_bf16 v5, src_scc v_exp_bf16 v127, 0x8000 // GFX1250: v_exp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfa,0xfe,0x7e,0x00,0x80,0x00,0x00] +v_sin_bf16 v5, v1 +// GFX1250: v_sin_bf16_e32 v5, v1 ; encoding: [0x01,0xfd,0x0a,0x7e] + +v_sin_bf16 v5, v127 +// GFX1250: 
v_sin_bf16_e32 v5, v127 ; encoding: [0x7f,0xfd,0x0a,0x7e] + +v_sin_bf16 v5, s1 +// GFX1250: v_sin_bf16_e32 v5, s1 ; encoding: [0x01,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, s105 +// GFX1250: v_sin_bf16_e32 v5, s105 ; encoding: [0x69,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, vcc_lo +// GFX1250: v_sin_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, vcc_hi +// GFX1250: v_sin_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, ttmp15 +// GFX1250: v_sin_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, m0 +// GFX1250: v_sin_bf16_e32 v5, m0 ; encoding: [0x7d,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, exec_lo +// GFX1250: v_sin_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, exec_hi +// GFX1250: v_sin_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, null +// GFX1250: v_sin_bf16_e32 v5, null ; encoding: [0x7c,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, -1 +// GFX1250: v_sin_bf16_e32 v5, -1 ; encoding: [0xc1,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, 0.5 +// GFX1250: v_sin_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, src_scc +// GFX1250: v_sin_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfc,0x0a,0x7e] + +v_sin_bf16 v127, 0x8000 +// GFX1250: v_sin_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00] + v_cvt_f32_bf16 v5, v1 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index 93999043d0fb8..39fc73d70cab2 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -316,6 +316,54 @@ v_exp_bf16 v127, 0x8000 v_exp_bf16 v5.h, v1.h // GFX1250: v_exp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfb,0x0a,0x7f] +v_sin_bf16 v5, v1 +// GFX1250: v_sin_bf16_e32 v5, v1 ; encoding: [0x01,0xfd,0x0a,0x7e] + +v_sin_bf16 v5, v127 +// GFX1250: v_sin_bf16_e32 v5, v127 ; encoding: [0x7f,0xfd,0x0a,0x7e] + +v_sin_bf16 v5, s1 +// GFX1250: 
v_sin_bf16_e32 v5, s1 ; encoding: [0x01,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, s105 +// GFX1250: v_sin_bf16_e32 v5, s105 ; encoding: [0x69,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, vcc_lo +// GFX1250: v_sin_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, vcc_hi +// GFX1250: v_sin_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, ttmp15 +// GFX1250: v_sin_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, m0 +// GFX1250: v_sin_bf16_e32 v5, m0 ; encoding: [0x7d,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, exec_lo +// GFX1250: v_sin_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, exec_hi +// GFX1250: v_sin_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, null +// GFX1250: v_sin_bf16_e32 v5, null ; encoding: [0x7c,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, -1 +// GFX1250: v_sin_bf16_e32 v5, -1 ; encoding: [0xc1,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, 0.5 +// GFX1250: v_sin_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfc,0x0a,0x7e] + +v_sin_bf16 v5, src_scc +// GFX1250: v_sin_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfc,0x0a,0x7e] + +v_sin_bf16 v127, 0x8000 +// GFX1250: v_sin_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00] + +v_sin_bf16 v5.h, v1.h +// GFX1250: v_sin_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfd,0x0a,0x7f] + v_cvt_f32_bf16 v5, v1 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s index 459c2d3e7b751..97058eb2e7c9f 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s @@ -338,6 +338,62 @@ v_exp_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi // GFX1250: v_exp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfa,0xfe,0x7e,0x7f,0x6f,0x35,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not 
supported on this GPU +v_sin_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_sin_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_mirror +// GFX1250: v_sin_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_half_mirror +// GFX1250: v_sin_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shl:1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shl:15 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shr:1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shr:15 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_ror:1 +// GFX1250: 
v_sin_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_ror:15 +// GFX1250: v_sin_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_sin_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_sin_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_sin_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s index 30355596be48b..6a293c19a79a4 100644 --- 
a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s @@ -362,6 +362,66 @@ v_exp_bf16 v5.h, v1.h quad_perm:[3,2,1,0] // GFX1250: v_exp_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7f,0x81,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_sin_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_sin_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_mirror +// GFX1250: v_sin_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_half_mirror +// GFX1250: v_sin_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shl:1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shl:15 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shr:1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_shr:15 +// GFX1250: v_sin_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_ror:1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_ror:15 +// GFX1250: v_sin_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_sin_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_sin_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_sin_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_sin_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s index 50e3e0acae4d2..d1f53c7b2065c 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -74,6 +74,18 @@ v_exp_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_exp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfa,0xfe,0x7e,0x7f,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_sin_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sin_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s 
b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s index 34a15116ebed4..dbee9f39df5f5 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -98,6 +98,22 @@ v_exp_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7f,0x81,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_sin_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sin_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 1d5df8d131228..4257334444244 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -352,6 +352,51 @@ v_exp_bf16_e64 v5, src_scc mul:4 v_exp_bf16_e64 
v255, -|0x8000| clamp div:2 // GFX1250: v_exp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfd,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_sin_bf16_e64 v5, v1 +// GFX1250: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] + +v_sin_bf16_e64 v5, v255 +// GFX1250: v_sin_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00] + +v_sin_bf16_e64 v5, s1 +// GFX1250: v_sin_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, s105 +// GFX1250: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, vcc_lo +// GFX1250: v_sin_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, vcc_hi +// GFX1250: v_sin_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, ttmp15 +// GFX1250: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, m0 +// GFX1250: v_sin_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, exec_lo +// GFX1250: v_sin_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, exec_hi +// GFX1250: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, null +// GFX1250: v_sin_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, -1 +// GFX1250: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] + +v_sin_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] + +v_sin_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: 
[0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index a461a4cfc8212..83986a61fd572 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -370,6 +370,54 @@ v_exp_bf16_e64 v255, -|0x8000| clamp div:2 v_exp_bf16 v5.h, v128.h // GFX1250: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] +v_sin_bf16_e64 v5, v1 +// GFX1250: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] + +v_sin_bf16_e64 v5, v255 +// GFX1250: v_sin_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00] + +v_sin_bf16_e64 v5, s1 +// GFX1250: v_sin_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, s105 +// GFX1250: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, vcc_lo +// GFX1250: v_sin_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, vcc_hi +// GFX1250: v_sin_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, ttmp15 +// GFX1250: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, m0 +// GFX1250: v_sin_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, exec_lo +// GFX1250: v_sin_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, exec_hi +// GFX1250: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, null +// GFX1250: v_sin_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, -1 +// GFX1250: 
v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] + +v_sin_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] + +v_sin_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] + +v_sin_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +v_sin_bf16 v5.h, v128.h +// GFX1250: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] + v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index 182315f93b2b2..bb6739ec312a5 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -338,6 +338,62 @@ v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask // GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfd,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on 
this GPU + +v_sin_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf 
; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index da02b07191a62..5f6f28e0f6edb 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ 
b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -362,6 +362,66 @@ v_exp_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 744ea732ad95c..037e7d650ad73 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -98,6 +98,22 @@ v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_exp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfd,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: 
v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 8bf45cb4345fe..53fb0eb4e9517 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -122,6 +122,26 @@ v_exp_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: 
v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sin_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index c1ea84585a66f..fec2207d70a8e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -407,6 +407,69 @@ 0x81,0xfb,0x0a,0x7f # GFX1250-REAL16: v_exp_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfb,0x0a,0x7f] +0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e32 v127, 0x8000 ; encoding: 
[0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00] + +0xc1,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, -1 ; encoding: [0xc1,0xfc,0x0a,0x7e] + +0xf0,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfc,0x0a,0x7e] + +0x7f,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfc,0x0a,0x7e] + +0x7e,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfc,0x0a,0x7e] + +0x7d,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, m0 ; encoding: [0x7d,0xfc,0x0a,0x7e] + +0x7c,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, null ; encoding: [0x7c,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, null ; encoding: [0x7c,0xfc,0x0a,0x7e] + +0x01,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, s1 ; encoding: [0x01,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, s1 ; encoding: [0x01,0xfc,0x0a,0x7e] + +0x69,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, s105 ; encoding: [0x69,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, s105 ; encoding: [0x69,0xfc,0x0a,0x7e] + +0xfd,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfc,0x0a,0x7e] + +0x7b,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfc,0x0a,0x7e] + +0x01,0xfd,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xfd,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, v1 ; encoding: 
[0x01,0xfd,0x0a,0x7e] + +0x7f,0xfd,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xfd,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, v127 ; encoding: [0x7f,0xfd,0x0a,0x7e] + +0x6b,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfc,0x0a,0x7e] + +0x6a,0xfc,0x0a,0x7e +# GFX1250-REAL16: v_sin_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xfc,0x0a,0x7e] +# GFX1250-FAKE16: v_sin_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfc,0x0a,0x7e] + +0x81,0xfd,0x0a,0x7f +# GFX1250-REAL16: v_sin_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfd,0x0a,0x7f] + 0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00 # GFX1250: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt index bb5f1442920fd..dc8c6b15dd1bb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -356,6 +356,65 @@ 0xfa,0xfa,0x0a,0x7f,0x81,0x1b,0x00,0xff # GFX1250-REAL16: v_exp_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfa,0x0a,0x7f,0x81,0x1b,0x00,0xff] +0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_sin_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_sin_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0xe4,0x00,0xff] 
+ +0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l 
row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfc,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xfc,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_sin_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xfc,0x0a,0x7f,0x81,0x1b,0x00,0xff] + 0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30 # GFX1250: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index 1b7da587d20fd..741bf3fd34d32 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -95,6 +95,21 @@ 0xe9,0xfa,0x0a,0x7f,0x81,0x77,0x39,0x05 # GFX1250-REAL16: v_exp_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfa,0x0a,0x7f,0x81,0x77,0x39,0x05] +0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sin_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sin_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfc,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xfc,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_sin_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7f,0x81,0x77,0x39,0x05] + 0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 43f6f5d66f25a..cd9b7120ca966 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -386,6 +386,70 @@ # GFX1250-REAL16: v_exp_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfd,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_exp_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfd,0xd5,0x80,0x01,0x00,0x00] +0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xfe,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xfe,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xfe,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, null ; encoding: [0x05,0x00,0xfe,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xfe,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xfe,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xfe,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xfe,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xfe,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xfe,0xd5,0x6b,0x00,0x00,0x00] + 
+0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xfe,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfe,0xd5,0x80,0x01,0x00,0x00] + 0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index 016a669e9ae5c..ed07393d18b18 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -182,6 +182,66 @@ # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfd,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: 
v_sin_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + 0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index cda17a850d9b6..a6d6713c1b00d 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -62,6 +62,26 @@ # GFX1250-REAL16: v_exp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_exp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfd,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfe,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + 
+0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + 0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] From 359dca0dad72d11e41e08136bc6c6cca3f22f038 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 15 Jul 2025 15:20:21 +0100 Subject: [PATCH 171/813] [AMDGPU] Move class WaitcntBrackets after class SIInsertWaitcnts. NFC. This is a prerequisite for "[AMDGPU] Move common fields out of WaitcntBrackets. NFC. (#148864)" --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 470 ++++++++++---------- 1 file changed, 236 insertions(+), 234 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7ce1359f03da6..cb72a64eaee2a 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -260,240 +260,7 @@ InstCounterType eventCounter(const unsigned *masks, WaitEventType E) { llvm_unreachable("event type has no associated counter"); } -// This objects maintains the current score brackets of each wait counter, and -// a per-register scoreboard for each wait counter. -// -// We also maintain the latest score for every event type that can change the -// waitcnt in order to know if there are multiple types of events within -// the brackets. 
When multiple types of event happen in the bracket, -// wait count may get decreased out of order, therefore we need to put in -// "s_waitcnt 0" before use. -class WaitcntBrackets { -public: - WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, - HardwareLimits Limits, const unsigned *WaitEventMaskForInst, - InstCounterType SmemAccessCounter) - : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), - WaitEventMaskForInst(WaitEventMaskForInst), - SmemAccessCounter(SmemAccessCounter) {} - - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - default: - break; - } - return 0; - } - - bool isSmemCounter(InstCounterType T) const { - return T == SmemAccessCounter || T == X_CNT; - } - - unsigned getSgprScoresIdx(InstCounterType T) const { - assert(isSmemCounter(T) && "Invalid SMEM counter"); - return T == X_CNT ? 
1 : 0; - } - - unsigned getScoreLB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreLBs[T]; - } - - unsigned getScoreUB(InstCounterType T) const { - assert(T < NUM_INST_CNTS); - return ScoreUBs[T]; - } - - unsigned getScoreRange(InstCounterType T) const { - return getScoreUB(T) - getScoreLB(T); - } - - unsigned getRegScore(int GprNo, InstCounterType T) const { - if (GprNo < NUM_ALL_VGPRS) - return VgprScores[T][GprNo]; - return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; - } - - bool merge(const WaitcntBrackets &Other); - - RegInterval getRegInterval(const MachineInstr *MI, - const MachineRegisterInfo *MRI, - const SIRegisterInfo *TRI, - const MachineOperand &Op) const; - - bool counterOutOfOrder(InstCounterType T) const; - void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; - void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; - - void determineWait(InstCounterType T, RegInterval Interval, - AMDGPU::Waitcnt &Wait) const; - void determineWait(InstCounterType T, int RegNo, - AMDGPU::Waitcnt &Wait) const { - determineWait(T, {RegNo, RegNo + 1}, Wait); - } - - void applyWaitcnt(const AMDGPU::Waitcnt &Wait); - void applyWaitcnt(InstCounterType T, unsigned Count); - void applyXcnt(const AMDGPU::Waitcnt &Wait); - void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, WaitEventType E, - MachineInstr &MI); - - unsigned hasPendingEvent() const { return PendingEvents; } - unsigned hasPendingEvent(WaitEventType E) const { - return PendingEvents & (1 << E); - } - unsigned hasPendingEvent(InstCounterType T) const { - unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; - assert((HasPending != 0) == (getScoreRange(T) != 0)); - return HasPending; - } - - bool hasMixedPendingEvents(InstCounterType T) const { - unsigned Events = hasPendingEvent(T); - // Return true if more than one bit is set in Events. 
- return Events & (Events - 1); - } - - bool hasPendingFlat() const { - return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && - LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || - (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && - LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); - } - - void setPendingFlat() { - LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; - LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; - } - - bool hasPendingGDS() const { - return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; - } - - unsigned getPendingGDSWait() const { - return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1); - } - - void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } - - // Return true if there might be pending writes to the vgpr-interval by VMEM - // instructions with types different from V. - bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - if (VgprVmemTypes[RegNo] & ~(1 << V)) - return true; - } - return false; - } - - void clearVgprVmemTypes(RegInterval Interval) { - for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { - assert(RegNo < NUM_ALL_VGPRS); - VgprVmemTypes[RegNo] = 0; - } - } - - void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); - PendingEvents |= WaitEventMaskForInst[STORE_CNT]; - } - - ArrayRef getLDSDMAStores() const { - return LDSDMAStores; - } - - bool hasPointSampleAccel(const MachineInstr &MI) const; - bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, - RegInterval Interval) const; - - void print(raw_ostream &) const; - void dump() const { print(dbgs()); } - -private: - struct MergeInfo { - unsigned OldLB; - unsigned OtherLB; - unsigned MyShift; - unsigned OtherShift; - }; - static bool mergeScore(const MergeInfo &M, unsigned &Score, - unsigned OtherScore); - - void setScoreLB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - 
ScoreLBs[T] = Val; - } - - void setScoreUB(InstCounterType T, unsigned Val) { - assert(T < NUM_INST_CNTS); - ScoreUBs[T] = Val; - - if (T != EXP_CNT) - return; - - if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); - } - - void setRegScore(int GprNo, InstCounterType T, unsigned Val) { - setScoreByInterval({GprNo, GprNo + 1}, T, Val); - } - - void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, - unsigned Score); - - void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, - const MachineRegisterInfo *MRI, - const MachineOperand &Op, InstCounterType CntTy, - unsigned Val); - - const GCNSubtarget *ST = nullptr; - InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; - HardwareLimits Limits = {}; - const unsigned *WaitEventMaskForInst; - InstCounterType SmemAccessCounter; - unsigned ScoreLBs[NUM_INST_CNTS] = {0}; - unsigned ScoreUBs[NUM_INST_CNTS] = {0}; - unsigned PendingEvents = 0; - // Remember the last flat memory operation. - unsigned LastFlat[NUM_INST_CNTS] = {0}; - // Remember the last GDS operation. - unsigned LastGDS = 0; - // wait_cnt scores for every vgpr. - // Keep track of the VgprUB and SgprUB to make merge at join efficient. - int VgprUB = -1; - int SgprUB = -1; - unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; - // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt - // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. - // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the - // X_CNT score. - unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; - // Bitmask of the VmemTypes of VMEM instructions that might have a pending - // write to each vgpr. - unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; - // Store representative LDS DMA operations. The only useful info here is - // alias info. One store is kept per unique AAInfo. 
- SmallVector LDSDMAStores; -}; +class WaitcntBrackets; // This abstracts the logic for generating and updating S_WAIT* instructions // away from the analysis that determines where they are needed. This was @@ -791,6 +558,241 @@ class SIInsertWaitcnts { WaitcntBrackets &ScoreBrackets); }; +// This objects maintains the current score brackets of each wait counter, and +// a per-register scoreboard for each wait counter. +// +// We also maintain the latest score for every event type that can change the +// waitcnt in order to know if there are multiple types of events within +// the brackets. When multiple types of event happen in the bracket, +// wait count may get decreased out of order, therefore we need to put in +// "s_waitcnt 0" before use. +class WaitcntBrackets { +public: + WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, + HardwareLimits Limits, const unsigned *WaitEventMaskForInst, + InstCounterType SmemAccessCounter) + : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), + WaitEventMaskForInst(WaitEventMaskForInst), + SmemAccessCounter(SmemAccessCounter) {} + + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + default: + break; + } + return 0; + } + + bool isSmemCounter(InstCounterType T) const { + return T == SmemAccessCounter || T == X_CNT; + } + + unsigned getSgprScoresIdx(InstCounterType T) const { + assert(isSmemCounter(T) && "Invalid SMEM counter"); + return T == X_CNT ? 
1 : 0; + } + + unsigned getScoreLB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreLBs[T]; + } + + unsigned getScoreUB(InstCounterType T) const { + assert(T < NUM_INST_CNTS); + return ScoreUBs[T]; + } + + unsigned getScoreRange(InstCounterType T) const { + return getScoreUB(T) - getScoreLB(T); + } + + unsigned getRegScore(int GprNo, InstCounterType T) const { + if (GprNo < NUM_ALL_VGPRS) + return VgprScores[T][GprNo]; + return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS]; + } + + bool merge(const WaitcntBrackets &Other); + + RegInterval getRegInterval(const MachineInstr *MI, + const MachineRegisterInfo *MRI, + const SIRegisterInfo *TRI, + const MachineOperand &Op) const; + + bool counterOutOfOrder(InstCounterType T) const; + void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; + void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; + + void determineWait(InstCounterType T, RegInterval Interval, + AMDGPU::Waitcnt &Wait) const; + void determineWait(InstCounterType T, int RegNo, + AMDGPU::Waitcnt &Wait) const { + determineWait(T, {RegNo, RegNo + 1}, Wait); + } + + void applyWaitcnt(const AMDGPU::Waitcnt &Wait); + void applyWaitcnt(InstCounterType T, unsigned Count); + void applyXcnt(const AMDGPU::Waitcnt &Wait); + void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, WaitEventType E, + MachineInstr &MI); + + unsigned hasPendingEvent() const { return PendingEvents; } + unsigned hasPendingEvent(WaitEventType E) const { + return PendingEvents & (1 << E); + } + unsigned hasPendingEvent(InstCounterType T) const { + unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; + assert((HasPending != 0) == (getScoreRange(T) != 0)); + return HasPending; + } + + bool hasMixedPendingEvents(InstCounterType T) const { + unsigned Events = hasPendingEvent(T); + // Return true if more than one bit is set in Events. 
+ return Events & (Events - 1); + } + + bool hasPendingFlat() const { + return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] && + LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) || + (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] && + LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT])); + } + + void setPendingFlat() { + LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT]; + LastFlat[DS_CNT] = ScoreUBs[DS_CNT]; + } + + bool hasPendingGDS() const { + return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT]; + } + + unsigned getPendingGDSWait() const { + return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1); + } + + void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } + + // Return true if there might be pending writes to the vgpr-interval by VMEM + // instructions with types different from V. + bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + if (VgprVmemTypes[RegNo] & ~(1 << V)) + return true; + } + return false; + } + + void clearVgprVmemTypes(RegInterval Interval) { + for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { + assert(RegNo < NUM_ALL_VGPRS); + VgprVmemTypes[RegNo] = 0; + } + } + + void setStateOnFunctionEntryOrReturn() { + setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); + PendingEvents |= WaitEventMaskForInst[STORE_CNT]; + } + + ArrayRef getLDSDMAStores() const { + return LDSDMAStores; + } + + bool hasPointSampleAccel(const MachineInstr &MI) const; + bool hasPointSamplePendingVmemTypes(const MachineInstr &MI, + RegInterval Interval) const; + + void print(raw_ostream &) const; + void dump() const { print(dbgs()); } + +private: + struct MergeInfo { + unsigned OldLB; + unsigned OtherLB; + unsigned MyShift; + unsigned OtherShift; + }; + static bool mergeScore(const MergeInfo &M, unsigned &Score, + unsigned OtherScore); + + void setScoreLB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + 
ScoreLBs[T] = Val; + } + + void setScoreUB(InstCounterType T, unsigned Val) { + assert(T < NUM_INST_CNTS); + ScoreUBs[T] = Val; + + if (T != EXP_CNT) + return; + + if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); + } + + void setRegScore(int GprNo, InstCounterType T, unsigned Val) { + setScoreByInterval({GprNo, GprNo + 1}, T, Val); + } + + void setScoreByInterval(RegInterval Interval, InstCounterType CntTy, + unsigned Score); + + void setScoreByOperand(const MachineInstr *MI, const SIRegisterInfo *TRI, + const MachineRegisterInfo *MRI, + const MachineOperand &Op, InstCounterType CntTy, + unsigned Val); + + const GCNSubtarget *ST = nullptr; + InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; + HardwareLimits Limits = {}; + const unsigned *WaitEventMaskForInst; + InstCounterType SmemAccessCounter; + unsigned ScoreLBs[NUM_INST_CNTS] = {0}; + unsigned ScoreUBs[NUM_INST_CNTS] = {0}; + unsigned PendingEvents = 0; + // Remember the last flat memory operation. + unsigned LastFlat[NUM_INST_CNTS] = {0}; + // Remember the last GDS operation. + unsigned LastGDS = 0; + // wait_cnt scores for every vgpr. + // Keep track of the VgprUB and SgprUB to make merge at join efficient. + int VgprUB = -1; + int SgprUB = -1; + unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}}; + // Wait cnt scores for every sgpr, the DS_CNT (corresponding to LGKMcnt + // pre-gfx12) or KM_CNT (gfx12+ only), and X_CNT (gfx1250) are relevant. + // Row 0 represents the score for either DS_CNT or KM_CNT and row 1 keeps the + // X_CNT score. + unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}}; + // Bitmask of the VmemTypes of VMEM instructions that might have a pending + // write to each vgpr. + unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0}; + // Store representative LDS DMA operations. The only useful info here is + // alias info. One store is kept per unique AAInfo. 
+ SmallVector LDSDMAStores; +}; + class SIInsertWaitcntsLegacy : public MachineFunctionPass { public: static char ID; From 935fd986474fa2b6fe9e424ffd5d8cbc875151be Mon Sep 17 00:00:00 2001 From: George Burgess IV Date: Thu, 17 Jul 2025 07:09:10 -0600 Subject: [PATCH 172/813] [Docs] remove beginner office hours from GettingInvolved (#149160) These were turned down at the beginning of this year; thanks to the folks on https://discourse.llvm.org/t/is-the-beginner-office-hours-still-running/87398/2 for flagging this! --- N.B., I tried testing via `ninja doxygen-llvm`, but that didn't terminate on my machine within 30mins (either with or without this patch). I assume it's some local config bug on my end, but it happened on `main` and `main~1000`, so I'm not sure how to test. Since the change is pretty trivial, still comfortable uploading for review. --- llvm/docs/GettingInvolved.rst | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index dc53072e09e39..d87a8bd81cc7b 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -354,11 +354,6 @@ The :doc:`CodeOfConduct` applies to all office hours. - Every first Friday of the month, 14:00 UK time, for 60 minutes. - `Google meet `__ - English, Portuguese - * - Rotating hosts - - Getting Started, beginner questions, new contributors. - - Every Tuesday at 2 PM ET (11 AM PT), for 30 minutes. - - `Google meet `__ - - English For event owners, our Discord bot also supports sending automated announcements of upcoming office hours. 
Please see the :ref:`discord-bot-event-pings` section From 4166df2073b6b3e5c7ab0c25d9bc73980b50ea31 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 15 Jul 2025 14:02:09 +0200 Subject: [PATCH 173/813] [RISCV][test] Add tests for vector subtraction if above threshold --- .../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 214 ++++++++++++++++++ llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll | 214 ++++++++++++++++++ 2 files changed, 428 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 0c30cbe4a42ef..35b9457fbc1ff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -5707,3 +5707,217 @@ define void @msub_vv_v2i64_2(ptr %x, <2 x i64> %y) { store <2 x i64> %c, ptr %x ret void } + +define <8 x i8> @vsub_if_uge_v8i8(<8 x i8> %va, <8 x i8> %vb) { +; CHECK-LABEL: vsub_if_uge_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmsltu.vv v0, v8, v9 +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %cmp = icmp ult <8 x i8> %va, %vb + %select = select <8 x i1> %cmp, <8 x i8> zeroinitializer, <8 x i8> %vb + %sub = sub nuw <8 x i8> %va, %select + ret <8 x i8> %sub +} + +define <8 x i8> @vsub_if_uge_swapped_v8i8(<8 x i8> %va, <8 x i8> %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vmsleu.vv v0, v9, v8 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %cmp = icmp uge <8 x i8> %va, %vb + %select = select <8 x i1> %cmp, <8 x i8> %vb, <8 x i8> zeroinitializer + %sub = sub nuw <8 x i8> %va, %select + ret <8 x i8> %sub +} + +define <8 x i16> @vsub_if_uge_v8i16(<8 x i16> %va, <8 x i16> %vb) { +; CHECK-LABEL: vsub_if_uge_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmsltu.vv v0, v8, v9 +; CHECK-NEXT: vsub.vv v9, v8, v9 +; 
CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %cmp = icmp ult <8 x i16> %va, %vb + %select = select <8 x i1> %cmp, <8 x i16> zeroinitializer, <8 x i16> %vb + %sub = sub nuw <8 x i16> %va, %select + ret <8 x i16> %sub +} + +define <8 x i16> @vsub_if_uge_swapped_v8i16(<8 x i16> %va, <8 x i16> %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vmsleu.vv v0, v9, v8 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %cmp = icmp uge <8 x i16> %va, %vb + %select = select <8 x i1> %cmp, <8 x i16> %vb, <8 x i16> zeroinitializer + %sub = sub nuw <8 x i16> %va, %select + ret <8 x i16> %sub +} + +define <4 x i32> @vsub_if_uge_v4i32(<4 x i32> %va, <4 x i32> %vb) { +; CHECK-LABEL: vsub_if_uge_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmsltu.vv v0, v8, v9 +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %cmp = icmp ult <4 x i32> %va, %vb + %select = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %vb + %sub = sub nuw <4 x i32> %va, %select + ret <4 x i32> %sub +} + +define <4 x i32> @vsub_if_uge_swapped_v4i32(<4 x i32> %va, <4 x i32> %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmsleu.vv v0, v9, v8 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %cmp = icmp uge <4 x i32> %va, %vb + %select = select <4 x i1> %cmp, <4 x i32> %vb, <4 x i32> zeroinitializer + %sub = sub nuw <4 x i32> %va, %select + ret <4 x i32> %sub +} + +define <2 x i64> @vsub_if_uge_v2i64(<2 x i64> %va, <2 x i64> %vb) { +; CHECK-LABEL: vsub_if_uge_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vmsltu.vv v0, v8, v9 +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %cmp = icmp ult <2 x i64> %va, %vb + %select = select <2 x 
i1> %cmp, <2 x i64> zeroinitializer, <2 x i64> %vb + %sub = sub nuw <2 x i64> %va, %select + ret <2 x i64> %sub +} + +define <2 x i64> @vsub_if_uge_swapped_v2i64(<2 x i64> %va, <2 x i64> %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vmsleu.vv v0, v9, v8 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %cmp = icmp uge <2 x i64> %va, %vb + %select = select <2 x i1> %cmp, <2 x i64> %vb, <2 x i64> zeroinitializer + %sub = sub nuw <2 x i64> %va, %select + ret <2 x i64> %sub +} + +define <8 x i8> @sub_if_uge_C_v8i8(<8 x i8> %x) { +; CHECK-LABEL: sub_if_uge_C_v8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vi v0, v8, 12 +; CHECK-NEXT: vadd.vi v8, v8, -13, v0.t +; CHECK-NEXT: ret + %cmp = icmp ugt <8 x i8> %x, splat (i8 12) + %sub = add <8 x i8> %x, splat (i8 -13) + %select = select <8 x i1> %cmp, <8 x i8> %sub, <8 x i8> %x + ret <8 x i8> %select +} + +define <8 x i16> @sub_if_uge_C_v8i16(<8 x i16> %x) { +; CHECK-LABEL: sub_if_uge_C_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 2000 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu +; CHECK-NEXT: vmsgtu.vx v0, v8, a0 +; CHECK-NEXT: li a0, -2001 +; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %cmp = icmp ugt <8 x i16> %x, splat (i16 2000) + %sub = add <8 x i16> %x, splat (i16 -2001) + %select = select <8 x i1> %cmp, <8 x i16> %sub, <8 x i16> %x + ret <8 x i16> %select +} + +define <4 x i32> @sub_if_uge_C_v4i32(<4 x i32> %x) { +; CHECK-LABEL: sub_if_uge_C_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vmsgtu.vx v0, v8, a0 +; CHECK-NEXT: lui a0, 1048560 +; CHECK-NEXT: addi a0, a0, 15 +; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %cmp = icmp ugt <4 x i32> %x, splat (i32 65520) + %sub = add <4 x i32> %x, splat (i32 -65521) + %select = select <4 x i1> %cmp, 
<4 x i32> %sub, <4 x i32> %x + ret <4 x i32> %select +} + +define <4 x i32> @sub_if_uge_C_swapped_v4i32(<4 x i32> %x) { +; CHECK-LABEL: sub_if_uge_C_swapped_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmsltu.vx v0, v8, a0 +; CHECK-NEXT: lui a0, 1048560 +; CHECK-NEXT: addi a0, a0, 15 +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %cmp = icmp ult <4 x i32> %x, splat (i32 65521) + %sub = add <4 x i32> %x, splat (i32 -65521) + %select = select <4 x i1> %cmp, <4 x i32> %x, <4 x i32> %sub + ret <4 x i32> %select +} + +define <2 x i64> @sub_if_uge_C_v2i64(<2 x i64> %x) nounwind { +; RV32-LABEL: sub_if_uge_C_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: lui a1, 172127 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: addi a1, a1, 512 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a0, -2 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV32-NEXT: vlse64.v v9, (a2), zero +; RV32-NEXT: lui a1, 876449 +; RV32-NEXT: addi a1, a1, -513 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vmsltu.vv v0, v9, v8 +; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sub_if_uge_C_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: lui a0, 2384 +; RV64-NEXT: addi a0, a0, 761 +; RV64-NEXT: slli a0, a0, 9 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; RV64-NEXT: vmsgtu.vx v0, v8, a0 +; RV64-NEXT: lui a0, 1048278 +; RV64-NEXT: addi a0, a0, -95 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -513 +; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %cmp = icmp ugt <2 x i64> %x, splat (i64 5000000000) + %sub = add <2 x i64> %x, splat (i64 -5000000001) + %select = select <2 x i1> %cmp, <2 x i64> %sub, <2 x i64> %x + ret <2 x i64> %select +} 
diff --git a/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll index e3b2d6c1efe1f..a21a526e00ec8 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll @@ -893,3 +893,217 @@ define @vmin_vi_mask_nxv8i32( %va, %cmp, %va, %vs ret %vc } + +define @vsub_if_uge_nxv2i8( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-NEXT: vmsltu.vv v0, v8, v9 +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %cmp = icmp ult %va, %vb + %select = select %cmp, zeroinitializer, %vb + %sub = sub nuw %va, %select + ret %sub +} + +define @vsub_if_uge_swapped_nxv2i8( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; CHECK-NEXT: vmsleu.vv v0, v9, v8 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %cmp = icmp uge %va, %vb + %select = select %cmp, %vb, zeroinitializer + %sub = sub nuw %va, %select + ret %sub +} + +define @vsub_if_uge_nxv2i16( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmsltu.vv v0, v8, v9 +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %cmp = icmp ult %va, %vb + %select = select %cmp, zeroinitializer, %vb + %sub = sub nuw %va, %select + ret %sub +} + +define @vsub_if_uge_swapped_nxv2i16( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu +; CHECK-NEXT: vmsleu.vv v0, v9, v8 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %cmp = icmp uge %va, %vb + %select = select %cmp, %vb, zeroinitializer + %sub = sub nuw %va, %select + ret %sub +} + +define @vsub_if_uge_nxv2i32( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, 
zero, e32, m1, ta, ma +; CHECK-NEXT: vmsltu.vv v0, v8, v9 +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %cmp = icmp ult %va, %vb + %select = select %cmp, zeroinitializer, %vb + %sub = sub nuw %va, %select + ret %sub +} + +define @vsub_if_uge_swapped_nxv2i32( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu +; CHECK-NEXT: vmsleu.vv v0, v9, v8 +; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: ret + %cmp = icmp uge %va, %vb + %select = select %cmp, %vb, zeroinitializer + %sub = sub nuw %va, %select + ret %sub +} + +define @vsub_if_uge_nxv2i64( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vmsltu.vv v0, v8, v10 +; CHECK-NEXT: vsub.vv v10, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: ret + %cmp = icmp ult %va, %vb + %select = select %cmp, zeroinitializer, %vb + %sub = sub nuw %va, %select + ret %sub +} + +define @vsub_if_uge_swapped_nxv2i64( %va, %vb) { +; CHECK-LABEL: vsub_if_uge_swapped_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu +; CHECK-NEXT: vmsleu.vv v0, v10, v8 +; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: ret + %cmp = icmp uge %va, %vb + %select = select %cmp, %vb, zeroinitializer + %sub = sub nuw %va, %select + ret %sub +} + +define @sub_if_uge_C_nxv2i8( %x) { +; CHECK-LABEL: sub_if_uge_C_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu +; CHECK-NEXT: vmsgtu.vi v0, v8, 12 +; CHECK-NEXT: vadd.vi v8, v8, -13, v0.t +; CHECK-NEXT: ret + %cmp = icmp ugt %x, splat (i8 12) + %sub = add %x, splat (i8 -13) + %select = select %cmp, %sub, %x + ret %select +} + +define @sub_if_uge_C_nxv2i16( %x) { +; CHECK-LABEL: sub_if_uge_C_nxv2i16: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 2000 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, mu +; CHECK-NEXT: vmsgtu.vx v0, v8, a0 +; 
CHECK-NEXT: li a0, -2001 +; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %cmp = icmp ugt %x, splat (i16 2000) + %sub = add %x, splat (i16 -2001) + %select = select %cmp, %sub, %x + ret %select +} + +define @sub_if_uge_C_nxv2i32( %x) { +; CHECK-LABEL: sub_if_uge_C_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu +; CHECK-NEXT: vmsgtu.vx v0, v8, a0 +; CHECK-NEXT: lui a0, 1048560 +; CHECK-NEXT: addi a0, a0, 15 +; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: ret + %cmp = icmp ugt %x, splat (i32 65520) + %sub = add %x, splat (i32 -65521) + %select = select %cmp, %sub, %x + ret %select +} + +define @sub_if_uge_C_swapped_nxv2i32( %x) { +; CHECK-LABEL: sub_if_uge_C_swapped_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vmsltu.vx v0, v8, a0 +; CHECK-NEXT: lui a0, 1048560 +; CHECK-NEXT: addi a0, a0, 15 +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: ret + %cmp = icmp ult %x, splat (i32 65521) + %sub = add %x, splat (i32 -65521) + %select = select %cmp, %x, %sub + ret %select +} + +define @sub_if_uge_C_nxv2i64( %x) nounwind { +; RV32-LABEL: sub_if_uge_C_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: li a0, 1 +; RV32-NEXT: lui a1, 172127 +; RV32-NEXT: mv a2, sp +; RV32-NEXT: addi a1, a1, 512 +; RV32-NEXT: sw a1, 0(sp) +; RV32-NEXT: sw a0, 4(sp) +; RV32-NEXT: li a0, -2 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; RV32-NEXT: vlse64.v v10, (a2), zero +; RV32-NEXT: lui a1, 876449 +; RV32-NEXT: addi a1, a1, -513 +; RV32-NEXT: sw a1, 8(sp) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vmsltu.vv v0, v10, v8 +; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: sub_if_uge_C_nxv2i64: +; RV64: # 
%bb.0: +; RV64-NEXT: lui a0, 2384 +; RV64-NEXT: addi a0, a0, 761 +; RV64-NEXT: slli a0, a0, 9 +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, mu +; RV64-NEXT: vmsgtu.vx v0, v8, a0 +; RV64-NEXT: lui a0, 1048278 +; RV64-NEXT: addi a0, a0, -95 +; RV64-NEXT: slli a0, a0, 12 +; RV64-NEXT: addi a0, a0, -513 +; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: ret + %cmp = icmp ugt %x, splat (i64 5000000000) + %sub = add %x, splat (i64 -5000000001) + %select = select %cmp, %sub, %x + ret %select +} From fc5c5a934d2560559221bcb334b14ef4aa96a2dd Mon Sep 17 00:00:00 2001 From: jyli0116 Date: Thu, 17 Jul 2025 14:43:58 +0100 Subject: [PATCH 174/813] [GlobalISel] Allow expansion of srem by constant in prelegalizer (#148845) This patch allows srem by a constant to be expanded more efficiently to avoid the need for expensive sdiv instructions. This is the last part of the patches which fixes #118090 --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 23 +- .../include/llvm/Target/GlobalISel/Combine.td | 20 +- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 45 +- llvm/test/CodeGen/AArch64/rem-by-const.ll | 819 ++++++++---------- .../CodeGen/AMDGPU/GlobalISel/srem.i32.ll | 158 +--- 5 files changed, 449 insertions(+), 616 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 31f1197b9723b..da829046cc421 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -700,18 +700,19 @@ class CombinerHelper { /// Given an G_UDIV \p MI or G_UREM \p MI expressing a divide by constant, /// return an expression that implements it by multiplying by a magic number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". - MachineInstr *buildUDivorURemUsingMul(MachineInstr &MI) const; + MachineInstr *buildUDivOrURemUsingMul(MachineInstr &MI) const; /// Combine G_UDIV or G_UREM by constant into a multiply by magic constant. 
- bool matchUDivorURemByConst(MachineInstr &MI) const; - void applyUDivorURemByConst(MachineInstr &MI) const; - - /// Given an G_SDIV \p MI expressing a signed divide by constant, return an - /// expression that implements it by multiplying by a magic number. - /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". - MachineInstr *buildSDivUsingMul(MachineInstr &MI) const; - /// Combine G_SDIV by constant into a multiply by magic constant. - bool matchSDivByConst(MachineInstr &MI) const; - void applySDivByConst(MachineInstr &MI) const; + bool matchUDivOrURemByConst(MachineInstr &MI) const; + void applyUDivOrURemByConst(MachineInstr &MI) const; + + /// Given an G_SDIV \p MI or G_SREM \p MI expressing a signed divide by + /// constant, return an expression that implements it by multiplying by a + /// magic number. Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's + /// Guide". + MachineInstr *buildSDivOrSRemUsingMul(MachineInstr &MI) const; + /// Combine G_SDIV or G_SREM by constant into a multiply by magic constant. + bool matchSDivOrSRemByConst(MachineInstr &MI) const; + void applySDivOrSRemByConst(MachineInstr &MI) const; /// Given an G_SDIV \p MI expressing a signed divided by a pow2 constant, /// return expressions that implements it by shifting. 
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 66051d756c808..fc81ab76dc72d 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1132,14 +1132,14 @@ def form_bitfield_extract : GICombineGroup<[bitfield_extract_from_sext_inreg, def udiv_by_const : GICombineRule< (defs root:$root), (match (G_UDIV $dst, $x, $y):$root, - [{ return Helper.matchUDivorURemByConst(*${root}); }]), - (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>; + [{ return Helper.matchUDivOrURemByConst(*${root}); }]), + (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>; def sdiv_by_const : GICombineRule< (defs root:$root), (match (G_SDIV $dst, $x, $y):$root, - [{ return Helper.matchSDivByConst(*${root}); }]), - (apply [{ Helper.applySDivByConst(*${root}); }])>; + [{ return Helper.matchSDivOrSRemByConst(*${root}); }]), + (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>; def sdiv_by_pow2 : GICombineRule< (defs root:$root), @@ -1159,10 +1159,16 @@ def intdiv_combines : GICombineGroup<[udiv_by_pow2, sdiv_by_pow2, def urem_by_const : GICombineRule< (defs root:$root), (match (G_UREM $dst, $x, $y):$root, - [{ return Helper.matchUDivorURemByConst(*${root}); }]), - (apply [{ Helper.applyUDivorURemByConst(*${root}); }])>; + [{ return Helper.matchUDivOrURemByConst(*${root}); }]), + (apply [{ Helper.applyUDivOrURemByConst(*${root}); }])>; -def intrem_combines : GICombineGroup<[urem_by_const]>; +def srem_by_const : GICombineRule< + (defs root:$root), + (match (G_SREM $dst, $x, $y):$root, + [{ return Helper.matchSDivOrSRemByConst(*${root}); }]), + (apply [{ Helper.applySDivOrSRemByConst(*${root}); }])>; + +def intrem_combines : GICombineGroup<[urem_by_const, srem_by_const]>; def reassoc_ptradd : GICombineRule< (defs root:$root, build_fn_matchinfo:$matchinfo), diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp 
index 3922eba55e195..e8f513ad5a7a9 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -5300,7 +5300,7 @@ bool CombinerHelper::matchSubAddSameReg(MachineInstr &MI, return false; } -MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const { +MachineInstr *CombinerHelper::buildUDivOrURemUsingMul(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM); auto &UDivorRem = cast(MI); @@ -5468,7 +5468,7 @@ MachineInstr *CombinerHelper::buildUDivorURemUsingMul(MachineInstr &MI) const { return ret; } -bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const { +bool CombinerHelper::matchUDivOrURemByConst(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); assert(Opcode == TargetOpcode::G_UDIV || Opcode == TargetOpcode::G_UREM); Register Dst = MI.getOperand(0).getReg(); @@ -5517,13 +5517,14 @@ bool CombinerHelper::matchUDivorURemByConst(MachineInstr &MI) const { MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } -void CombinerHelper::applyUDivorURemByConst(MachineInstr &MI) const { - auto *NewMI = buildUDivorURemUsingMul(MI); +void CombinerHelper::applyUDivOrURemByConst(MachineInstr &MI) const { + auto *NewMI = buildUDivOrURemUsingMul(MI); replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg()); } -bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV"); +bool CombinerHelper::matchSDivOrSRemByConst(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert(Opcode == TargetOpcode::G_SDIV || Opcode == TargetOpcode::G_SREM); Register Dst = MI.getOperand(0).getReg(); Register RHS = MI.getOperand(2).getReg(); LLT DstTy = MRI.getType(Dst); @@ -5543,7 +5544,8 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { return false; // If the sdiv has an 'exact' flag we can use a simpler 
lowering. - if (MI.getFlag(MachineInstr::MIFlag::IsExact)) { + if (Opcode == TargetOpcode::G_SDIV && + MI.getFlag(MachineInstr::MIFlag::IsExact)) { return matchUnaryPredicate( MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } @@ -5559,23 +5561,28 @@ bool CombinerHelper::matchSDivByConst(MachineInstr &MI) const { if (!isLegal({TargetOpcode::G_SMULH, {DstTy}}) && !isLegalOrHasWidenScalar({TargetOpcode::G_MUL, {WideTy, WideTy}})) return false; + if (Opcode == TargetOpcode::G_SREM && + !isLegalOrBeforeLegalizer({TargetOpcode::G_SUB, {DstTy, DstTy}})) + return false; } return matchUnaryPredicate( MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); }); } -void CombinerHelper::applySDivByConst(MachineInstr &MI) const { - auto *NewMI = buildSDivUsingMul(MI); +void CombinerHelper::applySDivOrSRemByConst(MachineInstr &MI) const { + auto *NewMI = buildSDivOrSRemUsingMul(MI); replaceSingleDefInstWithReg(MI, NewMI->getOperand(0).getReg()); } -MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const { - assert(MI.getOpcode() == TargetOpcode::G_SDIV && "Expected SDIV"); - auto &SDiv = cast(MI); - Register Dst = SDiv.getReg(0); - Register LHS = SDiv.getReg(1); - Register RHS = SDiv.getReg(2); +MachineInstr *CombinerHelper::buildSDivOrSRemUsingMul(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + assert(MI.getOpcode() == TargetOpcode::G_SDIV || + Opcode == TargetOpcode::G_SREM); + auto &SDivorRem = cast(MI); + Register Dst = SDivorRem.getReg(0); + Register LHS = SDivorRem.getReg(1); + Register RHS = SDivorRem.getReg(2); LLT Ty = MRI.getType(Dst); LLT ScalarTy = Ty.getScalarType(); const unsigned EltBits = ScalarTy.getScalarSizeInBits(); @@ -5705,7 +5712,13 @@ MachineInstr *CombinerHelper::buildSDivUsingMul(MachineInstr &MI) const { auto SignShift = MIB.buildConstant(ShiftAmtTy, EltBits - 1); auto T = MIB.buildLShr(Ty, Q, SignShift); T = MIB.buildAnd(Ty, T, ShiftMask); - return MIB.buildAdd(Ty, Q, T); + auto ret = 
MIB.buildAdd(Ty, Q, T); + + if (Opcode == TargetOpcode::G_SREM) { + auto Prod = MIB.buildMul(Ty, ret, RHS); + return MIB.buildSub(Ty, LHS, Prod); + } + return ret; } bool CombinerHelper::matchDivByPow2(MachineInstr &MI, bool IsSigned) const { diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index 1376f5d9a380d..b124042265d40 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -19,8 +19,13 @@ define i8 @si8_7(i8 %a, i8 %b) { ; CHECK-GI-LABEL: si8_7: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxtb w8, w0 -; CHECK-GI-NEXT: mov w9, #7 // =0x7 -; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, #-109 // =0xffffff93 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: add w8, w0, w8, asr #8 +; CHECK-GI-NEXT: sbfx w8, w8, #2, #6 +; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 @@ -45,8 +50,14 @@ define i8 @si8_100(i8 %a, i8 %b) { ; CHECK-GI-LABEL: si8_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: mov w9, #41 // =0x29 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: sxth w8, w8 +; CHECK-GI-NEXT: sbfx w8, w8, #8, #8 +; CHECK-GI-NEXT: asr w8, w8, #4 +; CHECK-GI-NEXT: ubfx w9, w8, #7, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: mov w9, #100 // =0x64 -; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: msub w0, w8, w9, w0 ; CHECK-GI-NEXT: ret entry: @@ -129,8 +140,12 @@ define i16 @si16_7(i16 %a, i16 %b) { ; CHECK-GI-LABEL: si16_7: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxth w8, w0 -; CHECK-GI-NEXT: mov w9, #7 // =0x7 -; CHECK-GI-NEXT: sdiv w8, w8, w9 +; CHECK-GI-NEXT: mov w9, #18725 // =0x4925 +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: asr w8, w8, #16 +; CHECK-GI-NEXT: asr w8, w8, #1 +; CHECK-GI-NEXT: ubfx w9, w8, #15, #1 +; CHECK-GI-NEXT: add w8, w8, 
w9 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 @@ -155,8 +170,13 @@ define i16 @si16_100(i16 %a, i16 %b) { ; CHECK-GI-LABEL: si16_100: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: mov w9, #5243 // =0x147b +; CHECK-GI-NEXT: mul w8, w8, w9 +; CHECK-GI-NEXT: asr w8, w8, #16 +; CHECK-GI-NEXT: asr w8, w8, #3 +; CHECK-GI-NEXT: ubfx w9, w8, #15, #1 +; CHECK-GI-NEXT: add w8, w8, w9 ; CHECK-GI-NEXT: mov w9, #100 // =0x64 -; CHECK-GI-NEXT: sdiv w8, w8, w9 ; CHECK-GI-NEXT: msub w0, w8, w9, w0 ; CHECK-GI-NEXT: ret entry: @@ -240,8 +260,13 @@ define i32 @si32_7(i32 %a, i32 %b) { ; ; CHECK-GI-LABEL: si32_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: sdiv w8, w0, w8 +; CHECK-GI-NEXT: mov w8, #9363 // =0x2493 +; CHECK-GI-NEXT: movk w8, #37449, lsl #16 +; CHECK-GI-NEXT: smull x8, w0, w8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: add w8, w8, w0 +; CHECK-GI-NEXT: asr w8, w8, #2 +; CHECK-GI-NEXT: add w8, w8, w8, lsr #31 ; CHECK-GI-NEXT: lsl w9, w8, #3 ; CHECK-GI-NEXT: sub w8, w9, w8 ; CHECK-GI-NEXT: sub w0, w0, w8 @@ -265,9 +290,14 @@ define i32 @si32_100(i32 %a, i32 %b) { ; ; CHECK-GI-LABEL: si32_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: sdiv w9, w0, w8 -; CHECK-GI-NEXT: msub w0, w9, w8, w0 +; CHECK-GI-NEXT: mov w8, #34079 // =0x851f +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: movk w8, #20971, lsl #16 +; CHECK-GI-NEXT: smull x8, w0, w8 +; CHECK-GI-NEXT: asr x8, x8, #32 +; CHECK-GI-NEXT: asr w8, w8, #5 +; CHECK-GI-NEXT: add w8, w8, w8, lsr #31 +; CHECK-GI-NEXT: msub w0, w8, w9, w0 ; CHECK-GI-NEXT: ret entry: %s = srem i32 %a, 100 @@ -348,8 +378,13 @@ define i64 @si64_7(i64 %a, i64 %b) { ; ; CHECK-GI-LABEL: si64_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: sdiv x8, x0, x8 +; CHECK-GI-NEXT: mov x8, #18725 // =0x4925 +; CHECK-GI-NEXT: movk 
x8, #9362, lsl #16 +; CHECK-GI-NEXT: movk x8, #37449, lsl #32 +; CHECK-GI-NEXT: movk x8, #18724, lsl #48 +; CHECK-GI-NEXT: smulh x8, x0, x8 +; CHECK-GI-NEXT: asr x8, x8, #1 +; CHECK-GI-NEXT: add x8, x8, x8, lsr #63 ; CHECK-GI-NEXT: lsl x9, x8, #3 ; CHECK-GI-NEXT: sub x8, x9, x8 ; CHECK-GI-NEXT: sub x0, x0, x8 @@ -376,9 +411,16 @@ define i64 @si64_100(i64 %a, i64 %b) { ; ; CHECK-GI-LABEL: si64_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: sdiv x9, x0, x8 -; CHECK-GI-NEXT: msub x0, x9, x8, x0 +; CHECK-GI-NEXT: mov x8, #55051 // =0xd70b +; CHECK-GI-NEXT: mov w9, #100 // =0x64 +; CHECK-GI-NEXT: movk x8, #28835, lsl #16 +; CHECK-GI-NEXT: movk x8, #2621, lsl #32 +; CHECK-GI-NEXT: movk x8, #41943, lsl #48 +; CHECK-GI-NEXT: smulh x8, x0, x8 +; CHECK-GI-NEXT: add x8, x8, x0 +; CHECK-GI-NEXT: asr x8, x8, #6 +; CHECK-GI-NEXT: add x8, x8, x8, lsr #63 +; CHECK-GI-NEXT: msub x0, x8, x9, x0 ; CHECK-GI-NEXT: ret entry: %s = srem i64 %a, 100 @@ -644,25 +686,49 @@ define <2 x i8> @sv2i8_7(<2 x i8> %d, <2 x i8> %e) { ; ; CHECK-GI-LABEL: sv2i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: mov w8, #65427 // =0xff93 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] ; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: sdiv w9, w9, w8 ; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: smov w11, v1.h[1] -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: smov w10, v1.h[0] +; CHECK-GI-NEXT: smov w8, v1.h[0] +; CHECK-GI-NEXT: smov w9, v1.h[1] +; CHECK-GI-NEXT: shl v1.2s, v0.2s, #24 +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov w8, #8 // =0x8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; 
CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: mov v2.b[1], w8 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v0.2s +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: neg v2.8b, v2.8b +; CHECK-GI-NEXT: mov w9, v1.s[1] +; CHECK-GI-NEXT: mov v1.b[1], w9 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: movi v3.2s, #7 +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: umov w8, v1.b[0] +; CHECK-GI-NEXT: umov w10, v1.b[1] +; CHECK-GI-NEXT: umov w9, v2.b[0] +; CHECK-GI-NEXT: umov w11, v2.b[1] +; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: fmov s1, w10 -; CHECK-GI-NEXT: mov v1.s[1], w11 -; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i8> %d, @@ -687,25 +753,46 @@ define <2 x i8> @sv2i8_100(<2 x i8> %d, <2 x i8> %e) { ; ; CHECK-GI-LABEL: sv2i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #24 +; CHECK-GI-NEXT: mov w8, #41 // =0x29 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] ; CHECK-GI-NEXT: shl v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: sdiv w9, w9, w8 ; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 -; CHECK-GI-NEXT: smov w11, v1.h[1] -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: smov w10, v1.h[0] +; CHECK-GI-NEXT: smov w8, v1.h[0] +; CHECK-GI-NEXT: smov w9, v1.h[1] +; CHECK-GI-NEXT: 
shl v1.2s, v0.2s, #24 +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #24 +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov w8, #8 // =0x8 +; CHECK-GI-NEXT: mov v2.s[1], w9 +; CHECK-GI-NEXT: mul v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #4 // =0x4 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: neg v3.8b, v3.8b +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: mov v2.b[1], w8 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v3.8b +; CHECK-GI-NEXT: neg v2.8b, v2.8b +; CHECK-GI-NEXT: movi v3.2s, #100 +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: umov w8, v1.b[0] +; CHECK-GI-NEXT: umov w10, v1.b[1] +; CHECK-GI-NEXT: umov w9, v2.b[0] +; CHECK-GI-NEXT: umov w11, v2.b[1] +; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: fmov s1, w10 -; CHECK-GI-NEXT: mov v1.s[1], w11 -; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mov v1.s[1], w10 +; CHECK-GI-NEXT: mov v2.s[1], w11 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i8> %d, @@ -872,30 +959,37 @@ define <4 x i8> @sv4i8_7(<4 x i8> %d, <4 x i8> %e) { ; ; CHECK-GI-LABEL: sv4i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v3.4h, #7 -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: mov v2.h[1], w8 -; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, 
v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w9, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w9 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: mov w8, #147 // =0x93 +; CHECK-GI-NEXT: shl v2.4h, v0.4h, #8 +; CHECK-GI-NEXT: mov w9, #7 // =0x7 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: sshr v2.4h, v2.4h, #8 +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: mov v4.b[1], w9 +; CHECK-GI-NEXT: mov v1.b[2], w8 +; CHECK-GI-NEXT: mov v4.b[2], w9 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: mov w8, #2 // =0x2 +; CHECK-GI-NEXT: mov v4.b[3], w9 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h +; CHECK-GI-NEXT: fmov d2, d0 +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #8 +; CHECK-GI-NEXT: mov v3.b[3], w8 +; CHECK-GI-NEXT: uzp1 v1.8b, v2.8b, v0.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: dup v3.4h, w9 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: neg v2.8b, v4.8b +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i8> %d, @@ -943,30 +1037,37 @@ define <4 x i8> @sv4i8_100(<4 x i8> %d, <4 x i8> %e) { ; ; CHECK-GI-LABEL: sv4i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-GI-NEXT: mov w8, #41 // =0x29 +; CHECK-GI-NEXT: shl v2.4h, v0.4h, #8 +; CHECK-GI-NEXT: mov w9, #7 // =0x7 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: fmov s4, w9 +; CHECK-GI-NEXT: 
sshr v2.4h, v2.4h, #8 +; CHECK-GI-NEXT: mov v1.b[1], w8 +; CHECK-GI-NEXT: mov v4.b[1], w9 +; CHECK-GI-NEXT: mov v1.b[2], w8 +; CHECK-GI-NEXT: mov v4.b[2], w9 +; CHECK-GI-NEXT: mov v1.b[3], w8 +; CHECK-GI-NEXT: mov w8, #4 // =0x4 +; CHECK-GI-NEXT: mov v4.b[3], w9 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: mov v3.b[1], w8 +; CHECK-GI-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: mul v1.4h, v2.4h, v1.4h +; CHECK-GI-NEXT: mov v3.b[2], w8 +; CHECK-GI-NEXT: sshr v1.4h, v1.4h, #8 +; CHECK-GI-NEXT: mov v3.b[3], w8 ; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v3.4h, #100 -; CHECK-GI-NEXT: fmov s2, w8 -; CHECK-GI-NEXT: shl v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: mov v2.h[1], w8 -; CHECK-GI-NEXT: sshll v3.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshr v0.4s, v0.4s, #24 -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: mov v3.d[1], v2.d[0] -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w9, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w9 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v3.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: uzp1 v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: neg v2.8b, v3.8b +; CHECK-GI-NEXT: dup v3.4h, w8 +; CHECK-GI-NEXT: sshl v1.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: neg v2.8b, v4.8b +; CHECK-GI-NEXT: ushl v2.8b, v1.8b, v2.8b +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-GI-NEXT: add v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: mls v0.4h, v1.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i8> %d, @@ -988,42 +1089,15 @@ define <8 x i8> @sv8i8_7(<8 x i8> %d, <8 x i8> %e) { ; ; CHECK-GI-LABEL: sv8i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mov w8, #7 
// =0x7 -; CHECK-GI-NEXT: movi v4.8b, #7 -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v5.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v5.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: movi v1.8b, #147 +; CHECK-GI-NEXT: movi v3.8b, #7 +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: add v1.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #2 +; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7 +; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #2 +; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i8> %d, @@ -1044,42 +1118,14 @@ define <8 x i8> @sv8i8_100(<8 x i8> %d, <8 x i8> %e) { ; ; CHECK-GI-LABEL: sv8i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v4.8b, #100 -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; 
CHECK-GI-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v5.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sshll2 v4.4s, v4.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v5.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h -; CHECK-GI-NEXT: xtn v0.8b, v0.8h +; CHECK-GI-NEXT: movi v1.8b, #41 +; CHECK-GI-NEXT: movi v3.8b, #100 +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: shrn v1.8b, v1.8h, #8 +; CHECK-GI-NEXT: sshr v2.8b, v1.8b, #4 +; CHECK-GI-NEXT: ushr v2.8b, v2.8b, #7 +; CHECK-GI-NEXT: ssra v2.8b, v1.8b, #4 +; CHECK-GI-NEXT: mls v0.8b, v2.8b, v3.8b ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i8> %d, @@ -1102,72 +1148,16 @@ define <16 x i8> @sv16i8_7(<16 x i8> %d, <16 x i8> %e) { ; ; CHECK-GI-LABEL: sv16i8_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll2 v3.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v16.8b, #7 -; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-GI-NEXT: 
fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s2 -; CHECK-GI-NEXT: fmov w17, s0 -; CHECK-GI-NEXT: fmov w2, s3 -; CHECK-GI-NEXT: mov w14, v2.s[1] -; CHECK-GI-NEXT: mov w18, v0.s[1] -; CHECK-GI-NEXT: mov w3, v3.s[1] -; CHECK-GI-NEXT: mov w15, v2.s[2] -; CHECK-GI-NEXT: mov w0, v0.s[2] -; CHECK-GI-NEXT: sdiv w11, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov w4, v3.s[2] -; CHECK-GI-NEXT: mov w16, v2.s[3] -; CHECK-GI-NEXT: mov w1, v0.s[3] -; CHECK-GI-NEXT: mov w5, v3.s[3] -; CHECK-GI-NEXT: sshll v17.4s, v16.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: sdiv w17, w17, w8 -; CHECK-GI-NEXT: fmov s5, w13 -; CHECK-GI-NEXT: sdiv w2, w2, w8 -; CHECK-GI-NEXT: fmov s6, w17 -; CHECK-GI-NEXT: sdiv w12, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[2] -; CHECK-GI-NEXT: fmov s7, w2 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v4.s[1], w12 -; CHECK-GI-NEXT: sdiv w18, w18, w8 -; CHECK-GI-NEXT: mov v5.s[1], w14 -; CHECK-GI-NEXT: sdiv w3, w3, w8 -; CHECK-GI-NEXT: mov v6.s[1], w18 -; CHECK-GI-NEXT: sdiv w10, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[3] -; CHECK-GI-NEXT: mov v7.s[1], w3 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v4.s[2], w10 -; CHECK-GI-NEXT: sdiv w0, w0, w8 -; CHECK-GI-NEXT: mov v5.s[2], w15 -; CHECK-GI-NEXT: sdiv w4, w4, w8 -; CHECK-GI-NEXT: mov v6.s[2], w0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: mov v7.s[2], w4 -; CHECK-GI-NEXT: sdiv w16, w16, w8 -; CHECK-GI-NEXT: mov v4.s[3], w9 -; CHECK-GI-NEXT: mls v1.4s, v4.4s, v17.4s -; CHECK-GI-NEXT: sdiv w1, w1, w8 -; CHECK-GI-NEXT: mov v5.s[3], w16 -; CHECK-GI-NEXT: mls v2.4s, v5.4s, v16.4s -; CHECK-GI-NEXT: sdiv w8, w5, w8 -; CHECK-GI-NEXT: mov v6.s[3], w1 -; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: mls v0.4s, v6.4s, v17.4s -; CHECK-GI-NEXT: mov v7.s[3], w8 -; CHECK-GI-NEXT: mls v3.4s, v7.4s, v16.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, 
v0.16b +; CHECK-GI-NEXT: movi v1.16b, #147 +; CHECK-GI-NEXT: movi v3.16b, #7 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: add v1.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: sshr v2.16b, v1.16b, #2 +; CHECK-GI-NEXT: ushr v2.16b, v2.16b, #7 +; CHECK-GI-NEXT: ssra v2.16b, v1.16b, #2 +; CHECK-GI-NEXT: mls v0.16b, v2.16b, v3.16b ; CHECK-GI-NEXT: ret entry: %s = srem <16 x i8> %d, @@ -1189,72 +1179,15 @@ define <16 x i8> @sv16i8_100(<16 x i8> %d, <16 x i8> %e) { ; ; CHECK-GI-LABEL: sv16i8_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v2.8h, v0.8b, #0 -; CHECK-GI-NEXT: sshll2 v3.8h, v0.16b, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v16.8b, #100 -; CHECK-GI-NEXT: sshll v1.4s, v2.4h, #0 -; CHECK-GI-NEXT: sshll2 v2.4s, v2.8h, #0 -; CHECK-GI-NEXT: sshll v0.4s, v3.4h, #0 -; CHECK-GI-NEXT: sshll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s2 -; CHECK-GI-NEXT: fmov w17, s0 -; CHECK-GI-NEXT: fmov w2, s3 -; CHECK-GI-NEXT: mov w14, v2.s[1] -; CHECK-GI-NEXT: mov w18, v0.s[1] -; CHECK-GI-NEXT: mov w3, v3.s[1] -; CHECK-GI-NEXT: mov w15, v2.s[2] -; CHECK-GI-NEXT: mov w0, v0.s[2] -; CHECK-GI-NEXT: sdiv w11, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[1] -; CHECK-GI-NEXT: mov w4, v3.s[2] -; CHECK-GI-NEXT: mov w16, v2.s[3] -; CHECK-GI-NEXT: mov w1, v0.s[3] -; CHECK-GI-NEXT: mov w5, v3.s[3] -; CHECK-GI-NEXT: sshll v17.4s, v16.4h, #0 -; CHECK-GI-NEXT: sshll2 v16.4s, v16.8h, #0 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s4, w11 -; CHECK-GI-NEXT: sdiv w17, w17, w8 -; CHECK-GI-NEXT: fmov s5, w13 -; CHECK-GI-NEXT: sdiv w2, w2, w8 -; CHECK-GI-NEXT: fmov s6, w17 -; CHECK-GI-NEXT: sdiv w12, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[2] -; CHECK-GI-NEXT: fmov s7, w2 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v4.s[1], w12 -; CHECK-GI-NEXT: sdiv w18, w18, w8 -; 
CHECK-GI-NEXT: mov v5.s[1], w14 -; CHECK-GI-NEXT: sdiv w3, w3, w8 -; CHECK-GI-NEXT: mov v6.s[1], w18 -; CHECK-GI-NEXT: sdiv w10, w9, w8 -; CHECK-GI-NEXT: mov w9, v1.s[3] -; CHECK-GI-NEXT: mov v7.s[1], w3 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v4.s[2], w10 -; CHECK-GI-NEXT: sdiv w0, w0, w8 -; CHECK-GI-NEXT: mov v5.s[2], w15 -; CHECK-GI-NEXT: sdiv w4, w4, w8 -; CHECK-GI-NEXT: mov v6.s[2], w0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: mov v7.s[2], w4 -; CHECK-GI-NEXT: sdiv w16, w16, w8 -; CHECK-GI-NEXT: mov v4.s[3], w9 -; CHECK-GI-NEXT: mls v1.4s, v4.4s, v17.4s -; CHECK-GI-NEXT: sdiv w1, w1, w8 -; CHECK-GI-NEXT: mov v5.s[3], w16 -; CHECK-GI-NEXT: mls v2.4s, v5.4s, v16.4s -; CHECK-GI-NEXT: sdiv w8, w5, w8 -; CHECK-GI-NEXT: mov v6.s[3], w1 -; CHECK-GI-NEXT: uzp1 v1.8h, v1.8h, v2.8h -; CHECK-GI-NEXT: mls v0.4s, v6.4s, v17.4s -; CHECK-GI-NEXT: mov v7.s[3], w8 -; CHECK-GI-NEXT: mls v3.4s, v7.4s, v16.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v0.8h, v3.8h -; CHECK-GI-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-GI-NEXT: movi v1.16b, #41 +; CHECK-GI-NEXT: movi v3.16b, #100 +; CHECK-GI-NEXT: smull2 v2.8h, v0.16b, v1.16b +; CHECK-GI-NEXT: smull v1.8h, v0.8b, v1.8b +; CHECK-GI-NEXT: uzp2 v1.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: sshr v2.16b, v1.16b, #4 +; CHECK-GI-NEXT: ushr v2.16b, v2.16b, #7 +; CHECK-GI-NEXT: ssra v2.16b, v1.16b, #4 +; CHECK-GI-NEXT: mls v0.16b, v2.16b, v3.16b ; CHECK-GI-NEXT: ret entry: %s = srem <16 x i8> %d, @@ -1754,20 +1687,31 @@ define <2 x i16> @sv2i16_7(<2 x i16> %d, <2 x i16> %e) { ; ; CHECK-GI-LABEL: sv2i16_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: mov w8, #18725 // =0x4925 +; CHECK-GI-NEXT: shl v2.2s, v0.2s, #16 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: sshr v2.2s, v2.2s, #16 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w8, #1 
// =0x1 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mul v1.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #15 // =0xf +; CHECK-GI-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov w8, #7 // =0x7 +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: neg v2.4h, v3.4h +; CHECK-GI-NEXT: dup v3.2s, w8 +; CHECK-GI-NEXT: ushl v2.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i16> %d, @@ -1792,20 +1736,31 @@ define <2 x i16> @sv2i16_100(<2 x i16> %d, <2 x i16> %e) { ; ; CHECK-GI-LABEL: sv2i16_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: mov w8, #5243 // =0x147b +; CHECK-GI-NEXT: shl v2.2s, v0.2s, #16 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-GI-NEXT: sshr v2.2s, v2.2s, #16 ; CHECK-GI-NEXT: mov v1.h[1], w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] +; CHECK-GI-NEXT: mov w8, #3 // =0x3 ; CHECK-GI-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: mul v1.2s, v2.2s, v1.2s +; CHECK-GI-NEXT: fmov s2, w8 +; CHECK-GI-NEXT: mov v2.h[1], w8 +; CHECK-GI-NEXT: mov w8, #15 // =0xf +; 
CHECK-GI-NEXT: sshr v1.2s, v1.2s, #16 +; CHECK-GI-NEXT: fmov s3, w8 +; CHECK-GI-NEXT: uzp1 v1.4h, v1.4h, v0.4h +; CHECK-GI-NEXT: mov v3.h[1], w8 +; CHECK-GI-NEXT: neg v2.4h, v2.4h +; CHECK-GI-NEXT: mov w8, #100 // =0x64 +; CHECK-GI-NEXT: sshl v1.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: neg v2.4h, v3.4h +; CHECK-GI-NEXT: dup v3.2s, w8 +; CHECK-GI-NEXT: ushl v2.4h, v1.4h, v2.4h +; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v2.2s +; CHECK-GI-NEXT: mls v0.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i16> %d, @@ -1949,24 +1904,15 @@ define <4 x i16> @sv4i16_7(<4 x i16> %d, <4 x i16> %e) { ; ; CHECK-GI-LABEL: sv4i16_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v2.4h, #7 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: adrp x8, .LCPI44_0 +; CHECK-GI-NEXT: movi v3.4h, #7 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI44_0] +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #1 +; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #1 +; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i16> %d, @@ -1988,24 +1934,15 @@ define <4 x i16> @sv4i16_100(<4 x i16> %d, <4 x i16> %e) { ; ; CHECK-GI-LABEL: sv4i16_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v0.4s, v0.4h, #0 -; CHECK-GI-NEXT: 
mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v2.4h, #100 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s -; CHECK-GI-NEXT: xtn v0.4h, v0.4s +; CHECK-GI-NEXT: adrp x8, .LCPI45_0 +; CHECK-GI-NEXT: movi v3.4h, #100 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI45_0] +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: shrn v1.4h, v1.4s, #16 +; CHECK-GI-NEXT: sshr v2.4h, v1.4h, #3 +; CHECK-GI-NEXT: ushr v2.4h, v2.4h, #15 +; CHECK-GI-NEXT: ssra v2.4h, v1.4h, #3 +; CHECK-GI-NEXT: mls v0.4h, v2.4h, v3.4h ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i16> %d, @@ -2028,38 +1965,16 @@ define <8 x i16> @sv8i16_7(<8 x i16> %d, <8 x i16> %e) { ; ; CHECK-GI-LABEL: sv8i16_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: movi v4.4h, #7 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; 
CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v4.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: adrp x8, .LCPI46_0 +; CHECK-GI-NEXT: movi v3.8h, #7 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI46_0] +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: sshr v2.8h, v1.8h, #1 +; CHECK-GI-NEXT: ushr v2.8h, v2.8h, #15 +; CHECK-GI-NEXT: ssra v2.8h, v1.8h, #1 +; CHECK-GI-NEXT: mls v0.8h, v2.8h, v3.8h ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i16> %d, @@ -2082,38 +1997,16 @@ define <8 x i16> @sv8i16_100(<8 x i16> %d, <8 x i16> %e) { ; ; CHECK-GI-LABEL: sv8i16_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: sshll v1.4s, v0.4h, #0 -; CHECK-GI-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: movi v4.4h, #100 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fmov w13, s0 -; CHECK-GI-NEXT: mov w10, v1.s[1] -; CHECK-GI-NEXT: mov w14, v0.s[1] -; CHECK-GI-NEXT: mov w11, v1.s[2] -; CHECK-GI-NEXT: mov w15, v0.s[2] -; CHECK-GI-NEXT: mov w12, v1.s[3] -; CHECK-GI-NEXT: mov w16, v0.s[3] -; CHECK-GI-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w13, w13, w8 -; CHECK-GI-NEXT: fmov s2, w9 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s3, w13 -; CHECK-GI-NEXT: sdiv w14, w14, w8 -; CHECK-GI-NEXT: mov v2.s[1], w10 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v3.s[1], w14 -; CHECK-GI-NEXT: sdiv w15, w15, w8 -; CHECK-GI-NEXT: mov v2.s[2], w11 -; CHECK-GI-NEXT: sdiv w12, w12, w8 -; CHECK-GI-NEXT: mov v3.s[2], w15 -; CHECK-GI-NEXT: sdiv w8, w16, w8 -; CHECK-GI-NEXT: mov v2.s[3], w12 -; CHECK-GI-NEXT: mls v1.4s, v2.4s, v4.4s -; CHECK-GI-NEXT: mov v3.s[3], w8 -; CHECK-GI-NEXT: mls 
v0.4s, v3.4s, v4.4s -; CHECK-GI-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-GI-NEXT: adrp x8, .LCPI47_0 +; CHECK-GI-NEXT: movi v3.8h, #100 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI47_0] +; CHECK-GI-NEXT: smull2 v2.4s, v0.8h, v1.8h +; CHECK-GI-NEXT: smull v1.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: uzp2 v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: sshr v2.8h, v1.8h, #3 +; CHECK-GI-NEXT: ushr v2.8h, v2.8h, #15 +; CHECK-GI-NEXT: ssra v2.8h, v1.8h, #3 +; CHECK-GI-NEXT: mls v0.8h, v2.8h, v3.8h ; CHECK-GI-NEXT: ret entry: %s = srem <8 x i16> %d, @@ -2499,17 +2392,16 @@ define <2 x i32> @sv2i32_7(<2 x i32> %d, <2 x i32> %e) { ; ; CHECK-GI-LABEL: sv2i32_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: movi v2.2s, #7 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: adrp x8, .LCPI56_0 +; CHECK-GI-NEXT: movi v3.2s, #7 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI56_0] +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: add v1.2s, v1.2s, v0.2s +; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #2 +; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31 +; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #2 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i32> %d, @@ -2532,17 +2424,15 @@ define <2 x i32> @sv2i32_100(<2 x i32> %d, <2 x i32> %e) { ; ; CHECK-GI-LABEL: sv2i32_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: movi v2.2s, #100 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w8, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; 
CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: mls v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: adrp x8, .LCPI57_0 +; CHECK-GI-NEXT: movi v3.2s, #100 +; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI57_0] +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #32 +; CHECK-GI-NEXT: sshr v2.2s, v1.2s, #5 +; CHECK-GI-NEXT: ushr v2.2s, v2.2s, #31 +; CHECK-GI-NEXT: ssra v2.2s, v1.2s, #5 +; CHECK-GI-NEXT: mls v0.2s, v2.2s, v3.2s ; CHECK-GI-NEXT: ret entry: %s = srem <2 x i32> %d, @@ -2664,21 +2554,17 @@ define <4 x i32> @sv4i32_7(<4 x i32> %d, <4 x i32> %e) { ; ; CHECK-GI-LABEL: sv4i32_7: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #7 // =0x7 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: movi v2.4s, #7 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: adrp x8, .LCPI60_0 +; CHECK-GI-NEXT: movi v3.4s, #7 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI60_0] +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: add v1.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshr v2.4s, v1.4s, #2 +; CHECK-GI-NEXT: ushr v2.4s, v2.4s, #31 +; CHECK-GI-NEXT: ssra v2.4s, v1.4s, #2 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i32> %d, @@ -2702,21 +2588,16 @@ define <4 x i32> @sv4i32_100(<4 x i32> %d, <4 x i32> %e) { ; ; CHECK-GI-LABEL: sv4i32_100: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: mov w8, #100 // =0x64 -; CHECK-GI-NEXT: mov w10, v0.s[1] -; CHECK-GI-NEXT: mov 
w11, v0.s[2] -; CHECK-GI-NEXT: mov w12, v0.s[3] -; CHECK-GI-NEXT: movi v2.4s, #100 -; CHECK-GI-NEXT: sdiv w9, w9, w8 -; CHECK-GI-NEXT: sdiv w10, w10, w8 -; CHECK-GI-NEXT: fmov s1, w9 -; CHECK-GI-NEXT: sdiv w11, w11, w8 -; CHECK-GI-NEXT: mov v1.s[1], w10 -; CHECK-GI-NEXT: sdiv w8, w12, w8 -; CHECK-GI-NEXT: mov v1.s[2], w11 -; CHECK-GI-NEXT: mov v1.s[3], w8 -; CHECK-GI-NEXT: mls v0.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: adrp x8, .LCPI61_0 +; CHECK-GI-NEXT: movi v3.4s, #100 +; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI61_0] +; CHECK-GI-NEXT: smull2 v2.2d, v0.4s, v1.4s +; CHECK-GI-NEXT: smull v1.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: uzp2 v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: sshr v2.4s, v1.4s, #5 +; CHECK-GI-NEXT: ushr v2.4s, v2.4s, #31 +; CHECK-GI-NEXT: ssra v2.4s, v1.4s, #5 +; CHECK-GI-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: ret entry: %s = srem <4 x i32> %d, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll index 530f4cf53321e..1eb8457cd4a5d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll @@ -254,27 +254,13 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) { ; CHECK-LABEL: v_srem_i32_pow2k_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x1000 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 12, v2 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, 
v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xfffff000, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x80000001 +; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 11, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v1, 12, v1 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i32 %num, 4096 @@ -327,42 +313,21 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) { ; CGP-LABEL: v_srem_v2i32_pow2k_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000 -; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000 -; CGP-NEXT: v_mov_b32_e32 v5, 0x1000 -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v7 +; CGP-NEXT: v_mov_b32_e32 v2, 0x80000001 +; CGP-NEXT: v_mul_hi_i32 v3, v0, v2 +; CGP-NEXT: v_mul_hi_i32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v1 +; CGP-NEXT: v_ashrrev_i32_e32 v3, 11, v3 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 11, v2 +; CGP-NEXT: v_lshrrev_b32_e32 v4, 31, v3 +; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; 
CGP-NEXT: v_lshlrev_b32_e32 v3, 12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xfffff000, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xfffff000, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; CGP-NEXT: v_lshlrev_b32_e32 v2, 12, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, ret <2 x i32> %result @@ -372,27 +337,14 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) { ; CHECK-LABEL: v_srem_i32_oddk_denom: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x4996c7d8 -; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705 -; CHECK-NEXT: v_mov_b32_e32 v4, 0x12d8fb -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 -; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 -; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2 -; CHECK-NEXT: v_mul_lo_u32 v2, v2, v4 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_add_i32_e32 
v2, vcc, 0xffed2705, v0 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0xd9528441 +; CHECK-NEXT: v_mul_hi_i32 v1, v0, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; CHECK-NEXT: v_ashrrev_i32_e32 v1, 20, v1 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_mov_b32_e32 v2, 0x12d8fb +; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i32 %num, 1235195 @@ -445,42 +397,22 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) { ; CGP-LABEL: v_srem_v2i32_oddk_denom: ; CGP: ; %bb.0: ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0 -; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x4996c7d8 -; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705 -; CGP-NEXT: v_mov_b32_e32 v5, 0x12d8fb -; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 -; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6 -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_mul_lo_u32 v7, v3, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v3, v7 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CGP-NEXT: v_mul_hi_u32 v7, v0, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v7, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v3, v3, v5 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v7, vcc, 0xffed2705, v1 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v0, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, 0xffed2705, v1 -; CGP-NEXT: 
v_cmp_ge_u32_e32 vcc, v0, v5 -; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5 -; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v2 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v6 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 +; CGP-NEXT: v_mov_b32_e32 v2, 0xd9528441 +; CGP-NEXT: v_mov_b32_e32 v3, 0x12d8fb +; CGP-NEXT: v_mul_hi_i32 v4, v0, v2 +; CGP-NEXT: v_mul_hi_i32 v2, v1, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v1 +; CGP-NEXT: v_ashrrev_i32_e32 v4, 20, v4 +; CGP-NEXT: v_ashrrev_i32_e32 v2, 20, v2 +; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v4 +; CGP-NEXT: v_lshrrev_b32_e32 v6, 31, v2 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CGP-NEXT: v_mul_lo_u32 v4, v4, v3 +; CGP-NEXT: v_mul_lo_u32 v2, v2, v3 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i32> %num, ret <2 x i32> %result From 66da9f38f374e786b2f1c0ecdab0b651c94c4f27 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 17 Jul 2025 15:44:21 +0200 Subject: [PATCH 175/813] [SelectionDAG] Fix copyExtraInfo where new node has entry as operand (#149307) Add special case handling where a new replacement node has the entry node as an operand i.e. does not depend on any other nodes. This can be observed with the existing X86/pcsections-atomics.ll test case when targeting Haswell, where certain 128-bit atomics are transformed into arch-specific instructions, with some operands having no other dependencies. 
--- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 9 +- llvm/test/CodeGen/X86/pcsections-atomics.ll | 3378 +++++++++++++++++ 2 files changed, 3386 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 70a39eab1e720..682d93d0abf3f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -13872,6 +13872,8 @@ void SelectionDAG::copyExtraInfo(SDNode *From, SDNode *To) { return; } + const SDNode *EntrySDN = getEntryNode().getNode(); + // We need to copy NodeExtraInfo to all _new_ nodes that are being introduced // through the replacement of From with To. Otherwise, replacements of a node // (From) with more complex nodes (To and its operands) may result in lost @@ -13903,9 +13905,14 @@ void SelectionDAG::copyExtraInfo(SDNode *From, SDNode *To) { return true; if (!Visited.insert(N).second) return true; - if (getEntryNode().getNode() == N) + if (EntrySDN == N) return false; for (const SDValue &Op : N->op_values()) { + if (N == To && Op.getNode() == EntrySDN) { + // Special case: New node's operand is the entry node; just need to + // copy extra info to new node. 
+ break; + } if (!Self(Self, Op.getNode())) return false; } diff --git a/llvm/test/CodeGen/X86/pcsections-atomics.ll b/llvm/test/CodeGen/X86/pcsections-atomics.ll index 672ebc1ec7275..69ae1f19f3200 100644 --- a/llvm/test/CodeGen/X86/pcsections-atomics.ll +++ b/llvm/test/CodeGen/X86/pcsections-atomics.ll @@ -9,6 +9,7 @@ ; RUN: llc -O1 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O1 ; RUN: llc -O2 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O2 ; RUN: llc -O3 -mattr=cx16 < %s | FileCheck %s --check-prefixes=O3 +; RUN: llc -O3 -mcpu=haswell -mattr=cx16 < %s | FileCheck %s --check-prefixes=HASWELL-O3 target triple = "x86_64-unknown-linux-gnu" @@ -50,6 +51,14 @@ define void @mixed_atomic_non_atomic(ptr %a) { ; O3-NEXT: movl $1, (%rdi) ; O3-NEXT: decl (%rdi) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: mixed_atomic_non_atomic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: incl (%rdi) +; HASWELL-O3-NEXT: .Lpcsection0: +; HASWELL-O3-NEXT: movl $1, (%rdi) +; HASWELL-O3-NEXT: decl (%rdi) +; HASWELL-O3-NEXT: retq entry: ; Accesses the same location atomically and non-atomically. 
%0 = load volatile i32, ptr %a, align 4 @@ -107,6 +116,17 @@ define i64 @mixed_complex_atomic_non_atomic(ptr %a, ptr %b) { ; O3-NEXT: movq %rdx, (%rsi) ; O3-NEXT: addq %rcx, %rax ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: mixed_complex_atomic_non_atomic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movl $1, %eax +; HASWELL-O3-NEXT: .Lpcsection1: +; HASWELL-O3-NEXT: lock xaddq %rax, (%rdi) +; HASWELL-O3-NEXT: movq (%rsi), %rcx +; HASWELL-O3-NEXT: leaq 1(%rcx), %rdx +; HASWELL-O3-NEXT: movq %rdx, (%rsi) +; HASWELL-O3-NEXT: addq %rcx, %rax +; HASWELL-O3-NEXT: retq entry: %0 = atomicrmw add ptr %a, i64 1 monotonic, align 8, !pcsections !0 %1 = load i64, ptr %b, align 8 @@ -148,6 +168,14 @@ define i8 @atomic8_load_unordered(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection2: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a unordered, align 1, !pcsections !0 @@ -187,6 +215,14 @@ define i8 @atomic8_load_monotonic(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection3: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a monotonic, align 1, !pcsections !0 @@ -226,6 +262,14 @@ define i8 @atomic8_load_acquire(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection4: +; 
HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a acquire, align 1, !pcsections !0 @@ -265,6 +309,14 @@ define i8 @atomic8_load_seq_cst(ptr %a) { ; O3-NEXT: movzbl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection5: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i8, ptr %a seq_cst, align 1, !pcsections !0 @@ -304,6 +356,14 @@ define void @atomic8_store_unordered(ptr %a) { ; O3-NEXT: movb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection6: +; HASWELL-O3-NEXT: movb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a unordered, align 1, !pcsections !0 @@ -343,6 +403,14 @@ define void @atomic8_store_monotonic(ptr %a) { ; O3-NEXT: movb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection7: +; HASWELL-O3-NEXT: movb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a monotonic, align 1, !pcsections !0 @@ -382,6 +450,14 @@ define void @atomic8_store_release(ptr %a) { ; O3-NEXT: movb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; 
HASWELL-O3-NEXT: .Lpcsection8: +; HASWELL-O3-NEXT: movb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a release, align 1, !pcsections !0 @@ -425,6 +501,15 @@ define void @atomic8_store_seq_cst(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection9: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i8 42, ptr %a seq_cst, align 1, !pcsections !0 @@ -468,6 +553,15 @@ define void @atomic8_xchg_monotonic(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection10: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -507,6 +601,14 @@ define void @atomic8_add_monotonic(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection11: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -546,6 +648,14 @@ define void @atomic8_sub_monotonic(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; 
HASWELL-O3-LABEL: atomic8_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection12: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -585,6 +695,14 @@ define void @atomic8_and_monotonic(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection13: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -624,6 +742,14 @@ define void @atomic8_or_monotonic(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection14: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -663,6 +789,14 @@ define void @atomic8_xor_monotonic(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection15: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -763,6 +897,27 @@ define void @atomic8_nand_monotonic(ptr %a) { ; O3-NEXT: # 
%bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection16: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB16_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection17: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection18: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection19: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection20: +; HASWELL-O3-NEXT: jne .LBB16_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 monotonic, align 1, !pcsections !0 @@ -806,6 +961,15 @@ define void @atomic8_xchg_acquire(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection21: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -845,6 +1009,14 @@ define void @atomic8_add_acquire(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection22: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 acquire, align 1, !pcsections !0 
@@ -884,6 +1056,14 @@ define void @atomic8_sub_acquire(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection23: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -923,6 +1103,14 @@ define void @atomic8_and_acquire(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection24: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -962,6 +1150,14 @@ define void @atomic8_or_acquire(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection25: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -1001,6 +1197,14 @@ define void @atomic8_xor_acquire(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection26: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = 
atomicrmw xor ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -1101,6 +1305,27 @@ define void @atomic8_nand_acquire(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection27: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB23_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection28: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection29: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection30: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection31: +; HASWELL-O3-NEXT: jne .LBB23_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 acquire, align 1, !pcsections !0 @@ -1144,6 +1369,15 @@ define void @atomic8_xchg_release(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection32: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1183,6 +1417,14 @@ define void @atomic8_add_release(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection33: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, 
foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1222,6 +1464,14 @@ define void @atomic8_sub_release(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection34: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1261,6 +1511,14 @@ define void @atomic8_and_release(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection35: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1300,6 +1558,14 @@ define void @atomic8_or_release(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection36: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1339,6 +1605,14 @@ define void @atomic8_xor_release(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection37: +; 
HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1439,6 +1713,27 @@ define void @atomic8_nand_release(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection38: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB30_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection39: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection40: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection41: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection42: +; HASWELL-O3-NEXT: jne .LBB30_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 release, align 1, !pcsections !0 @@ -1482,6 +1777,15 @@ define void @atomic8_xchg_acq_rel(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection43: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1521,6 +1825,14 @@ define void @atomic8_add_acq_rel(ptr %a) { ; O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry 
+; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection44: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1560,6 +1872,14 @@ define void @atomic8_sub_acq_rel(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection45: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1599,6 +1919,14 @@ define void @atomic8_and_acq_rel(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection46: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1638,6 +1966,14 @@ define void @atomic8_or_acq_rel(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection47: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1677,6 +2013,14 @@ define void @atomic8_xor_acq_rel(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; 
HASWELL-O3-LABEL: atomic8_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection48: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1777,6 +2121,27 @@ define void @atomic8_nand_acq_rel(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection49: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB37_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection50: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection51: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection52: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection53: +; HASWELL-O3-NEXT: jne .LBB37_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 acq_rel, align 1, !pcsections !0 @@ -1820,6 +2185,15 @@ define void @atomic8_xchg_seq_cst(ptr %a) { ; O3-NEXT: xchgb %al, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection54: +; HASWELL-O3-NEXT: xchgb %al, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1859,6 +2233,14 @@ define void @atomic8_add_seq_cst(ptr %a) { ; 
O3-NEXT: lock addb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection55: +; HASWELL-O3-NEXT: lock addb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1898,6 +2280,14 @@ define void @atomic8_sub_seq_cst(ptr %a) { ; O3-NEXT: lock subb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection56: +; HASWELL-O3-NEXT: lock subb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1937,6 +2327,14 @@ define void @atomic8_and_seq_cst(ptr %a) { ; O3-NEXT: lock andb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection57: +; HASWELL-O3-NEXT: lock andb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -1976,6 +2374,14 @@ define void @atomic8_or_seq_cst(ptr %a) { ; O3-NEXT: lock orb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection58: +; HASWELL-O3-NEXT: lock orb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -2015,6 
+2421,14 @@ define void @atomic8_xor_seq_cst(ptr %a) { ; O3-NEXT: lock xorb $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection59: +; HASWELL-O3-NEXT: lock xorb $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -2115,6 +2529,27 @@ define void @atomic8_nand_seq_cst(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection60: +; HASWELL-O3-NEXT: movzbl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB44_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection61: +; HASWELL-O3-NEXT: notb %cl +; HASWELL-O3-NEXT: .Lpcsection62: +; HASWELL-O3-NEXT: orb $-43, %cl +; HASWELL-O3-NEXT: .Lpcsection63: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection64: +; HASWELL-O3-NEXT: jne .LBB44_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i8 42 seq_cst, align 1, !pcsections !0 @@ -2200,6 +2635,25 @@ define void @atomic8_cas_monotonic(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection65: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection66: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: 
.Lpcsection67: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection68: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection69: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection70: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 monotonic monotonic, align 1, !pcsections !0 @@ -2287,6 +2741,25 @@ define void @atomic8_cas_acquire(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection71: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection72: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection73: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection74: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection75: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection76: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 acquire monotonic, align 1, !pcsections !0 @@ -2374,6 +2847,25 @@ define void @atomic8_cas_release(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection77: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection78: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection79: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection80: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, 
(%rdi) +; HASWELL-O3-NEXT: .Lpcsection81: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection82: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 release monotonic, align 1, !pcsections !0 @@ -2461,6 +2953,25 @@ define void @atomic8_cas_acq_rel(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection83: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection84: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection85: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection86: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection87: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection88: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 acq_rel monotonic, align 1, !pcsections !0 @@ -2548,6 +3059,25 @@ define void @atomic8_cas_seq_cst(ptr %a) { ; O3-NEXT: lock cmpxchgb %cl, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic8_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movb $1, %cl +; HASWELL-O3-NEXT: .Lpcsection89: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection90: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection91: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection92: +; HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection93: +; HASWELL-O3-NEXT: movb $42, %al +; HASWELL-O3-NEXT: .Lpcsection94: +; 
HASWELL-O3-NEXT: lock cmpxchgb %cl, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i8 42, i8 1 seq_cst monotonic, align 1, !pcsections !0 @@ -2589,6 +3119,14 @@ define i16 @atomic16_load_unordered(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection95: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a unordered, align 2, !pcsections !0 @@ -2628,6 +3166,14 @@ define i16 @atomic16_load_monotonic(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection96: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a monotonic, align 2, !pcsections !0 @@ -2667,6 +3213,14 @@ define i16 @atomic16_load_acquire(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection97: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a acquire, align 2, !pcsections !0 @@ -2706,6 +3260,14 @@ define i16 @atomic16_load_seq_cst(ptr %a) { ; O3-NEXT: movzwl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; 
HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection98: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i16, ptr %a seq_cst, align 2, !pcsections !0 @@ -2745,6 +3307,14 @@ define void @atomic16_store_unordered(ptr %a) { ; O3-NEXT: movw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection99: +; HASWELL-O3-NEXT: movw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a unordered, align 2, !pcsections !0 @@ -2784,6 +3354,14 @@ define void @atomic16_store_monotonic(ptr %a) { ; O3-NEXT: movw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection100: +; HASWELL-O3-NEXT: movw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a monotonic, align 2, !pcsections !0 @@ -2823,6 +3401,14 @@ define void @atomic16_store_release(ptr %a) { ; O3-NEXT: movw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection101: +; HASWELL-O3-NEXT: movw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a release, align 2, !pcsections !0 @@ -2866,6 +3452,15 @@ define void @atomic16_store_seq_cst(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: 
atomic16_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection102: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i16 42, ptr %a seq_cst, align 2, !pcsections !0 @@ -2909,6 +3504,15 @@ define void @atomic16_xchg_monotonic(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection103: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -2948,6 +3552,14 @@ define void @atomic16_add_monotonic(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection104: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -2987,6 +3599,14 @@ define void @atomic16_sub_monotonic(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection105: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3026,6 +3646,14 
@@ define void @atomic16_and_monotonic(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection106: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3065,6 +3693,14 @@ define void @atomic16_or_monotonic(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection107: +; HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3104,6 +3740,14 @@ define void @atomic16_xor_monotonic(ptr %a) { ; O3-NEXT: lock xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection108: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3220,6 +3864,31 @@ define void @atomic16_nand_monotonic(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection109: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB64_1: # %atomicrmw.start +; 
HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection110: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection111: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection112: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection113: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection114: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax +; HASWELL-O3-NEXT: .Lpcsection115: +; HASWELL-O3-NEXT: jne .LBB64_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i16 42 monotonic, align 2, !pcsections !0 @@ -3263,6 +3932,15 @@ define void @atomic16_xchg_acquire(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection116: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3302,6 +3980,14 @@ define void @atomic16_add_acquire(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection117: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3341,6 +4027,14 @@ define void @atomic16_sub_acquire(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, 
foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection118: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3380,6 +4074,14 @@ define void @atomic16_and_acquire(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection119: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3419,6 +4121,14 @@ define void @atomic16_or_acquire(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection120: +; HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3458,6 +4168,14 @@ define void @atomic16_xor_acquire(ptr %a) { ; O3-NEXT: lock xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection121: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3574,6 +4292,31 @@ define void 
@atomic16_nand_acquire(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection122: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB71_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection123: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection124: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection125: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection126: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection127: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax +; HASWELL-O3-NEXT: .Lpcsection128: +; HASWELL-O3-NEXT: jne .LBB71_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i16 42 acquire, align 2, !pcsections !0 @@ -3617,6 +4360,15 @@ define void @atomic16_xchg_release(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection129: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3656,6 +4408,14 @@ define void @atomic16_add_release(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq 
foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection130: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3695,6 +4455,14 @@ define void @atomic16_sub_release(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection131: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3734,6 +4502,14 @@ define void @atomic16_and_release(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection132: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3773,6 +4549,14 @@ define void @atomic16_or_release(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection133: +; HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3812,6 +4596,14 @@ define void @atomic16_xor_release(ptr %a) { ; O3-NEXT: lock xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: 
atomic16_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection134: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3928,6 +4720,31 @@ define void @atomic16_nand_release(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection135: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB78_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection136: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection137: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection138: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection139: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection140: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax +; HASWELL-O3-NEXT: .Lpcsection141: +; HASWELL-O3-NEXT: jne .LBB78_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i16 42 release, align 2, !pcsections !0 @@ -3971,6 +4788,15 @@ define void @atomic16_xchg_acq_rel(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection142: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; 
HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4010,6 +4836,14 @@ define void @atomic16_add_acq_rel(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection143: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4049,6 +4883,14 @@ define void @atomic16_sub_acq_rel(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection144: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4088,6 +4930,14 @@ define void @atomic16_and_acq_rel(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection145: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4127,6 +4977,14 @@ define void @atomic16_or_acq_rel(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection146: +; 
HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4166,6 +5024,14 @@ define void @atomic16_xor_acq_rel(ptr %a) { ; O3-NEXT: lock xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection147: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4282,6 +5148,31 @@ define void @atomic16_nand_acq_rel(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection148: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB85_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection149: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection150: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection151: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection152: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection153: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax +; HASWELL-O3-NEXT: .Lpcsection154: +; HASWELL-O3-NEXT: jne .LBB85_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i16 42 acq_rel, align 2, !pcsections !0 @@ -4325,6 +5216,15 @@ define void 
@atomic16_xchg_seq_cst(ptr %a) { ; O3-NEXT: xchgw %ax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection155: +; HASWELL-O3-NEXT: xchgw %ax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4364,6 +5264,14 @@ define void @atomic16_add_seq_cst(ptr %a) { ; O3-NEXT: lock addw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection156: +; HASWELL-O3-NEXT: lock addw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4403,6 +5311,14 @@ define void @atomic16_sub_seq_cst(ptr %a) { ; O3-NEXT: lock subw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection157: +; HASWELL-O3-NEXT: lock subw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4442,6 +5358,14 @@ define void @atomic16_and_seq_cst(ptr %a) { ; O3-NEXT: lock andw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection158: +; HASWELL-O3-NEXT: lock andw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, 
align 8 %x = atomicrmw and ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4481,6 +5405,14 @@ define void @atomic16_or_seq_cst(ptr %a) { ; O3-NEXT: lock orw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection159: +; HASWELL-O3-NEXT: lock orw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4520,6 +5452,14 @@ define void @atomic16_xor_seq_cst(ptr %a) { ; O3-NEXT: lock xorw $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection160: +; HASWELL-O3-NEXT: lock xorw $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4636,6 +5576,31 @@ define void @atomic16_nand_seq_cst(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection161: +; HASWELL-O3-NEXT: movzwl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB92_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection162: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection163: +; HASWELL-O3-NEXT: orl $65493, %ecx # imm = 0xFFD5 +; HASWELL-O3-NEXT: .Lpcsection164: +; HASWELL-O3-NEXT: # kill: def $ax killed $ax killed $eax +; HASWELL-O3-NEXT: .Lpcsection165: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection166: +; 
HASWELL-O3-NEXT: # kill: def $ax killed $ax def $eax +; HASWELL-O3-NEXT: .Lpcsection167: +; HASWELL-O3-NEXT: jne .LBB92_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i16 42 seq_cst, align 2, !pcsections !0 @@ -4712,6 +5677,22 @@ define void @atomic16_cas_monotonic(ptr %a) { ; O3-NEXT: lock cmpxchgw %cx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $1, %cx +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection168: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection169: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection170: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i16 42, i16 1 monotonic monotonic, align 2, !pcsections !0 @@ -4790,6 +5771,22 @@ define void @atomic16_cas_acquire(ptr %a) { ; O3-NEXT: lock cmpxchgw %cx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $1, %cx +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection171: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection172: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection173: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i16 42, i16 1 acquire monotonic, 
align 2, !pcsections !0 @@ -4868,6 +5865,22 @@ define void @atomic16_cas_release(ptr %a) { ; O3-NEXT: lock cmpxchgw %cx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $1, %cx +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection174: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection175: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection176: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i16 42, i16 1 release monotonic, align 2, !pcsections !0 @@ -4946,6 +5959,22 @@ define void @atomic16_cas_acq_rel(ptr %a) { ; O3-NEXT: lock cmpxchgw %cx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $1, %cx +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection177: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection178: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection179: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i16 42, i16 1 acq_rel monotonic, align 2, !pcsections !0 @@ -5024,6 +6053,22 @@ define void @atomic16_cas_seq_cst(ptr %a) { ; O3-NEXT: lock cmpxchgw %cx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic16_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movw $1, 
%cx +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection180: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection181: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movw $42, %ax +; HASWELL-O3-NEXT: .Lpcsection182: +; HASWELL-O3-NEXT: lock cmpxchgw %cx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i16 42, i16 1 seq_cst monotonic, align 2, !pcsections !0 @@ -5065,6 +6110,14 @@ define i32 @atomic32_load_unordered(ptr %a) { ; O3-NEXT: movl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection183: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i32, ptr %a unordered, align 4, !pcsections !0 @@ -5104,6 +6157,14 @@ define i32 @atomic32_load_monotonic(ptr %a) { ; O3-NEXT: movl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection184: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i32, ptr %a monotonic, align 4, !pcsections !0 @@ -5143,6 +6204,14 @@ define i32 @atomic32_load_acquire(ptr %a) { ; O3-NEXT: movl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection185: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr 
@foo, align 8 %x = load atomic i32, ptr %a acquire, align 4, !pcsections !0 @@ -5182,6 +6251,14 @@ define i32 @atomic32_load_seq_cst(ptr %a) { ; O3-NEXT: movl (%rdi), %eax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection186: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i32, ptr %a seq_cst, align 4, !pcsections !0 @@ -5221,6 +6298,14 @@ define void @atomic32_store_unordered(ptr %a) { ; O3-NEXT: movl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection187: +; HASWELL-O3-NEXT: movl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i32 42, ptr %a unordered, align 4, !pcsections !0 @@ -5260,6 +6345,14 @@ define void @atomic32_store_monotonic(ptr %a) { ; O3-NEXT: movl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection188: +; HASWELL-O3-NEXT: movl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i32 42, ptr %a monotonic, align 4, !pcsections !0 @@ -5299,6 +6392,14 @@ define void @atomic32_store_release(ptr %a) { ; O3-NEXT: movl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection189: +; HASWELL-O3-NEXT: movl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; 
HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i32 42, ptr %a release, align 4, !pcsections !0 @@ -5342,6 +6443,15 @@ define void @atomic32_store_seq_cst(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection190: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i32 42, ptr %a seq_cst, align 4, !pcsections !0 @@ -5385,6 +6495,15 @@ define void @atomic32_xchg_monotonic(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection191: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5424,6 +6543,14 @@ define void @atomic32_add_monotonic(ptr %a) { ; O3-NEXT: lock addl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection192: +; HASWELL-O3-NEXT: lock addl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5463,6 +6590,14 @@ define void @atomic32_sub_monotonic(ptr %a) { ; O3-NEXT: lock subl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; 
HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection193: +; HASWELL-O3-NEXT: lock subl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5502,6 +6637,14 @@ define void @atomic32_and_monotonic(ptr %a) { ; O3-NEXT: lock andl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection194: +; HASWELL-O3-NEXT: lock andl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5541,6 +6684,14 @@ define void @atomic32_or_monotonic(ptr %a) { ; O3-NEXT: lock orl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection195: +; HASWELL-O3-NEXT: lock orl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5580,6 +6731,14 @@ define void @atomic32_xor_monotonic(ptr %a) { ; O3-NEXT: lock xorl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection196: +; HASWELL-O3-NEXT: lock xorl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5680,6 +6839,27 @@ define void @atomic32_nand_monotonic(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) 
; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection197: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB112_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection198: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection199: +; HASWELL-O3-NEXT: orl $-43, %ecx +; HASWELL-O3-NEXT: .Lpcsection200: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection201: +; HASWELL-O3-NEXT: jne .LBB112_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i32 42 monotonic, align 4, !pcsections !0 @@ -5723,6 +6903,15 @@ define void @atomic32_xchg_acquire(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection202: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -5762,6 +6951,14 @@ define void @atomic32_add_acquire(ptr %a) { ; O3-NEXT: lock addl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection203: +; HASWELL-O3-NEXT: lock addl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -5801,6 +6998,14 @@ define 
void @atomic32_sub_acquire(ptr %a) { ; O3-NEXT: lock subl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection204: +; HASWELL-O3-NEXT: lock subl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -5840,6 +7045,14 @@ define void @atomic32_and_acquire(ptr %a) { ; O3-NEXT: lock andl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection205: +; HASWELL-O3-NEXT: lock andl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -5879,6 +7092,14 @@ define void @atomic32_or_acquire(ptr %a) { ; O3-NEXT: lock orl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection206: +; HASWELL-O3-NEXT: lock orl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -5918,6 +7139,14 @@ define void @atomic32_xor_acquire(ptr %a) { ; O3-NEXT: lock xorl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection207: +; HASWELL-O3-NEXT: lock xorl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr 
%a, i32 42 acquire, align 4, !pcsections !0 @@ -6018,6 +7247,27 @@ define void @atomic32_nand_acquire(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection208: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB119_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection209: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection210: +; HASWELL-O3-NEXT: orl $-43, %ecx +; HASWELL-O3-NEXT: .Lpcsection211: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection212: +; HASWELL-O3-NEXT: jne .LBB119_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i32 42 acquire, align 4, !pcsections !0 @@ -6061,6 +7311,15 @@ define void @atomic32_xchg_release(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xchg_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection213: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6100,6 +7359,14 @@ define void @atomic32_add_release(ptr %a) { ; O3-NEXT: lock addl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection214: +; HASWELL-O3-NEXT: lock addl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, 
foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6139,6 +7406,14 @@ define void @atomic32_sub_release(ptr %a) { ; O3-NEXT: lock subl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection215: +; HASWELL-O3-NEXT: lock subl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6178,6 +7453,14 @@ define void @atomic32_and_release(ptr %a) { ; O3-NEXT: lock andl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection216: +; HASWELL-O3-NEXT: lock andl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6217,6 +7500,14 @@ define void @atomic32_or_release(ptr %a) { ; O3-NEXT: lock orl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection217: +; HASWELL-O3-NEXT: lock orl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6256,6 +7547,14 @@ define void @atomic32_xor_release(ptr %a) { ; O3-NEXT: lock xorl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection218: 
+; HASWELL-O3-NEXT: lock xorl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6356,6 +7655,27 @@ define void @atomic32_nand_release(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection219: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB126_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection220: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection221: +; HASWELL-O3-NEXT: orl $-43, %ecx +; HASWELL-O3-NEXT: .Lpcsection222: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection223: +; HASWELL-O3-NEXT: jne .LBB126_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i32 42 release, align 4, !pcsections !0 @@ -6399,6 +7719,15 @@ define void @atomic32_xchg_acq_rel(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection224: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6438,6 +7767,14 @@ define void @atomic32_add_acq_rel(ptr %a) { ; O3-NEXT: lock addl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_add_acq_rel: +; 
HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection225: +; HASWELL-O3-NEXT: lock addl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6477,6 +7814,14 @@ define void @atomic32_sub_acq_rel(ptr %a) { ; O3-NEXT: lock subl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection226: +; HASWELL-O3-NEXT: lock subl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6516,6 +7861,14 @@ define void @atomic32_and_acq_rel(ptr %a) { ; O3-NEXT: lock andl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection227: +; HASWELL-O3-NEXT: lock andl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6555,6 +7908,14 @@ define void @atomic32_or_acq_rel(ptr %a) { ; O3-NEXT: lock orl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection228: +; HASWELL-O3-NEXT: lock orl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6594,6 +7955,14 @@ define void @atomic32_xor_acq_rel(ptr %a) { ; O3-NEXT: lock xorl $42, (%rdi) ; O3-NEXT: movq $1, 
foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection229: +; HASWELL-O3-NEXT: lock xorl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6694,6 +8063,27 @@ define void @atomic32_nand_acq_rel(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection230: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB133_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection231: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection232: +; HASWELL-O3-NEXT: orl $-43, %ecx +; HASWELL-O3-NEXT: .Lpcsection233: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection234: +; HASWELL-O3-NEXT: jne .LBB133_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i32 42 acq_rel, align 4, !pcsections !0 @@ -6737,6 +8127,15 @@ define void @atomic32_xchg_seq_cst(ptr %a) { ; O3-NEXT: xchgl %eax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection235: +; HASWELL-O3-NEXT: xchgl %eax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -6776,6 
+8175,14 @@ define void @atomic32_add_seq_cst(ptr %a) { ; O3-NEXT: lock addl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection236: +; HASWELL-O3-NEXT: lock addl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -6815,6 +8222,14 @@ define void @atomic32_sub_seq_cst(ptr %a) { ; O3-NEXT: lock subl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection237: +; HASWELL-O3-NEXT: lock subl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -6854,6 +8269,14 @@ define void @atomic32_and_seq_cst(ptr %a) { ; O3-NEXT: lock andl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection238: +; HASWELL-O3-NEXT: lock andl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -6893,6 +8316,14 @@ define void @atomic32_or_seq_cst(ptr %a) { ; O3-NEXT: lock orl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection239: +; HASWELL-O3-NEXT: lock orl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = 
atomicrmw or ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -6932,6 +8363,14 @@ define void @atomic32_xor_seq_cst(ptr %a) { ; O3-NEXT: lock xorl $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection240: +; HASWELL-O3-NEXT: lock xorl $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -7032,6 +8471,27 @@ define void @atomic32_nand_seq_cst(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection241: +; HASWELL-O3-NEXT: movl (%rdi), %eax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB140_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection242: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection243: +; HASWELL-O3-NEXT: orl $-43, %ecx +; HASWELL-O3-NEXT: .Lpcsection244: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection245: +; HASWELL-O3-NEXT: jne .LBB140_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i32 42 seq_cst, align 4, !pcsections !0 @@ -7117,6 +8577,25 @@ define void @atomic32_cas_monotonic(ptr %a) { ; O3-NEXT: lock cmpxchgl %ecx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection246: +; HASWELL-O3-NEXT: movl $42, %eax +; 
HASWELL-O3-NEXT: .Lpcsection247: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection248: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection249: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection250: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection251: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i32 42, i32 1 monotonic monotonic, align 4, !pcsections !0 @@ -7204,6 +8683,25 @@ define void @atomic32_cas_acquire(ptr %a) { ; O3-NEXT: lock cmpxchgl %ecx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection252: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection253: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection254: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection255: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection256: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection257: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i32 42, i32 1 acquire monotonic, align 4, !pcsections !0 @@ -7291,6 +8789,25 @@ define void @atomic32_cas_release(ptr %a) { ; O3-NEXT: lock cmpxchgl %ecx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection258: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection259: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; 
HASWELL-O3-NEXT: .Lpcsection260: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection261: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection262: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection263: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i32 42, i32 1 release monotonic, align 4, !pcsections !0 @@ -7378,6 +8895,25 @@ define void @atomic32_cas_acq_rel(ptr %a) { ; O3-NEXT: lock cmpxchgl %ecx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection264: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection265: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection266: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection267: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection268: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection269: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i32 42, i32 1 acq_rel monotonic, align 4, !pcsections !0 @@ -7465,6 +9001,25 @@ define void @atomic32_cas_seq_cst(ptr %a) { ; O3-NEXT: lock cmpxchgl %ecx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic32_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection270: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection271: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection272: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: 
.Lpcsection273: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection274: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection275: +; HASWELL-O3-NEXT: lock cmpxchgl %ecx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i32 42, i32 1 seq_cst monotonic, align 4, !pcsections !0 @@ -7506,6 +9061,14 @@ define i64 @atomic64_load_unordered(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection276: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i64, ptr %a unordered, align 8, !pcsections !0 @@ -7545,6 +9108,14 @@ define i64 @atomic64_load_monotonic(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection277: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i64, ptr %a monotonic, align 8, !pcsections !0 @@ -7584,6 +9155,14 @@ define i64 @atomic64_load_acquire(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection278: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i64, ptr %a acquire, align 8, !pcsections !0 @@ -7623,6 +9202,14 @@ define i64 @atomic64_load_seq_cst(ptr 
%a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection279: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i64, ptr %a seq_cst, align 8, !pcsections !0 @@ -7662,6 +9249,14 @@ define ptr @atomic64_load_seq_cst_ptr_ty(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_load_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection280: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic ptr, ptr %a seq_cst, align 8, !pcsections !0 @@ -7701,6 +9296,14 @@ define void @atomic64_store_unordered(ptr %a) { ; O3-NEXT: movq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection281: +; HASWELL-O3-NEXT: movq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i64 42, ptr %a unordered, align 8, !pcsections !0 @@ -7740,6 +9343,14 @@ define void @atomic64_store_monotonic(ptr %a) { ; O3-NEXT: movq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection282: +; HASWELL-O3-NEXT: movq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i64 42, ptr %a monotonic, align 8, !pcsections !0 @@ 
-7779,6 +9390,14 @@ define void @atomic64_store_release(ptr %a) { ; O3-NEXT: movq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection283: +; HASWELL-O3-NEXT: movq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i64 42, ptr %a release, align 8, !pcsections !0 @@ -7822,6 +9441,15 @@ define void @atomic64_store_seq_cst(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection284: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i64 42, ptr %a seq_cst, align 8, !pcsections !0 @@ -7861,6 +9489,14 @@ define void @atomic64_store_seq_cst_ptr_ty(ptr %a, ptr %v) { ; O3-NEXT: xchgq %rsi, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_store_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection285: +; HASWELL-O3-NEXT: xchgq %rsi, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic ptr %v, ptr %a seq_cst, align 8, !pcsections !0 @@ -7904,6 +9540,15 @@ define void @atomic64_xchg_monotonic(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection286: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, 
foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -7943,6 +9588,14 @@ define void @atomic64_add_monotonic(ptr %a) { ; O3-NEXT: lock addq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection287: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -7982,6 +9635,14 @@ define void @atomic64_sub_monotonic(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection288: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -8021,6 +9682,14 @@ define void @atomic64_and_monotonic(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection289: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -8060,6 +9729,14 @@ define void @atomic64_or_monotonic(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; 
HASWELL-O3-NEXT: .Lpcsection290: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -8099,6 +9776,14 @@ define void @atomic64_xor_monotonic(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection291: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -8202,6 +9887,27 @@ define void @atomic64_nand_monotonic(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection292: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB162_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection293: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection294: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection295: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection296: +; HASWELL-O3-NEXT: jne .LBB162_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 monotonic, align 8, !pcsections !0 @@ -8245,6 +9951,15 @@ define void @atomic64_xchg_acquire(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: 
atomic64_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection297: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8284,6 +9999,14 @@ define void @atomic64_add_acquire(ptr %a) { ; O3-NEXT: lock addq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection298: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8323,6 +10046,14 @@ define void @atomic64_sub_acquire(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection299: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8362,6 +10093,14 @@ define void @atomic64_and_acquire(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection300: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8401,6 +10140,14 @@ define void 
@atomic64_or_acquire(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection301: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8440,6 +10187,14 @@ define void @atomic64_xor_acquire(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection302: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 acquire, align 8, !pcsections !0 @@ -8543,6 +10298,27 @@ define void @atomic64_nand_acquire(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection303: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB169_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection304: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection305: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection306: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection307: +; HASWELL-O3-NEXT: jne .LBB169_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 
acquire, align 8, !pcsections !0 @@ -8586,6 +10362,15 @@ define void @atomic64_xchg_release(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xchg_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection308: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8625,6 +10410,14 @@ define void @atomic64_add_release(ptr %a) { ; O3-NEXT: lock addq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection309: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8664,6 +10457,14 @@ define void @atomic64_sub_release(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection310: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8703,6 +10504,14 @@ define void @atomic64_and_release(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection311: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, 
foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8742,6 +10551,14 @@ define void @atomic64_or_release(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection312: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8781,6 +10598,14 @@ define void @atomic64_xor_release(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection313: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8884,6 +10709,27 @@ define void @atomic64_nand_release(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection314: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB176_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection315: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection316: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection317: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection318: +; HASWELL-O3-NEXT: jne .LBB176_1 +; 
HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 release, align 8, !pcsections !0 @@ -8927,6 +10773,15 @@ define void @atomic64_xchg_acq_rel(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection319: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -8966,6 +10821,14 @@ define void @atomic64_add_acq_rel(ptr %a) { ; O3-NEXT: lock addq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection320: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9005,6 +10868,14 @@ define void @atomic64_sub_acq_rel(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection321: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9044,6 +10915,14 @@ define void @atomic64_and_acq_rel(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: 
atomic64_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection322: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9083,6 +10962,14 @@ define void @atomic64_or_acq_rel(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection323: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9122,6 +11009,14 @@ define void @atomic64_xor_acq_rel(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection324: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9225,6 +11120,27 @@ define void @atomic64_nand_acq_rel(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection325: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB183_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection326: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: 
.Lpcsection327: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection328: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection329: +; HASWELL-O3-NEXT: jne .LBB183_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 acq_rel, align 8, !pcsections !0 @@ -9268,6 +11184,15 @@ define void @atomic64_xchg_seq_cst(ptr %a) { ; O3-NEXT: xchgq %rax, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection330: +; HASWELL-O3-NEXT: xchgq %rax, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9307,6 +11232,14 @@ define void @atomic64_add_seq_cst(ptr %a) { ; O3-NEXT: lock addq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection331: +; HASWELL-O3-NEXT: lock addq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9346,6 +11279,14 @@ define void @atomic64_sub_seq_cst(ptr %a) { ; O3-NEXT: lock subq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_sub_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection332: +; HASWELL-O3-NEXT: lock subq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i64 42 seq_cst, align 
8, !pcsections !0 @@ -9385,6 +11326,14 @@ define void @atomic64_and_seq_cst(ptr %a) { ; O3-NEXT: lock andq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection333: +; HASWELL-O3-NEXT: lock andq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9424,6 +11373,14 @@ define void @atomic64_or_seq_cst(ptr %a) { ; O3-NEXT: lock orq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection334: +; HASWELL-O3-NEXT: lock orq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9463,6 +11420,14 @@ define void @atomic64_xor_seq_cst(ptr %a) { ; O3-NEXT: lock xorq $42, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection335: +; HASWELL-O3-NEXT: lock xorq $42, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9566,6 +11531,27 @@ define void @atomic64_nand_seq_cst(ptr %a) { ; O3-NEXT: # %bb.2: # %atomicrmw.end ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection336: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB190_1: # 
%atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ecx +; HASWELL-O3-NEXT: .Lpcsection337: +; HASWELL-O3-NEXT: notl %ecx +; HASWELL-O3-NEXT: .Lpcsection338: +; HASWELL-O3-NEXT: orq $-43, %rcx +; HASWELL-O3-NEXT: .Lpcsection339: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection340: +; HASWELL-O3-NEXT: jne .LBB190_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i64 42 seq_cst, align 8, !pcsections !0 @@ -9651,6 +11637,25 @@ define void @atomic64_cas_monotonic(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection341: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection342: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection343: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection344: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection345: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection346: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 monotonic monotonic, align 8, !pcsections !0 @@ -9738,6 +11743,25 @@ define void @atomic64_cas_acquire(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection347: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection348: +; 
HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection349: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection350: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection351: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection352: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 acquire monotonic, align 8, !pcsections !0 @@ -9825,6 +11849,25 @@ define void @atomic64_cas_release(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection353: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection354: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection355: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection356: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection357: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection358: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 release monotonic, align 8, !pcsections !0 @@ -9912,6 +11955,25 @@ define void @atomic64_cas_acq_rel(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection359: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection360: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection361: +; 
HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection362: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection363: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection364: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 acq_rel monotonic, align 8, !pcsections !0 @@ -9999,6 +12061,25 @@ define void @atomic64_cas_seq_cst(ptr %a) { ; O3-NEXT: lock cmpxchgq %rcx, (%rdi) ; O3-NEXT: movq $3, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: movl $1, %ecx +; HASWELL-O3-NEXT: .Lpcsection365: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection366: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection367: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection368: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection369: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection370: +; HASWELL-O3-NEXT: lock cmpxchgq %rcx, (%rdi) +; HASWELL-O3-NEXT: movq $3, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i64 42, i64 1 seq_cst monotonic, align 8, !pcsections !0 @@ -10044,6 +12125,15 @@ define void @atomic64_cas_seq_cst_ptr_ty(ptr %a, ptr %v1, ptr %v2) { ; O3-NEXT: lock cmpxchgq %rdx, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic64_cas_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq %rsi, %rax +; HASWELL-O3-NEXT: movq foo(%rip), %rcx +; HASWELL-O3-NEXT: .Lpcsection371: +; HASWELL-O3-NEXT: lock cmpxchgq %rdx, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, ptr %v1, ptr %v2 seq_cst seq_cst, 
align 8, !pcsections !0 @@ -10102,6 +12192,18 @@ define i64 @atomic_use_cond(ptr %a) { ; O3-NEXT: .LBB197_2: # %else ; O3-NEXT: movl $2, %eax ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic_use_cond: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: .Lpcsection372: +; HASWELL-O3-NEXT: lock decq (%rdi) +; HASWELL-O3-NEXT: jne .LBB197_2 +; HASWELL-O3-NEXT: # %bb.1: # %then +; HASWELL-O3-NEXT: movl $1, %eax +; HASWELL-O3-NEXT: retq +; HASWELL-O3-NEXT: .LBB197_2: # %else +; HASWELL-O3-NEXT: movl $2, %eax +; HASWELL-O3-NEXT: retq entry: %x = atomicrmw sub ptr %a, i64 1 seq_cst, align 8, !pcsections !0 %y = icmp eq i64 %x, 1 @@ -10196,6 +12298,18 @@ define i128 @atomic128_load_unordered(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection373: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection374: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection375: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a unordered, align 16, !pcsections !0 @@ -10285,6 +12399,18 @@ define i128 @atomic128_load_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection376: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection377: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection378: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a monotonic, align 16, !pcsections !0 @@ -10374,6 +12500,18 @@ 
define i128 @atomic128_load_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection379: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection380: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection381: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a acquire, align 16, !pcsections !0 @@ -10463,6 +12601,18 @@ define i128 @atomic128_load_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection382: +; HASWELL-O3-NEXT: vmovdqa (%rdi), %xmm0 +; HASWELL-O3-NEXT: .Lpcsection383: +; HASWELL-O3-NEXT: vmovq %xmm0, %rax +; HASWELL-O3-NEXT: .Lpcsection384: +; HASWELL-O3-NEXT: vpextrq $1, %xmm0, %rdx +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic i128, ptr %a seq_cst, align 16, !pcsections !0 @@ -10502,6 +12652,14 @@ define ptr @atomic128_load_seq_cst_ptr_ty(ptr %a) { ; O3-NEXT: movq (%rdi), %rax ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_load_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection385: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = load atomic ptr, ptr %a seq_cst, align 16, !pcsections !0 @@ -10629,6 +12787,16 @@ define void @atomic128_store_unordered(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; 
HASWELL-O3-LABEL: atomic128_store_unordered: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection386: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection387: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a unordered, align 16, !pcsections !0 @@ -10756,6 +12924,16 @@ define void @atomic128_store_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection388: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection389: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a monotonic, align 16, !pcsections !0 @@ -10883,6 +13061,16 @@ define void @atomic128_store_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection390: +; HASWELL-O3-NEXT: vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection391: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a release, align 16, !pcsections !0 @@ -11010,6 +13198,18 @@ define void @atomic128_store_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection392: +; HASWELL-O3-NEXT: 
vmovss {{.*#+}} xmm0 = [42,0,0,0] +; HASWELL-O3-NEXT: .Lpcsection393: +; HASWELL-O3-NEXT: vmovaps %xmm0, (%rdi) +; HASWELL-O3-NEXT: .Lpcsection394: +; HASWELL-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic i128 42, ptr %a seq_cst, align 16, !pcsections !0 @@ -11049,6 +13249,14 @@ define void @atomic128_store_seq_cst_ptr_ty(ptr %a, ptr %v) { ; O3-NEXT: xchgq %rsi, (%rdi) ; O3-NEXT: movq $1, foo(%rip) ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_store_seq_cst_ptr_ty: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection395: +; HASWELL-O3-NEXT: xchgq %rsi, (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 store atomic ptr %v, ptr %a seq_cst, align 16, !pcsections !0 @@ -11176,6 +13384,33 @@ define void @atomic128_xchg_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection396: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection397: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection398: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB208_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection399: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection400: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection401: +; HASWELL-O3-NEXT: jne .LBB208_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; 
HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11309,6 +13544,35 @@ define void @atomic128_add_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection402: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection403: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB209_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection404: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection405: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection406: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection407: +; HASWELL-O3-NEXT: jne .LBB209_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11442,6 +13706,35 @@ define void @atomic128_sub_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection408: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection409: +; 
HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB210_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection410: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection411: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection412: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection413: +; HASWELL-O3-NEXT: jne .LBB210_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11574,6 +13867,34 @@ define void @atomic128_and_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection414: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection415: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB211_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection416: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection417: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection418: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection419: +; HASWELL-O3-NEXT: jne .LBB211_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; 
HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11699,6 +14020,33 @@ define void @atomic128_or_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection420: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection421: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB212_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection422: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection423: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection424: +; HASWELL-O3-NEXT: jne .LBB212_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11824,6 +14172,33 @@ define void @atomic128_xor_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection425: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection426: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB213_1: # %atomicrmw.start 
+; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection427: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection428: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection429: +; HASWELL-O3-NEXT: jne .LBB213_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 monotonic, align 16, !pcsections !0 @@ -11964,6 +14339,36 @@ define void @atomic128_nand_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection430: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection431: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection432: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB214_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection433: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection434: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection435: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection436: +; HASWELL-O3-NEXT: jne .LBB214_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 monotonic, align 16, 
!pcsections !0 @@ -12091,6 +14496,33 @@ define void @atomic128_xchg_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection437: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection438: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection439: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB215_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection440: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection441: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection442: +; HASWELL-O3-NEXT: jne .LBB215_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12224,6 +14656,35 @@ define void @atomic128_add_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection443: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection444: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB216_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: 
.Lpcsection445: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection446: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection447: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection448: +; HASWELL-O3-NEXT: jne .LBB216_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12357,6 +14818,35 @@ define void @atomic128_sub_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection449: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection450: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB217_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection451: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection452: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection453: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection454: +; HASWELL-O3-NEXT: jne .LBB217_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12489,6 +14979,34 @@ define void @atomic128_and_acquire(ptr %a) { ; 
O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection455: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection456: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB218_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection457: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection458: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection459: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection460: +; HASWELL-O3-NEXT: jne .LBB218_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12614,6 +15132,33 @@ define void @atomic128_or_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection461: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection462: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB219_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection463: +; HASWELL-O3-NEXT: orq $42, %rbx +; 
HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection464: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection465: +; HASWELL-O3-NEXT: jne .LBB219_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12739,6 +15284,33 @@ define void @atomic128_xor_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection466: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection467: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB220_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection468: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection469: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection470: +; HASWELL-O3-NEXT: jne .LBB220_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -12879,6 +15451,36 @@ define void @atomic128_nand_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: 
.cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection471: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection472: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection473: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB221_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection474: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection475: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection476: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection477: +; HASWELL-O3-NEXT: jne .LBB221_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 acquire, align 16, !pcsections !0 @@ -13006,6 +15608,33 @@ define void @atomic128_xchg_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_release: +; HASWELL-O3: # %bb.0: +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection478: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection479: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection480: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB222_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection481: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection482: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection483: +; 
HASWELL-O3-NEXT: jne .LBB222_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 release, align 16, !pcsections !0 store volatile i64 1, ptr @foo, align 8 @@ -13138,6 +15767,35 @@ define void @atomic128_add_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection484: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection485: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB223_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection486: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection487: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection488: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection489: +; HASWELL-O3-NEXT: jne .LBB223_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13271,6 +15929,35 @@ define void @atomic128_sub_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: 
.cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection490: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection491: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB224_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection492: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection493: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection494: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection495: +; HASWELL-O3-NEXT: jne .LBB224_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13403,6 +16090,34 @@ define void @atomic128_and_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection496: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection497: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB225_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection498: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection499: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection500: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection501: +; HASWELL-O3-NEXT: jne .LBB225_1 
+; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13528,6 +16243,33 @@ define void @atomic128_or_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection502: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection503: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB226_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection504: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection505: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection506: +; HASWELL-O3-NEXT: jne .LBB226_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13653,6 +16395,33 @@ define void @atomic128_xor_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection507: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; 
HASWELL-O3-NEXT: .Lpcsection508: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB227_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection509: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection510: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection511: +; HASWELL-O3-NEXT: jne .LBB227_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13793,6 +16562,36 @@ define void @atomic128_nand_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection512: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection513: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection514: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB228_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection515: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection516: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection517: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection518: +; HASWELL-O3-NEXT: jne .LBB228_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: 
.cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 release, align 16, !pcsections !0 @@ -13920,6 +16719,33 @@ define void @atomic128_xchg_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection519: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection520: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection521: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB229_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection522: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection523: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection524: +; HASWELL-O3-NEXT: jne .LBB229_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14053,6 +16879,35 @@ define void @atomic128_add_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection525: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection526: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: 
.LBB230_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection527: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection528: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection529: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection530: +; HASWELL-O3-NEXT: jne .LBB230_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14186,6 +17041,35 @@ define void @atomic128_sub_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection531: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection532: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB231_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection533: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection534: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection535: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection536: +; HASWELL-O3-NEXT: jne .LBB231_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x 
= atomicrmw sub ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14318,6 +17202,34 @@ define void @atomic128_and_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection537: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection538: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB232_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection539: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection540: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection541: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection542: +; HASWELL-O3-NEXT: jne .LBB232_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14443,6 +17355,33 @@ define void @atomic128_or_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection543: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection544: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB233_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop 
Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection545: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection546: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection547: +; HASWELL-O3-NEXT: jne .LBB233_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14568,6 +17507,33 @@ define void @atomic128_xor_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection548: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection549: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB234_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection550: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection551: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection552: +; HASWELL-O3-NEXT: jne .LBB234_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14708,6 +17674,36 @@ define void @atomic128_nand_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; 
HASWELL-O3-LABEL: atomic128_nand_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection553: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection554: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection555: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB235_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection556: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection557: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection558: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection559: +; HASWELL-O3-NEXT: jne .LBB235_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 acq_rel, align 16, !pcsections !0 @@ -14835,6 +17831,33 @@ define void @atomic128_xchg_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xchg_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection560: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection561: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection562: +; HASWELL-O3-NEXT: movl $42, %ebx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB236_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: .Lpcsection563: +; HASWELL-O3-NEXT: 
xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection564: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection565: +; HASWELL-O3-NEXT: jne .LBB236_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xchg ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -14968,6 +17991,35 @@ define void @atomic128_add_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_add_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection566: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection567: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB237_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection568: +; HASWELL-O3-NEXT: addq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection569: +; HASWELL-O3-NEXT: adcq $0, %rcx +; HASWELL-O3-NEXT: .Lpcsection570: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection571: +; HASWELL-O3-NEXT: jne .LBB237_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw add ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15101,6 +18153,35 @@ define void @atomic128_sub_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_sub_seq_cst: +; HASWELL-O3: # %bb.0: # 
%entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection572: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection573: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB238_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection574: +; HASWELL-O3-NEXT: addq $-42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection575: +; HASWELL-O3-NEXT: adcq $-1, %rcx +; HASWELL-O3-NEXT: .Lpcsection576: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection577: +; HASWELL-O3-NEXT: jne .LBB238_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw sub ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15233,6 +18314,34 @@ define void @atomic128_and_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_and_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection578: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection579: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB239_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection580: +; HASWELL-O3-NEXT: andl $42, %ebx +; HASWELL-O3-NEXT: .Lpcsection581: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection582: +; 
HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection583: +; HASWELL-O3-NEXT: jne .LBB239_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw and ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15358,6 +18467,33 @@ define void @atomic128_or_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_or_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection584: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection585: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB240_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection586: +; HASWELL-O3-NEXT: orq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection587: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection588: +; HASWELL-O3-NEXT: jne .LBB240_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw or ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15483,6 +18619,33 @@ define void @atomic128_xor_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_xor_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; 
HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection589: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection590: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB241_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movq %rax, %rbx +; HASWELL-O3-NEXT: .Lpcsection591: +; HASWELL-O3-NEXT: xorq $42, %rbx +; HASWELL-O3-NEXT: movq %rdx, %rcx +; HASWELL-O3-NEXT: .Lpcsection592: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection593: +; HASWELL-O3-NEXT: jne .LBB241_1 +; HASWELL-O3-NEXT: # %bb.2: # %atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw xor ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15623,6 +18786,36 @@ define void @atomic128_nand_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_nand_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection594: +; HASWELL-O3-NEXT: movq (%rdi), %rax +; HASWELL-O3-NEXT: .Lpcsection595: +; HASWELL-O3-NEXT: movq 8(%rdi), %rdx +; HASWELL-O3-NEXT: .Lpcsection596: +; HASWELL-O3-NEXT: movq $-1, %rcx +; HASWELL-O3-NEXT: .p2align 4 +; HASWELL-O3-NEXT: .LBB242_1: # %atomicrmw.start +; HASWELL-O3-NEXT: # =>This Inner Loop Header: Depth=1 +; HASWELL-O3-NEXT: movl %eax, %ebx +; HASWELL-O3-NEXT: .Lpcsection597: +; HASWELL-O3-NEXT: notl %ebx +; HASWELL-O3-NEXT: .Lpcsection598: +; HASWELL-O3-NEXT: orq $-43, %rbx +; HASWELL-O3-NEXT: .Lpcsection599: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection600: +; HASWELL-O3-NEXT: jne .LBB242_1 +; HASWELL-O3-NEXT: # %bb.2: # 
%atomicrmw.end +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = atomicrmw nand ptr %a, i128 42 seq_cst, align 16, !pcsections !0 @@ -15781,6 +18974,43 @@ define void @atomic128_cas_monotonic(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_monotonic: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection601: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection602: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection603: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection604: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection605: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection606: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection607: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection608: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection609: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection610: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection611: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection612: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection613: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 monotonic monotonic, align 16, !pcsections !0 @@ -15941,6 +19171,43 @@ define void @atomic128_cas_acquire(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: 
retq +; +; HASWELL-O3-LABEL: atomic128_cas_acquire: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection614: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection615: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection616: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection617: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection618: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection619: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection620: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection621: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection622: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection623: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection624: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection625: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection626: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 acquire monotonic, align 16, !pcsections !0 @@ -16101,6 +19368,43 @@ define void @atomic128_cas_release(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_release: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection627: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection628: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: 
.Lpcsection629: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection630: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection631: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection632: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection633: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection634: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection635: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection636: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection637: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection638: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection639: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 release monotonic, align 16, !pcsections !0 @@ -16261,6 +19565,43 @@ define void @atomic128_cas_acq_rel(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_acq_rel: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection640: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection641: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection642: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection643: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection644: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection645: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection646: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection647: +; HASWELL-O3-NEXT: xorl 
%ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection648: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection649: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection650: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection651: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection652: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq $1, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 acq_rel monotonic, align 16, !pcsections !0 @@ -16421,6 +19762,43 @@ define void @atomic128_cas_seq_cst(ptr %a) { ; O3-NEXT: popq %rbx ; O3-NEXT: .cfi_def_cfa_offset 8 ; O3-NEXT: retq +; +; HASWELL-O3-LABEL: atomic128_cas_seq_cst: +; HASWELL-O3: # %bb.0: # %entry +; HASWELL-O3-NEXT: pushq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 16 +; HASWELL-O3-NEXT: .cfi_offset %rbx, -16 +; HASWELL-O3-NEXT: movq foo(%rip), %rax +; HASWELL-O3-NEXT: .Lpcsection653: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection654: +; HASWELL-O3-NEXT: movl $1, %ebx +; HASWELL-O3-NEXT: .Lpcsection655: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection656: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection657: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection658: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection659: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection660: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection661: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: .Lpcsection662: +; HASWELL-O3-NEXT: movl $42, %eax +; HASWELL-O3-NEXT: .Lpcsection663: +; HASWELL-O3-NEXT: xorl %edx, %edx +; HASWELL-O3-NEXT: .Lpcsection664: +; HASWELL-O3-NEXT: xorl %ecx, %ecx +; HASWELL-O3-NEXT: .Lpcsection665: +; HASWELL-O3-NEXT: lock cmpxchg16b (%rdi) +; HASWELL-O3-NEXT: movq 
$3, foo(%rip) +; HASWELL-O3-NEXT: popq %rbx +; HASWELL-O3-NEXT: .cfi_def_cfa_offset 8 +; HASWELL-O3-NEXT: retq entry: load volatile i64, ptr @foo, align 8 %x = cmpxchg ptr %a, i128 42, i128 1 seq_cst monotonic, align 16, !pcsections !0 From 0f71424280af9e3293ed481399b2b53ca708cd15 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 17 Jul 2025 07:13:50 -0700 Subject: [PATCH 176/813] [RISCV] Teach SelectAddrRegRegScale that ADD is commutable. (#149231) --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 27 +++++++++----- llvm/test/CodeGen/RISCV/xqcisls.ll | 10 +++--- llvm/test/CodeGen/RISCV/xtheadmemidx.ll | 40 ++++++++++----------- 3 files changed, 42 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 0f948b22759fe..cfec46d23d65b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -3058,17 +3058,28 @@ bool RISCVDAGToDAGISel::SelectAddrRegRegScale(SDValue Addr, }; if (auto *C1 = dyn_cast(RHS)) { + // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) if (LHS.getOpcode() == ISD::ADD && - SelectShl(LHS.getOperand(0), Index, Scale) && !isa(LHS.getOperand(1)) && isInt<12>(C1->getSExtValue())) { - // (add (add (shl A C2) B) C1) -> (add (add B C1) (shl A C2)) - SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), - SDLoc(Addr), VT); - Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, - LHS.getOperand(1), C1Val), - 0); - return true; + if (SelectShl(LHS.getOperand(1), Index, Scale)) { + SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), + SDLoc(Addr), VT); + Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, + LHS.getOperand(0), C1Val), + 0); + return true; + } + + // Add is commutative so we need to check both operands. 
+ if (SelectShl(LHS.getOperand(0), Index, Scale)) { + SDValue C1Val = CurDAG->getTargetConstant(*C1->getConstantIntValue(), + SDLoc(Addr), VT); + Base = SDValue(CurDAG->getMachineNode(RISCV::ADDI, SDLoc(Addr), VT, + LHS.getOperand(1), C1Val), + 0); + return true; + } } // Don't match add with constants. diff --git a/llvm/test/CodeGen/RISCV/xqcisls.ll b/llvm/test/CodeGen/RISCV/xqcisls.ll index 828a0760044aa..709dc4ce074dc 100644 --- a/llvm/test/CodeGen/RISCV/xqcisls.ll +++ b/llvm/test/CodeGen/RISCV/xqcisls.ll @@ -309,8 +309,8 @@ define i64 @lrd(ptr %a, i32 %b) { ; RV32IZBAXQCISLS-LABEL: lrd: ; RV32IZBAXQCISLS: # %bb.0: ; RV32IZBAXQCISLS-NEXT: qc.lrw a2, a0, a1, 3 -; RV32IZBAXQCISLS-NEXT: sh3add a0, a1, a0 -; RV32IZBAXQCISLS-NEXT: lw a1, 4(a0) +; RV32IZBAXQCISLS-NEXT: addi a0, a0, 4 +; RV32IZBAXQCISLS-NEXT: qc.lrw a1, a0, a1, 3 ; RV32IZBAXQCISLS-NEXT: add a0, a2, a2 ; RV32IZBAXQCISLS-NEXT: sltu a2, a0, a2 ; RV32IZBAXQCISLS-NEXT: add a1, a1, a1 @@ -473,10 +473,10 @@ define void @srd(ptr %a, i32 %b, i64 %c) { ; RV32IZBAXQCISLS-NEXT: add a4, a2, a2 ; RV32IZBAXQCISLS-NEXT: add a3, a3, a3 ; RV32IZBAXQCISLS-NEXT: sltu a2, a4, a2 -; RV32IZBAXQCISLS-NEXT: add a2, a3, a2 -; RV32IZBAXQCISLS-NEXT: sh3add a3, a1, a0 ; RV32IZBAXQCISLS-NEXT: qc.srw a4, a0, a1, 3 -; RV32IZBAXQCISLS-NEXT: sw a2, 4(a3) +; RV32IZBAXQCISLS-NEXT: add a2, a3, a2 +; RV32IZBAXQCISLS-NEXT: addi a0, a0, 4 +; RV32IZBAXQCISLS-NEXT: qc.srw a2, a0, a1, 3 ; RV32IZBAXQCISLS-NEXT: ret %1 = add i64 %c, %c %2 = getelementptr i64, ptr %a, i32 %b diff --git a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll index 578f51a957a75..fc20fcb371179 100644 --- a/llvm/test/CodeGen/RISCV/xtheadmemidx.ll +++ b/llvm/test/CodeGen/RISCV/xtheadmemidx.ll @@ -858,14 +858,13 @@ define i64 @lurwu(ptr %a, i32 %b) { define i64 @lrd(ptr %a, i64 %b) { ; RV32XTHEADMEMIDX-LABEL: lrd: ; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: slli a2, a1, 3 +; RV32XTHEADMEMIDX-NEXT: th.lrw a2, a0, a1, 3 +; 
RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4 ; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a0, a0, a2 -; RV32XTHEADMEMIDX-NEXT: lw a2, 4(a0) -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1 +; RV32XTHEADMEMIDX-NEXT: add a0, a2, a2 +; RV32XTHEADMEMIDX-NEXT: sltu a2, a0, a2 +; RV32XTHEADMEMIDX-NEXT: add a1, a1, a1 +; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lrd: @@ -908,14 +907,13 @@ define i64 @lrd_2(ptr %a, i64 %b) { define i64 @lurd(ptr %a, i32 %b) { ; RV32XTHEADMEMIDX-LABEL: lurd: ; RV32XTHEADMEMIDX: # %bb.0: -; RV32XTHEADMEMIDX-NEXT: slli a2, a1, 3 +; RV32XTHEADMEMIDX-NEXT: th.lrw a2, a0, a1, 3 +; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4 ; RV32XTHEADMEMIDX-NEXT: th.lrw a1, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a0, a0, a2 -; RV32XTHEADMEMIDX-NEXT: lw a2, 4(a0) -; RV32XTHEADMEMIDX-NEXT: add a0, a1, a1 -; RV32XTHEADMEMIDX-NEXT: sltu a1, a0, a1 -; RV32XTHEADMEMIDX-NEXT: add a2, a2, a2 -; RV32XTHEADMEMIDX-NEXT: add a1, a2, a1 +; RV32XTHEADMEMIDX-NEXT: add a0, a2, a2 +; RV32XTHEADMEMIDX-NEXT: sltu a2, a0, a2 +; RV32XTHEADMEMIDX-NEXT: add a1, a1, a1 +; RV32XTHEADMEMIDX-NEXT: add a1, a1, a2 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: lurd: @@ -1047,11 +1045,10 @@ define void @srd(ptr %a, i64 %b, i64 %c) { ; RV32XTHEADMEMIDX-NEXT: add a2, a3, a3 ; RV32XTHEADMEMIDX-NEXT: add a4, a4, a4 ; RV32XTHEADMEMIDX-NEXT: sltu a3, a2, a3 -; RV32XTHEADMEMIDX-NEXT: add a3, a4, a3 -; RV32XTHEADMEMIDX-NEXT: slli a4, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a4, a0, a4 ; RV32XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: sw a3, 4(a4) +; RV32XTHEADMEMIDX-NEXT: add a3, a4, a3 +; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4 +; RV32XTHEADMEMIDX-NEXT: th.srw a3, a0, a1, 3 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: srd: @@ -1071,11 +1068,10 @@ define void @surd(ptr %a, i32 
%b, i64 %c) { ; RV32XTHEADMEMIDX-NEXT: add a4, a2, a2 ; RV32XTHEADMEMIDX-NEXT: add a3, a3, a3 ; RV32XTHEADMEMIDX-NEXT: sltu a2, a4, a2 -; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2 -; RV32XTHEADMEMIDX-NEXT: slli a3, a1, 3 -; RV32XTHEADMEMIDX-NEXT: add a3, a0, a3 ; RV32XTHEADMEMIDX-NEXT: th.srw a4, a0, a1, 3 -; RV32XTHEADMEMIDX-NEXT: sw a2, 4(a3) +; RV32XTHEADMEMIDX-NEXT: add a2, a3, a2 +; RV32XTHEADMEMIDX-NEXT: addi a0, a0, 4 +; RV32XTHEADMEMIDX-NEXT: th.srw a2, a0, a1, 3 ; RV32XTHEADMEMIDX-NEXT: ret ; ; RV64XTHEADMEMIDX-LABEL: surd: From ab25de7dec2af661b66b23b9794291f2fd81b6bc Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 17 Jul 2025 15:20:24 +0100 Subject: [PATCH 177/813] [AMDGPU] Move common fields out of WaitcntBrackets. NFC. (#148864) WaitcntBrackets holds per-basic-block information about the state of wait counters. It also held a bunch of fields that are constant throughout a run of the pass. This patch moves them out into the SIInsertWaitcnts class, for better logical separation and to save a tiny bit of memory. 
--- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 122 ++++++++++---------- 1 file changed, 59 insertions(+), 63 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index cb72a64eaee2a..2af0a575a8885 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -407,8 +407,13 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { }; class SIInsertWaitcnts { +public: + const GCNSubtarget *ST; + InstCounterType SmemAccessCounter; + InstCounterType MaxCounter; + const unsigned *WaitEventMaskForInst; + private: - const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; @@ -424,8 +429,6 @@ class SIInsertWaitcnts { bool Dirty = true; }; - InstCounterType SmemAccessCounter; - MapVector BlockInfos; bool ForceEmitWaitcnt[NUM_INST_CNTS]; @@ -442,7 +445,7 @@ class SIInsertWaitcnts { // message. DenseSet ReleaseVGPRInsts; - InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; + HardwareLimits Limits; public: SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, @@ -453,6 +456,30 @@ class SIInsertWaitcnts { (void)ForceVMCounter; } + unsigned getWaitCountMax(InstCounterType T) const { + switch (T) { + case LOAD_CNT: + return Limits.LoadcntMax; + case DS_CNT: + return Limits.DscntMax; + case EXP_CNT: + return Limits.ExpcntMax; + case STORE_CNT: + return Limits.StorecntMax; + case SAMPLE_CNT: + return Limits.SamplecntMax; + case BVH_CNT: + return Limits.BvhcntMax; + case KM_CNT: + return Limits.KmcntMax; + case X_CNT: + return Limits.XcntMax; + default: + break; + } + return 0; + } + bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets); bool isPreheaderToFlush(MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets); @@ -568,39 +595,10 @@ class SIInsertWaitcnts { // "s_waitcnt 0" before use. 
class WaitcntBrackets { public: - WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter, - HardwareLimits Limits, const unsigned *WaitEventMaskForInst, - InstCounterType SmemAccessCounter) - : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits), - WaitEventMaskForInst(WaitEventMaskForInst), - SmemAccessCounter(SmemAccessCounter) {} - - unsigned getWaitCountMax(InstCounterType T) const { - switch (T) { - case LOAD_CNT: - return Limits.LoadcntMax; - case DS_CNT: - return Limits.DscntMax; - case EXP_CNT: - return Limits.ExpcntMax; - case STORE_CNT: - return Limits.StorecntMax; - case SAMPLE_CNT: - return Limits.SamplecntMax; - case BVH_CNT: - return Limits.BvhcntMax; - case KM_CNT: - return Limits.KmcntMax; - case X_CNT: - return Limits.XcntMax; - default: - break; - } - return 0; - } + WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {} bool isSmemCounter(InstCounterType T) const { - return T == SmemAccessCounter || T == X_CNT; + return T == Context->SmemAccessCounter || T == X_CNT; } unsigned getSgprScoresIdx(InstCounterType T) const { @@ -658,7 +656,7 @@ class WaitcntBrackets { return PendingEvents & (1 << E); } unsigned hasPendingEvent(InstCounterType T) const { - unsigned HasPending = PendingEvents & WaitEventMaskForInst[T]; + unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T]; assert((HasPending != 0) == (getScoreRange(T) != 0)); return HasPending; } @@ -686,7 +684,8 @@ class WaitcntBrackets { } unsigned getPendingGDSWait() const { - return std::min(getScoreUB(DS_CNT) - LastGDS, getWaitCountMax(DS_CNT) - 1); + return std::min(getScoreUB(DS_CNT) - LastGDS, + Context->getWaitCountMax(DS_CNT) - 1); } void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; } @@ -710,8 +709,9 @@ class WaitcntBrackets { } void setStateOnFunctionEntryOrReturn() { - setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT)); - PendingEvents |= WaitEventMaskForInst[STORE_CNT]; + setScoreUB(STORE_CNT, + 
getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT)); + PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT]; } ArrayRef getLDSDMAStores() const { @@ -747,8 +747,8 @@ class WaitcntBrackets { if (T != EXP_CNT) return; - if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT)) - ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT); + if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT)) + ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT); } void setRegScore(int GprNo, InstCounterType T, unsigned Val) { @@ -763,11 +763,8 @@ class WaitcntBrackets { const MachineOperand &Op, InstCounterType CntTy, unsigned Val); - const GCNSubtarget *ST = nullptr; - InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS; - HardwareLimits Limits = {}; - const unsigned *WaitEventMaskForInst; - InstCounterType SmemAccessCounter; + const SIInsertWaitcnts *Context; + unsigned ScoreLBs[NUM_INST_CNTS] = {0}; unsigned ScoreUBs[NUM_INST_CNTS] = {0}; unsigned PendingEvents = 0; @@ -829,7 +826,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, RegInterval Result; - MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *ST); + MCRegister MCReg = AMDGPU::getMCReg(Op.getReg(), *Context->ST); unsigned RegIdx = TRI->getHWRegIndex(MCReg); assert(isUInt<8>(RegIdx)); @@ -887,7 +884,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI, // this at compile time, so we have to assume it might be applied if the // instruction supports it). 
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const { - if (!ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) + if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI)) return false; const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); @@ -913,7 +910,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI, WaitEventType E, MachineInstr &Inst) { - InstCounterType T = eventCounter(WaitEventMaskForInst, E); + InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E); unsigned UB = getScoreUB(T); unsigned CurrScore = UB + 1; @@ -1082,8 +1079,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII, } void WaitcntBrackets::print(raw_ostream &OS) const { + const GCNSubtarget *ST = Context->ST; + OS << '\n'; - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { unsigned SR = getScoreRange(T); switch (T) { @@ -1197,7 +1196,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // s_waitcnt instruction. if ((UB >= ScoreToWait) && (ScoreToWait > LB)) { if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() && - !ST->hasFlatLgkmVMemCountInOrder()) { + !Context->ST->hasFlatLgkmVMemCountInOrder()) { // If there is a pending FLAT operation, and this is a VMem or LGKM // waitcnt and the target can report early completion, then we need // to force a waitcnt 0. @@ -1211,7 +1210,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval, // If a counter has been maxed out avoid overflow by waiting for // MAX(CounterType) - 1 instead. 
unsigned NeededWait = - std::min(UB - ScoreToWait, getWaitCountMax(T) - 1); + std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1); addWait(Wait, T, NeededWait); } } @@ -1239,7 +1238,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { setScoreLB(T, std::max(getScoreLB(T), UB - Count)); } else { setScoreLB(T, UB); - PendingEvents &= ~WaitEventMaskForInst[T]; + PendingEvents &= ~Context->WaitEventMaskForInst[T]; } } @@ -1264,7 +1263,7 @@ void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { // the decrement may go out of order. bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { // Scalar memory read always can go out of order. - if ((T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || + if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) || (T == X_CNT && hasPendingEvent(SMEM_GROUP))) return true; return hasMixedPendingEvents(T); @@ -2388,8 +2387,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) { VgprUB = std::max(VgprUB, Other.VgprUB); SgprUB = std::max(SgprUB, Other.SgprUB); - for (auto T : inst_counter_types(MaxCounter)) { + for (auto T : inst_counter_types(Context->MaxCounter)) { // Merge event flags for this counter + const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst; const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T]; const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T]; if (OtherEvents & ~OldEvents) @@ -2748,11 +2748,10 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; - const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask(); + WaitEventMaskForInst = WCG->getWaitEventMask(); SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS); - HardwareLimits Limits = {}; if (ST->hasExtendedWaitCounts()) { Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV); Limits.DscntMax = AMDGPU::getDscntBitMask(IV); @@ -2809,8 +2808,7 @@ bool 
SIInsertWaitcnts::run(MachineFunction &MF) { BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); } - auto NonKernelInitialState = std::make_unique( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + auto NonKernelInitialState = std::make_unique(this); NonKernelInitialState->setStateOnFunctionEntryOrReturn(); BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState); @@ -2841,15 +2839,13 @@ bool SIInsertWaitcnts::run(MachineFunction &MF) { *Brackets = *BI.Incoming; } else { if (!Brackets) { - Brackets = std::make_unique( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + Brackets = std::make_unique(this); } else { // Reinitialize in-place. N.B. do not do this by assigning from a // temporary because the WaitcntBrackets class is large and it could // cause this function to use an unreasonable amount of stack space. Brackets->~WaitcntBrackets(); - new (Brackets.get()) WaitcntBrackets( - ST, MaxCounter, Limits, WaitEventMaskForInst, SmemAccessCounter); + new (Brackets.get()) WaitcntBrackets(this); } } From 96a7e954e1501239d1fc4bd6eba60428bd6609f7 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 07:22:36 -0700 Subject: [PATCH 178/813] [Sema] Remove unnecessary casts (NFC) (#149253) getParam already returns NamedDecl *. --- clang/lib/Sema/SemaTemplateDeduction.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index d09a72b71b805..e1a975bcfb3e1 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3083,8 +3083,7 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( // If there was no default argument, deduction is incomplete. 
if (DefArg.getArgument().isNull()) { - Info.Param = makeTemplateParameter( - const_cast(TemplateParams->getParam(I))); + Info.Param = makeTemplateParameter(TemplateParams->getParam(I)); Info.reset( TemplateArgumentList::CreateCopy(S.Context, CTAI.SugaredConverted), TemplateArgumentList::CreateCopy(S.Context, CTAI.CanonicalConverted)); @@ -3100,8 +3099,7 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( if (S.CheckTemplateArgument( Param, DefArg, TD, TD->getLocation(), TD->getSourceRange().getEnd(), /*ArgumentPackIndex=*/0, CTAI, Sema::CTAK_Specified)) { - Info.Param = makeTemplateParameter( - const_cast(TemplateParams->getParam(I))); + Info.Param = makeTemplateParameter(TemplateParams->getParam(I)); // FIXME: These template arguments are temporary. Free them! Info.reset( TemplateArgumentList::CreateCopy(S.Context, CTAI.SugaredConverted), @@ -3227,7 +3225,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( if (ParamIdx >= TPL->size()) ParamIdx = TPL->size() - 1; - Decl *Param = const_cast(TPL->getParam(ParamIdx)); + Decl *Param = TPL->getParam(ParamIdx); Info.Param = makeTemplateParameter(Param); Info.FirstArg = Ps[ArgIdx].getArgument(); return TemplateDeductionResult::SubstitutionFailure; From 7fa48ce547ef9e0516564eca9c375109e83f2f71 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 07:22:43 -0700 Subject: [PATCH 179/813] [AMDGPU] Remove an unnecessary cast (NFC) (#149254) getTargetLowering() already returns const SITargetLowering *. 
--- llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index b0d6fd95cd271..5097ac03954d5 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2225,8 +2225,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MachineBasicBlock::iterator E = MBB->end(); MachineBasicBlock::iterator MBBI = MI.getIterator(); ++MBBI; - const SITargetLowering *TLI = - static_cast(STM->getTargetLowering()); + const SITargetLowering *TLI = STM->getTargetLowering(); for ( ; MBBI != E; ++MBBI) { MachineInstr &MINext = *MBBI; From 96bde11e307e53a1263ab6088f172716db7cb0d8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 07:22:51 -0700 Subject: [PATCH 180/813] [TargetParser] Remove const from a return type (NFC) (#149255) getHostCPUFeatures constructs and returns a temporary instance of StringMap. We don't need const on the return type. --- llvm/include/llvm/TargetParser/Host.h | 2 +- llvm/lib/TargetParser/Host.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/TargetParser/Host.h b/llvm/include/llvm/TargetParser/Host.h index be3d41e022ad9..40a9b6cc13902 100644 --- a/llvm/include/llvm/TargetParser/Host.h +++ b/llvm/include/llvm/TargetParser/Host.h @@ -53,7 +53,7 @@ LLVM_ABI StringRef getHostCPUName(); /// which features may appear in this map, except that they are all valid LLVM /// feature names. The map can be empty, for example if feature detection /// fails. -LLVM_ABI const StringMap getHostCPUFeatures(); +LLVM_ABI StringMap getHostCPUFeatures(); /// This is a function compatible with cl::AddExtraVersionPrinter, which adds /// info about the current target triple and detected CPU. 
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 8fd91fcd33f63..78bd5b4b5bd25 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -1855,7 +1855,7 @@ VendorSignatures getVendorSignature(unsigned *MaxLeaf) { #if defined(__i386__) || defined(_M_IX86) || \ defined(__x86_64__) || defined(_M_X64) -const StringMap sys::getHostCPUFeatures() { +StringMap sys::getHostCPUFeatures() { unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0; unsigned MaxLevel; StringMap Features; @@ -2068,7 +2068,7 @@ const StringMap sys::getHostCPUFeatures() { return Features; } #elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__)) -const StringMap sys::getHostCPUFeatures() { +StringMap sys::getHostCPUFeatures() { StringMap Features; std::unique_ptr P = getProcCpuinfoContent(); if (!P) @@ -2148,7 +2148,7 @@ const StringMap sys::getHostCPUFeatures() { return Features; } #elif defined(_WIN32) && (defined(__aarch64__) || defined(_M_ARM64)) -const StringMap sys::getHostCPUFeatures() { +StringMap sys::getHostCPUFeatures() { StringMap Features; // If we're asking the OS at runtime, believe what the OS says @@ -2167,7 +2167,7 @@ const StringMap sys::getHostCPUFeatures() { } #elif defined(__linux__) && defined(__loongarch__) #include -const StringMap sys::getHostCPUFeatures() { +StringMap sys::getHostCPUFeatures() { unsigned long hwcap = getauxval(AT_HWCAP); bool HasFPU = hwcap & (1UL << 3); // HWCAP_LOONGARCH_FPU uint32_t cpucfg2 = 0x2, cpucfg3 = 0x3; @@ -2196,7 +2196,7 @@ const StringMap sys::getHostCPUFeatures() { return Features; } #elif defined(__linux__) && defined(__riscv) -const StringMap sys::getHostCPUFeatures() { +StringMap sys::getHostCPUFeatures() { RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_BASE_BEHAVIOR=*/3, 0}, {/*RISCV_HWPROBE_KEY_IMA_EXT_0=*/4, 0}, {/*RISCV_HWPROBE_KEY_MISALIGNED_SCALAR_PERF=*/9, 0}}; @@ -2279,7 +2279,7 @@ const StringMap sys::getHostCPUFeatures() { return Features; } #else -const StringMap 
sys::getHostCPUFeatures() { return {}; } +StringMap sys::getHostCPUFeatures() { return {}; } #endif #if __APPLE__ From 73e8ada540acbd60f916ef4b0a5a2b454c8ece44 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 07:22:59 -0700 Subject: [PATCH 181/813] [Sema] Use llvm::all_of (NFC) (#149256) We can pass a range to llvm::all_of. --- clang/lib/Sema/AnalysisBasedWarnings.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 5eba024e83634..d1400cbfc884d 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -522,8 +522,7 @@ static bool areAllValuesNoReturn(const VarDecl *VD, const CFGBlock &VarBlk, } // If all checked blocks satisfy the condition, the check is finished. - if (std::all_of(BlocksToCheck.begin(), BlocksToCheck.end(), - BlockSatisfiesCondition)) + if (llvm::all_of(BlocksToCheck, BlockSatisfiesCondition)) return true; // If this block does not contain the variable definition, check From 577585198637fc2ced2a4fdf20f91c58fb74c717 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 07:23:07 -0700 Subject: [PATCH 182/813] [llvm] Use *Map::try_emplace (NFC) (#149257) - try_emplace(Key) is shorter than insert({Key, nullptr}). - try_emplace performs value initialization without value parameters. - We overwrite values on successful insertion anyway. While we are at it, this patch simplifies the code with structured binding. 
--- llvm/include/llvm/ADT/EquivalenceClasses.h | 8 ++++---- llvm/lib/MC/MCContext.cpp | 5 ++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/ADT/EquivalenceClasses.h b/llvm/include/llvm/ADT/EquivalenceClasses.h index b1009f8b49992..1a2331c1a0322 100644 --- a/llvm/include/llvm/ADT/EquivalenceClasses.h +++ b/llvm/include/llvm/ADT/EquivalenceClasses.h @@ -218,12 +218,12 @@ template class EquivalenceClasses { /// insert - Insert a new value into the union/find set, ignoring the request /// if the value already exists. const ECValue &insert(const ElemTy &Data) { - auto I = TheMapping.insert({Data, nullptr}); - if (!I.second) - return *I.first->second; + auto [I, Inserted] = TheMapping.try_emplace(Data); + if (!Inserted) + return *I->second; auto *ECV = new (ECValueAllocator) ECValue(Data); - I.first->second = ECV; + I->second = ECV; Members.push_back(ECV); return *ECV; } diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 070be621a4b2c..12b3fbab8fb8f 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -734,9 +734,8 @@ MCSectionGOFF *MCContext::getGOFFSection(SectionKind Kind, StringRef Name, UniqueName.append("/").append(P->getName()); } // Do the lookup. If we don't have a hit, return a new section. - auto IterBool = GOFFUniquingMap.insert(std::make_pair(UniqueName, nullptr)); - auto Iter = IterBool.first; - if (!IterBool.second) + auto [Iter, Inserted] = GOFFUniquingMap.try_emplace(UniqueName); + if (!Inserted) return Iter->second; StringRef CachedName = StringRef(Iter->first.c_str(), Name.size()); From 756e07734b4f2aa64bd1bd5b6a29389c4c642323 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 17 Jul 2025 14:22:51 +0000 Subject: [PATCH 183/813] [lldb][test] Adjust TestTypeList.py on Windows with exceptions Since https://github.com/llvm/llvm-project/pull/148691 enabled exceptions when compiling the tests, this test has been failing. 
Much like was noted there, one of the variables disappeared from the debug info. Giving it a non-zero size and initialising it fixed that. --- lldb/test/API/python_api/type/main.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/lldb/test/API/python_api/type/main.cpp b/lldb/test/API/python_api/type/main.cpp index 6acde5bb666a6..449f77db0d75e 100644 --- a/lldb/test/API/python_api/type/main.cpp +++ b/lldb/test/API/python_api/type/main.cpp @@ -44,7 +44,12 @@ template struct PointerInfo { }; template > -struct Pointer {}; +struct Pointer { + // When compiling for Windows with exceptions enabled, this struct + // must contain something that takes space and is initialised. + // Otherwise it will not be present in the debug information. + int pad = 0; +}; enum EnumType {}; enum class ScopedEnumType {}; From 9de32d56e4fdf08d88aca74149f5f815eb6505ec Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Thu, 17 Jul 2025 07:25:12 -0700 Subject: [PATCH 184/813] [clang-tools-extra] Bump ReleaseNotes to 22.0.0git (#149306) Move over change post 21.x branch cut. --- clang-tools-extra/docs/ReleaseNotes.rst | 287 +----------------------- 1 file changed, 1 insertion(+), 286 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 9eb3835fe8340..07ebf8008928d 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -58,9 +58,6 @@ Semantic Highlighting Compile flags ^^^^^^^^^^^^^ -- Added `BuiltinHeaders` config key which controls whether clangd's built-in - headers are used or ones extracted from the driver. - Hover ^^^^^ @@ -93,301 +90,18 @@ Improvements to clang-query arguments. So when porting a query to C++, remove all instances of trailing comma (otherwise C++ compiler will just complain about "expected expression"). 
-Improvements to include-cleaner -------------------------------- -- Deprecated the ``-insert`` and ``-remove`` command line options, and added - the ``-disable-remove`` and ``-disable-insert`` command line options as - replacements. The previous command line options were confusing because they - did not imply the default state of the option (which is inserts and removes - being enabled). The new options are easier to understand the semantics of. - Improvements to clang-tidy -------------------------- -- Changed the :program:`check_clang_tidy.py` tool to use FileCheck's - ``--match-full-lines`` instead of ``strict-whitespace`` for ``CHECK-FIXES`` - clauses. Added a ``--match-partial-fixes`` option to keep previous behavior on - specific tests. This may break tests for users with custom out-of-tree checks - who use :program:`check_clang_tidy.py` as-is. - -- Improved :program:`clang-tidy-diff.py` script. Add the `-warnings-as-errors` - argument to treat warnings as errors. - -- Improved :program:`clang-tidy` to show `CheckOptions` only for checks enabled - in `Checks` when running ``--dump-config``. - -- Fixed bug in :program:`clang-tidy` by which `HeaderFilterRegex` did not take - effect when passed via the `.clang-tidy` file. - -- Fixed bug in :program:`run_clang_tidy.py` where the program would not - correctly display the checks enabled by the top-level `.clang-tidy` file. - New checks ^^^^^^^^^^ -- New :doc:`bugprone-capturing-this-in-member-variable - ` check. - - Finds lambda captures and ``bind`` function calls that capture the ``this`` - pointer and store it as class members without handle the copy and move - constructors and the assignments. - -- New :doc:`bugprone-misleading-setter-of-reference - ` check. - - Finds setter-like member functions that take a pointer parameter and set a - reference member of the same class with the pointed value. - -- New :doc:`bugprone-unintended-char-ostream-output - ` check. 
- - Finds unintended character output from ``unsigned char`` and ``signed char`` - to an ``ostream``. - -- New :doc:`cppcoreguidelines-use-enum-class - ` check. - - Finds unscoped (non-class) ``enum`` declarations and suggests using - ``enum class`` instead. - -- New :doc:`llvm-prefer-static-over-anonymous-namespace - ` check. - - Finds function and variable declarations inside anonymous namespace and - suggests replacing them with ``static`` declarations. - -- New :doc:`modernize-use-scoped-lock - ` check. - - Finds uses of ``std::lock_guard`` and suggests replacing them with C++17's - alternative ``std::scoped_lock``. - -- New :doc:`portability-avoid-pragma-once - ` check. - - Finds uses of ``#pragma once`` and suggests replacing them with standard - include guards (``#ifndef``/``#define``/``#endif``) for improved portability. - -- New :doc:`readability-ambiguous-smartptr-reset-call - ` check. - - Finds potentially erroneous calls to ``reset`` method on smart pointers when - the pointee type also has a ``reset`` method. - -- New :doc:`readability-use-concise-preprocessor-directives - ` check. - - Finds uses of ``#if`` that can be simplified to ``#ifdef`` or ``#ifndef`` and, - since C23 and C++23, uses of ``#elif`` that can be simplified to ``#elifdef`` - or ``#elifndef``. - New check aliases ^^^^^^^^^^^^^^^^^ Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ -- Improved :doc:`bugprone-crtp-constructor-accessibility - ` check by fixing - false positives on deleted constructors that cannot be used to construct - objects, even if they have public or protected access. - -- Improved :doc:`bugprone-exception-escape - ` check to print stack trace - of a potentially escaped exception. - -- Added an option to :doc:`bugprone-multi-level-implicit-pointer-conversion - ` to - choose whether to enable the check in C code or not. - -- Improved :doc:`bugprone-optional-value-conversion - ` check to detect - conversion in argument of ``std::make_optional``. 
- -- Improved :doc:`bugprone-sizeof-expression - ` check by adding - `WarnOnSizeOfInLoopTermination` option to detect misuses of ``sizeof`` - expression in loop conditions. - -- Improved :doc:`bugprone-string-constructor - ` check to find suspicious - calls of ``std::string`` constructor with char pointer, start position and - length parameters. - -- Improved :doc:`bugprone-unchecked-optional-access - ` fixing false - positives from smart pointer accessors repeated in checking ``has_value`` - and accessing ``value``. The option `IgnoreSmartPointerDereference` should - no longer be needed and will be removed. Also fixing false positive from - const reference accessors to objects containing optional member. - -- Improved :doc:`bugprone-unsafe-functions - ` check to allow specifying - additional C++ member functions to match. - -- Improved :doc:`cert-err33-c - ` check by fixing false positives when - a function name is just prefixed with a targeted function name. - -- Improved :doc:`concurrency-mt-unsafe - ` check by fixing a false positive - where ``strerror`` was flagged as MT-unsafe. - -- Improved :doc:`cppcoreguidelines-avoid-goto - ` check by adding the option - `IgnoreMacros` to ignore ``goto`` labels defined in macros. - -- Improved :doc:`cppcoreguidelines-interfaces-global-init - ` check by - fixing false positives on uses of ``constinit`` variables. - -- Improved :doc:`cppcoreguidelines-missing-std-forward - ` check by adding a - flag to specify the function used for forwarding instead of ``std::forward``. - -- Improved :doc:`cppcoreguidelines-pro-bounds-pointer-arithmetic - ` check by - fixing false positives when calling indexing operators that do not perform - pointer arithmetic in template, for example ``std::map::operator[]`` and - when pointer arithmetic was used through type aliases. - -- Improved :doc:`cppcoreguidelines-rvalue-reference-param-not-moved - ` check - by adding a flag to specify the function used for moving instead of - ``std::move``. 
- -- Improved :doc:`cppcoreguidelines-special-member-functions - ` check by - adding the option `IgnoreMacros` to ignore classes defined in macros. - -- Improved :doc:`google-readability-namespace-comments - ` check by adding - the option `AllowOmittingNamespaceComments` to accept if a namespace comment - is omitted entirely. - -- Improved :doc:`hicpp-avoid-goto - ` check by adding the option - `IgnoreMacros` to ignore ``goto`` labels defined in macros. - -- Improved :doc:`hicpp-special-member-functions - ` check by adding the - option `IgnoreMacros` to ignore classes defined in macros. - -- Improved :doc:`llvm-namespace-comment - ` check by adding the option - `AllowOmittingNamespaceComments` to accept if a namespace comment is omitted - entirely. - -- Improved :doc:`misc-const-correctness - ` check by adding the option - `AllowedTypes`, that excludes specified types from const-correctness - checking and fixing false positives when modifying variant by ``operator[]`` - with template in parameters and supporting to check pointee mutation by - `AnalyzePointers` option and fixing false positives when using const array - type. - -- Improved :doc:`misc-include-cleaner - ` check by adding the options - `UnusedIncludes` and `MissingIncludes`, which specify whether the check should - report unused or missing includes respectively. - -- Improved :doc:`misc-redundant-expression - ` check by providing additional - examples and fixing some macro related false positives. - -- Improved :doc:`misc-unconventional-assign-operator - ` check by fixing - false positives when copy assignment operator function in a template class - returns the result of another assignment to ``*this`` (``return *this=...``). - -- Improved :doc:`misc-unused-using-decls - ` check by fixing false positives - on ``operator""`` with template parameters. 
- -- Improved :doc:`misc-use-internal-linkage - ` check by fix false positives - for function or variable in header file which contains macro expansion and - excluding variables with ``thread_local`` storage class specifier from being - matched. - -- Improved :doc:`modernize-pass-by-value - ` check by fixing false positives - when class passed by const-reference had a private move constructor. - -- Improved :doc:`modernize-type-traits - ` check by detecting more type traits. - -- Improved :doc:`modernize-use-default-member-init - ` check by matching - arithmetic operations, ``constexpr`` and ``static`` values, and detecting - explicit casting of built-in types within member list initialization. - -- Improved :doc:`modernize-use-designated-initializers - ` check by avoiding - diagnosing designated initializers for ``std::array`` initializations. - -- Improved :doc:`modernize-use-ranges - ` check by updating suppress - warnings logic for ``nullptr`` in ``std::find``. - -- Improved :doc:`modernize-use-starts-ends-with - ` check by adding more - matched scenarios of ``find`` and ``rfind`` methods and fixing false - positives when those methods were called with 3 arguments. - -- Improved :doc:`modernize-use-std-numbers - ` check to support math - functions of different precisions. - -- Improved :doc:`modernize-use-trailing-return-type - ` check by adding - support to modernize lambda signatures to use trailing return type and adding - two new options: `TransformFunctions` and `TransformLambdas` to control - whether function declarations and lambdas should be transformed by the check. - Fixed false positives when lambda was matched as a function in C++11 mode. - -- Improved :doc:`performance-move-const-arg - ` check by fixing false - negatives on ternary operators calling ``std::move``. 
- -- Improved :doc:`performance-unnecessary-value-param - ` check performance by - tolerating fix-it breaking compilation when functions is used as pointers - to avoid matching usage of functions within the current compilation unit. - Added an option `IgnoreCoroutines` with the default value `true` to - suppress this check for coroutines where passing by reference may be unsafe. - -- Improved :doc:`readability-convert-member-functions-to-static - ` check by - fixing false positives on member functions with an explicit object parameter. - -- Improved :doc:`readability-function-size - ` check by adding new option - `CountMemberInitAsStmt` that allows counting class member initializers in - constructors as statements. - -- Improved :doc:`readability-math-missing-parentheses - ` check by fixing - false negatives where math expressions are the operand of assignment operators - or comparison operators. - -- Improved :doc:`readability-named-parameter - ` check by adding the option - `InsertPlainNamesInForwardDecls` to insert parameter names without comments - for forward declarations only. - -- Improved :doc:`readability-qualified-auto - ` check by adding the option - `AllowedTypes`, that excludes specified types from adding qualifiers. - -- Improved :doc:`readability-redundant-inline-specifier - ` check by fixing - false positives on out-of-line explicitly defaulted functions. - -- Improved :doc:`readability-redundant-smartptr-get - ` check by fixing - some false positives involving smart pointers to arrays. - Removed checks ^^^^^^^^^^^^^^ @@ -414,3 +128,4 @@ Improvements to pp-trace Clang-tidy Visual Studio plugin ------------------------------- + From 95639b75487895f7ef45c1468d4639f1544a1851 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 17 Jul 2025 07:29:27 -0700 Subject: [PATCH 185/813] [CI] Drop CLEAR_CACHE Support in monolithic-* scripts This patch drops support for clearing the cache with the CLEAR_CACHE environment variable. 
This is an artifact of the old infrastructure as we now do not persist the cache across builds, instead redownloading the cache directory everytime. This makes the scripts slightly simpler as we are no longer supporting unneeded functionality. Reviewers: Endilll, cmtice, dschuff, Keenuts, lnihlen, gburgessiv Reviewed By: Keenuts, cmtice Pull Request: https://github.com/llvm/llvm-project/pull/149193 --- .ci/monolithic-linux.sh | 5 ----- .ci/monolithic-windows.sh | 5 ----- 2 files changed, 10 deletions(-) diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 8d1faab13986c..303b430c28e3f 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -23,11 +23,6 @@ rm -rf "${BUILD_DIR}" ccache --zero-stats -if [[ -n "${CLEAR_CACHE:-}" ]]; then - echo "clearing cache" - ccache --clear -fi - mkdir -p artifacts/reproducers # Make sure any clang reproducers will end up as artifacts. diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 176350fac604c..50a741677d734 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -21,11 +21,6 @@ BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}" rm -rf "${BUILD_DIR}" -if [[ -n "${CLEAR_CACHE:-}" ]]; then - echo "clearing sccache" - rm -rf "$SCCACHE_DIR" -fi - sccache --zero-stats function at-exit { retcode=$? From ba5f31cfaa2452a4a94a482b53d899d6f2ee0e66 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 17 Jul 2025 14:29:25 +0000 Subject: [PATCH 186/813] [lldb][test] Disable TestChildCountTruncation on Windows This fails because it tells clang to use DWARF which link.exe then discards. The test may not need DWARF, but I'm going to confirm that in a follow up PR review. Test added by https://github.com/llvm/llvm-project/pull/149088. 
--- lldb/test/Shell/Settings/TestChildCountTruncation.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test index a96a0d8310eeb..2660ccae1aa5b 100644 --- a/lldb/test/Shell/Settings/TestChildCountTruncation.test +++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test @@ -1,6 +1,9 @@ # Test that we warn the user about truncated output # when target.max-children-count wasn't explicitly set. +# link.exe discards the DWARF information needed. +# UNSUPPORTED: system-windows + # RUN: split-file %s %t # RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ From 9fa3971fac27fbe0a6e3b9745d201c16f5f98bc2 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Thu, 17 Jul 2025 16:37:59 +0200 Subject: [PATCH 187/813] [DAGCombiner] Fold vector subtraction if above threshold to `umin` (#148834) This extends #134235 and #135194 to vectors. 
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 87 +++++++++++------- .../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 88 +++++++------------ llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll | 88 +++++++------------ 3 files changed, 122 insertions(+), 141 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0e8e4c9618bb2..40464e91f9efc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -609,6 +609,8 @@ namespace { SDValue foldABSToABD(SDNode *N, const SDLoc &DL); SDValue foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode CC, const SDLoc &DL); + SDValue foldSelectToUMin(SDValue LHS, SDValue RHS, SDValue True, + SDValue False, ISD::CondCode CC, const SDLoc &DL); SDValue unfoldMaskedMerge(SDNode *N); SDValue unfoldExtremeBitClearingToShifts(SDNode *N); SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, @@ -859,7 +861,7 @@ namespace { auto LK = TLI.getTypeConversion(*DAG.getContext(), VT); return (LK.first == TargetLoweringBase::TypeLegal || LK.first == TargetLoweringBase::TypePromoteInteger) && - TLI.isOperationLegal(ISD::UMIN, LK.second); + TLI.isOperationLegalOrCustom(ISD::UMIN, LK.second); } public: @@ -4093,6 +4095,26 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { return N0; } + // (sub x, ([v]select (ult x, y), 0, y)) -> (umin x, (sub x, y)) + // (sub x, ([v]select (uge x, y), y, 0)) -> (umin x, (sub x, y)) + if (N1.hasOneUse() && hasUMin(VT)) { + SDValue Y; + if (sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETULT)), + m_Zero(), m_Deferred(Y))) || + sd_match(N1, m_Select(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETUGE)), + m_Deferred(Y), m_Zero())) || + sd_match(N1, m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETULT)), + m_Zero(), m_Deferred(Y))) || + sd_match(N1, 
m_VSelect(m_SetCC(m_Specific(N0), m_Value(Y), + m_SpecificCondCode(ISD::SETUGE)), + m_Deferred(Y), m_Zero()))) + return DAG.getNode(ISD::UMIN, DL, VT, N0, + DAG.getNode(ISD::SUB, DL, VT, N0, Y)); + } + if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; @@ -4442,20 +4464,6 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { sd_match(N1, m_UMaxLike(m_Specific(A), m_Specific(B)))) return DAG.getNegative(DAG.getNode(ISD::ABDU, DL, VT, A, B), DL, VT); - // (sub x, (select (ult x, y), 0, y)) -> (umin x, (sub x, y)) - // (sub x, (select (uge x, y), y, 0)) -> (umin x, (sub x, y)) - if (hasUMin(VT)) { - SDValue Y; - if (sd_match(N1, m_OneUse(m_Select(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETULT)), - m_Zero(), m_Deferred(Y)))) || - sd_match(N1, m_OneUse(m_Select(m_SetCC(m_Specific(N0), m_Value(Y), - m_SpecificCondCode(ISD::SETUGE)), - m_Deferred(Y), m_Zero())))) - return DAG.getNode(ISD::UMIN, DL, VT, N0, - DAG.getNode(ISD::SUB, DL, VT, N0, Y)); - } - return SDValue(); } @@ -12173,6 +12181,30 @@ SDValue DAGCombiner::foldSelectToABD(SDValue LHS, SDValue RHS, SDValue True, return SDValue(); } +// ([v]select (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) +// ([v]select (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) +SDValue DAGCombiner::foldSelectToUMin(SDValue LHS, SDValue RHS, SDValue True, + SDValue False, ISD::CondCode CC, + const SDLoc &DL) { + APInt C; + EVT VT = True.getValueType(); + if (sd_match(RHS, m_ConstInt(C)) && hasUMin(VT)) { + if (CC == ISD::SETUGT && LHS == False && + sd_match(True, m_Add(m_Specific(False), m_SpecificInt(~C)))) { + SDValue AddC = DAG.getConstant(~C, DL, VT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, False, AddC); + return DAG.getNode(ISD::UMIN, DL, VT, Add, False); + } + if (CC == ISD::SETULT && LHS == True && + sd_match(False, m_Add(m_Specific(True), m_SpecificInt(-C)))) { + SDValue AddC = DAG.getConstant(-C, DL, VT); + SDValue Add = DAG.getNode(ISD::ADD, DL, VT, True, AddC); + return 
DAG.getNode(ISD::UMIN, DL, VT, True, Add); + } + } + return SDValue(); +} + SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -12358,24 +12390,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) { // (select (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) // (select (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) - APInt C; - if (sd_match(Cond1, m_ConstInt(C)) && hasUMin(VT)) { - if (CC == ISD::SETUGT && Cond0 == N2 && - sd_match(N1, m_Add(m_Specific(N2), m_SpecificInt(~C)))) { - // The resulting code relies on an unsigned wrap in ADD. - // Recreating ADD to drop possible nuw/nsw flags. - SDValue AddC = DAG.getConstant(~C, DL, VT); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N2, AddC); - return DAG.getNode(ISD::UMIN, DL, VT, Add, N2); - } - if (CC == ISD::SETULT && Cond0 == N1 && - sd_match(N2, m_Add(m_Specific(N1), m_SpecificInt(-C)))) { - // Ditto. - SDValue AddC = DAG.getConstant(-C, DL, VT); - SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, AddC); - return DAG.getNode(ISD::UMIN, DL, VT, N1, Add); - } - } + if (SDValue UMin = foldSelectToUMin(Cond0, Cond1, N1, N2, CC, DL)) + return UMin; } if (!VT.isVector()) @@ -13412,6 +13428,11 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) { } } } + + // (vselect (ugt x, C), (add x, ~C), x) -> (umin (add x, ~C), x) + // (vselect (ult x, C), x, (add x, -C)) -> (umin x, (add x, -C)) + if (SDValue UMin = foldSelectToUMin(LHS, RHS, N1, N2, CC, DL)) + return UMin; } if (SimplifySelectOps(N, N1, N2)) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 35b9457fbc1ff..9df71cfc96cc7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -5712,9 +5712,8 @@ define <8 x i8> @vsub_if_uge_v8i8(<8 x i8> %va, <8 x i8> %vb) { ; CHECK-LABEL: vsub_if_uge_v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: 
vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <8 x i8> %va, %vb %select = select <8 x i1> %cmp, <8 x i8> zeroinitializer, <8 x i8> %vb @@ -5725,9 +5724,9 @@ define <8 x i8> @vsub_if_uge_v8i8(<8 x i8> %va, <8 x i8> %vb) { define <8 x i8> @vsub_if_uge_swapped_v8i8(<8 x i8> %va, <8 x i8> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <8 x i8> %va, %vb %select = select <8 x i1> %cmp, <8 x i8> %vb, <8 x i8> zeroinitializer @@ -5739,9 +5738,8 @@ define <8 x i16> @vsub_if_uge_v8i16(<8 x i16> %va, <8 x i16> %vb) { ; CHECK-LABEL: vsub_if_uge_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <8 x i16> %va, %vb %select = select <8 x i1> %cmp, <8 x i16> zeroinitializer, <8 x i16> %vb @@ -5752,9 +5750,9 @@ define <8 x i16> @vsub_if_uge_v8i16(<8 x i16> %va, <8 x i16> %vb) { define <8 x i16> @vsub_if_uge_swapped_v8i16(<8 x i16> %va, <8 x i16> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <8 x i16> %va, %vb %select = select <8 x i1> %cmp, <8 x i16> %vb, <8 x i16> zeroinitializer @@ -5766,9 +5764,8 @@ define <4 x i32> @vsub_if_uge_v4i32(<4 x i32> %va, <4 x i32> %vb) { ; CHECK-LABEL: 
vsub_if_uge_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <4 x i32> %va, %vb %select = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %vb @@ -5779,9 +5776,9 @@ define <4 x i32> @vsub_if_uge_v4i32(<4 x i32> %va, <4 x i32> %vb) { define <4 x i32> @vsub_if_uge_swapped_v4i32(<4 x i32> %va, <4 x i32> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <4 x i32> %va, %vb %select = select <4 x i1> %cmp, <4 x i32> %vb, <4 x i32> zeroinitializer @@ -5793,9 +5790,8 @@ define <2 x i64> @vsub_if_uge_v2i64(<2 x i64> %va, <2 x i64> %vb) { ; CHECK-LABEL: vsub_if_uge_v2i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vmsltu.vv v0, v8, v9 ; CHECK-NEXT: vsub.vv v9, v8, v9 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <2 x i64> %va, %vb %select = select <2 x i1> %cmp, <2 x i64> zeroinitializer, <2 x i64> %vb @@ -5806,9 +5802,9 @@ define <2 x i64> @vsub_if_uge_v2i64(<2 x i64> %va, <2 x i64> %vb) { define <2 x i64> @vsub_if_uge_swapped_v2i64(<2 x i64> %va, <2 x i64> %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge <2 x i64> %va, %vb %select = select <2 x i1> %cmp, <2 x i64> %vb, <2 x i64> 
zeroinitializer @@ -5819,9 +5815,9 @@ define <2 x i64> @vsub_if_uge_swapped_v2i64(<2 x i64> %va, <2 x i64> %vb) { define <8 x i8> @sub_if_uge_C_v8i8(<8 x i8> %x) { ; CHECK-LABEL: sub_if_uge_C_v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vmsgtu.vi v0, v8, 12 -; CHECK-NEXT: vadd.vi v8, v8, -13, v0.t +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vadd.vi v9, v8, -13 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt <8 x i8> %x, splat (i8 12) %sub = add <8 x i8> %x, splat (i8 -13) @@ -5832,11 +5828,10 @@ define <8 x i8> @sub_if_uge_C_v8i8(<8 x i8> %x) { define <8 x i16> @sub_if_uge_C_v8i16(<8 x i16> %x) { ; CHECK-LABEL: sub_if_uge_C_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 2000 -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vmsgtu.vx v0, v8, a0 ; CHECK-NEXT: li a0, -2001 -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt <8 x i16> %x, splat (i16 2000) %sub = add <8 x i16> %x, splat (i16 -2001) @@ -5847,13 +5842,11 @@ define <8 x i16> @sub_if_uge_C_v8i16(<8 x i16> %x) { define <4 x i32> @sub_if_uge_C_v4i32(<4 x i32> %x) { ; CHECK-LABEL: sub_if_uge_C_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -16 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmsgtu.vx v0, v8, a0 ; CHECK-NEXT: lui a0, 1048560 ; CHECK-NEXT: addi a0, a0, 15 -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt <4 x i32> %x, splat (i32 65520) %sub = add <4 x i32> %x, splat (i32 -65521) @@ -5864,14 +5857,11 @@ define <4 x i32> @sub_if_uge_C_v4i32(<4 x i32> %x) { define <4 x i32> @sub_if_uge_C_swapped_v4i32(<4 x i32> %x) { ; CHECK-LABEL: sub_if_uge_C_swapped_v4i32: ; CHECK: # %bb.0: 
-; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -15 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: lui a0, 1048560 ; CHECK-NEXT: addi a0, a0, 15 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vadd.vx v9, v8, a0 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult <4 x i32> %x, splat (i32 65521) %sub = add <4 x i32> %x, splat (i32 -65521) @@ -5883,38 +5873,28 @@ define <2 x i64> @sub_if_uge_C_v2i64(<2 x i64> %x) nounwind { ; RV32-LABEL: sub_if_uge_C_v2i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: li a0, 1 -; RV32-NEXT: lui a1, 172127 -; RV32-NEXT: mv a2, sp -; RV32-NEXT: addi a1, a1, 512 -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: sw a0, 4(sp) ; RV32-NEXT: li a0, -2 -; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV32-NEXT: vlse64.v v9, (a2), zero ; RV32-NEXT: lui a1, 876449 ; RV32-NEXT: addi a1, a1, -513 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vmsltu.vv v0, v9, v8 -; RV32-NEXT: vadd.vv v8, v8, v10, v0.t +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v9, (a0), zero +; RV32-NEXT: vadd.vv v9, v8, v9 +; RV32-NEXT: vminu.vv v8, v9, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: sub_if_uge_C_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, 2384 -; RV64-NEXT: addi a0, a0, 761 -; RV64-NEXT: slli a0, a0, 9 -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; RV64-NEXT: vmsgtu.vx v0, v8, a0 ; RV64-NEXT: lui a0, 1048278 ; RV64-NEXT: addi a0, a0, -95 ; RV64-NEXT: slli a0, a0, 12 ; RV64-NEXT: addi a0, a0, -513 -; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vadd.vx v9, v8, a0 +; RV64-NEXT: vminu.vv v8, v9, v8 ; RV64-NEXT: ret %cmp = icmp ugt <2 x i64> %x, splat (i64 5000000000) %sub = add <2 x i64> %x, splat (i64 -5000000001) diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll index a21a526e00ec8..9b58cb3d5c891 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vminu-sdnode.ll @@ -898,9 +898,8 @@ define @vsub_if_uge_nxv2i8( %va, %va, %vb %select = select %cmp, zeroinitializer, %vb @@ -911,9 +910,9 @@ define @vsub_if_uge_nxv2i8( %va, @vsub_if_uge_swapped_nxv2i8( %va, %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge %va, %vb %select = select %cmp, %vb, zeroinitializer @@ -925,9 +924,8 @@ define @vsub_if_uge_nxv2i16( %va, %va, %vb %select = select %cmp, zeroinitializer, %vb @@ -938,9 +936,9 @@ define @vsub_if_uge_nxv2i16( %va, @vsub_if_uge_swapped_nxv2i16( %va, %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge %va, %vb %select = select %cmp, %vb, zeroinitializer @@ -952,9 +950,8 @@ define @vsub_if_uge_nxv2i32( %va, %va, %vb %select = select %cmp, zeroinitializer, %vb @@ -965,9 +962,9 @@ define @vsub_if_uge_nxv2i32( %va, @vsub_if_uge_swapped_nxv2i32( %va, %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v9, v8 -; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t +; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma +; CHECK-NEXT: vsub.vv v9, v8, v9 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp uge %va, %vb %select = select %cmp, 
%vb, zeroinitializer @@ -979,9 +976,8 @@ define @vsub_if_uge_nxv2i64( %va, %va, %vb %select = select %cmp, zeroinitializer, %vb @@ -992,9 +988,9 @@ define @vsub_if_uge_nxv2i64( %va, @vsub_if_uge_swapped_nxv2i64( %va, %vb) { ; CHECK-LABEL: vsub_if_uge_swapped_nxv2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, mu -; CHECK-NEXT: vmsleu.vv v0, v10, v8 -; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vsub.vv v10, v8, v10 +; CHECK-NEXT: vminu.vv v8, v8, v10 ; CHECK-NEXT: ret %cmp = icmp uge %va, %vb %select = select %cmp, %vb, zeroinitializer @@ -1005,9 +1001,9 @@ define @vsub_if_uge_swapped_nxv2i64( %va, < define @sub_if_uge_C_nxv2i8( %x) { ; CHECK-LABEL: sub_if_uge_C_nxv2i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, mu -; CHECK-NEXT: vmsgtu.vi v0, v8, 12 -; CHECK-NEXT: vadd.vi v8, v8, -13, v0.t +; CHECK-NEXT: vsetvli a0, zero, e8, mf4, ta, ma +; CHECK-NEXT: vadd.vi v9, v8, -13 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt %x, splat (i8 12) %sub = add %x, splat (i8 -13) @@ -1018,11 +1014,10 @@ define @sub_if_uge_C_nxv2i8( %x) { define @sub_if_uge_C_nxv2i16( %x) { ; CHECK-LABEL: sub_if_uge_C_nxv2i16: ; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 2000 -; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, mu -; CHECK-NEXT: vmsgtu.vx v0, v8, a0 ; CHECK-NEXT: li a0, -2001 -; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt %x, splat (i16 2000) %sub = add %x, splat (i16 -2001) @@ -1033,13 +1028,11 @@ define @sub_if_uge_C_nxv2i16( %x) { define @sub_if_uge_C_nxv2i32( %x) { ; CHECK-LABEL: sub_if_uge_C_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -16 -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, mu -; CHECK-NEXT: vmsgtu.vx v0, v8, a0 ; CHECK-NEXT: lui a0, 1048560 ; CHECK-NEXT: addi a0, a0, 15 -; 
CHECK-NEXT: vadd.vx v8, v8, a0, v0.t +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vadd.vx v9, v8, a0 +; CHECK-NEXT: vminu.vv v8, v9, v8 ; CHECK-NEXT: ret %cmp = icmp ugt %x, splat (i32 65520) %sub = add %x, splat (i32 -65521) @@ -1050,14 +1043,11 @@ define @sub_if_uge_C_nxv2i32( %x) { define @sub_if_uge_C_swapped_nxv2i32( %x) { ; CHECK-LABEL: sub_if_uge_C_swapped_nxv2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addi a0, a0, -15 -; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma -; CHECK-NEXT: vmsltu.vx v0, v8, a0 ; CHECK-NEXT: lui a0, 1048560 ; CHECK-NEXT: addi a0, a0, 15 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma ; CHECK-NEXT: vadd.vx v9, v8, a0 -; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vminu.vv v8, v8, v9 ; CHECK-NEXT: ret %cmp = icmp ult %x, splat (i32 65521) %sub = add %x, splat (i32 -65521) @@ -1069,38 +1059,28 @@ define @sub_if_uge_C_nxv2i64( %x) nounwind ; RV32-LABEL: sub_if_uge_C_nxv2i64: ; RV32: # %bb.0: ; RV32-NEXT: addi sp, sp, -16 -; RV32-NEXT: li a0, 1 -; RV32-NEXT: lui a1, 172127 -; RV32-NEXT: mv a2, sp -; RV32-NEXT: addi a1, a1, 512 -; RV32-NEXT: sw a1, 0(sp) -; RV32-NEXT: sw a0, 4(sp) ; RV32-NEXT: li a0, -2 -; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, mu -; RV32-NEXT: vlse64.v v10, (a2), zero ; RV32-NEXT: lui a1, 876449 ; RV32-NEXT: addi a1, a1, -513 ; RV32-NEXT: sw a1, 8(sp) ; RV32-NEXT: sw a0, 12(sp) ; RV32-NEXT: addi a0, sp, 8 -; RV32-NEXT: vlse64.v v12, (a0), zero -; RV32-NEXT: vmsltu.vv v0, v10, v8 -; RV32-NEXT: vadd.vv v8, v8, v12, v0.t +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vadd.vv v10, v8, v10 +; RV32-NEXT: vminu.vv v8, v10, v8 ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: sub_if_uge_C_nxv2i64: ; RV64: # %bb.0: -; RV64-NEXT: lui a0, 2384 -; RV64-NEXT: addi a0, a0, 761 -; RV64-NEXT: slli a0, a0, 9 -; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, mu -; RV64-NEXT: vmsgtu.vx v0, v8, a0 ; RV64-NEXT: lui a0, 1048278 ; 
RV64-NEXT: addi a0, a0, -95 ; RV64-NEXT: slli a0, a0, 12 ; RV64-NEXT: addi a0, a0, -513 -; RV64-NEXT: vadd.vx v8, v8, a0, v0.t +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64-NEXT: vadd.vx v10, v8, a0 +; RV64-NEXT: vminu.vv v8, v10, v8 ; RV64-NEXT: ret %cmp = icmp ugt %x, splat (i64 5000000000) %sub = add %x, splat (i64 -5000000001) From 8f18dde6c0b38a67ad0f06aab79cdadb78b35d33 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 17 Jul 2025 07:32:59 -0700 Subject: [PATCH 188/813] [RISCV][IA] Rearrange code for readability and ease of merge [nfc] --- .../Target/RISCV/RISCVInterleavedAccess.cpp | 66 +++++++++---------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 8fb6ccaac2c9a..0d4f24172b574 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -69,6 +69,39 @@ static const Intrinsic::ID ScalableVlsegIntrIds[] = { Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask, Intrinsic::riscv_vlseg8_mask}; +static const Intrinsic::ID FixedVssegIntrIds[] = { + Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, + Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, + Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, + Intrinsic::riscv_seg8_store_mask}; + +static const Intrinsic::ID ScalableVssegIntrIds[] = { + Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, + Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, + Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, + Intrinsic::riscv_vsseg8_mask}; + +static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { + assert(N); + if (N == 1) + return true; + + using namespace PatternMatch; + // Right now we're only recognizing the simplest pattern. 
+ uint64_t C; + if (match(V, m_CombineOr(m_ConstantInt(C), + m_NUWMul(m_Value(), m_ConstantInt(C)))) && + C && C % N == 0) + return true; + + if (isPowerOf2_32(N)) { + KnownBits KB = llvm::computeKnownBits(V, DL); + return KB.countMinTrailingZeros() >= Log2_32(N); + } + + return false; +} + /// Lower an interleaved load into a vlsegN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -134,18 +167,6 @@ bool RISCVTargetLowering::lowerInterleavedLoad( return true; } -static const Intrinsic::ID FixedVssegIntrIds[] = { - Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask, - Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask, - Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask, - Intrinsic::riscv_seg8_store_mask}; - -static const Intrinsic::ID ScalableVssegIntrIds[] = { - Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask, - Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask, - Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask, - Intrinsic::riscv_vsseg8_mask}; - /// Lower an interleaved store into a vssegN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): @@ -235,27 +256,6 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, return true; } -static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { - assert(N); - if (N == 1) - return true; - - using namespace PatternMatch; - // Right now we're only recognizing the simplest pattern. 
- uint64_t C; - if (match(V, m_CombineOr(m_ConstantInt(C), - m_NUWMul(m_Value(), m_ConstantInt(C)))) && - C && C % N == 0) - return true; - - if (isPowerOf2_32(N)) { - KnownBits KB = llvm::computeKnownBits(V, DL); - return KB.countMinTrailingZeros() >= Log2_32(N); - } - - return false; -} - bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( Instruction *Load, Value *Mask, IntrinsicInst *DI) const { const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); From 46357438baefbdcf630abc5d74565afcbf1c48dd Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 17 Jul 2025 16:47:54 +0200 Subject: [PATCH 189/813] [SCEV] Try to re-use existing LCSSA phis when expanding SCEVAddRecExpr. (#147214) If an AddRec is expanded outside a loop with a single exit block, check if any of the (lcssa) phi nodes in the exit block match the AddRec. If that's the case, simply use the existing lcssa phi. This can reduce the number of instruction created for SCEV expansions, mainly for runtime checks generated by the loop vectorizer. 
Compile-time impact should be mostly neutral https://llvm-compile-time-tracker.com/compare.php?from=48c7a3187f9831304a38df9bdb3b4d5bf6b6b1a2&to=cf9d039a7b0db5d0d912e0e2c01b19c2a653273a&stat=instructions:u PR: https://github.com/llvm/llvm-project/pull/147214 --- .../Utils/ScalarEvolutionExpander.h | 1 + .../Utils/ScalarEvolutionExpander.cpp | 23 +++++++++++++++++++ .../reuse-lcssa-phi-scev-expansion.ll | 4 +--- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index a101151eed7cc..39fef921a9590 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -530,6 +530,7 @@ class SCEVExpander : public SCEVVisitor { bool isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV, const Loop *L); + Value *tryToReuseLCSSAPhi(const SCEVAddRecExpr *S); Value *expandAddRecExprLiterally(const SCEVAddRecExpr *); PHINode *getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized, const Loop *L, Type *&TruncTy, diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 739ac00ba47c5..ed08c0bfa2e7d 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1223,6 +1223,24 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { return Result; } +Value *SCEVExpander::tryToReuseLCSSAPhi(const SCEVAddRecExpr *S) { + const Loop *L = S->getLoop(); + BasicBlock *EB = L->getExitBlock(); + if (!EB || !EB->getSinglePredecessor() || + !SE.DT.dominates(EB, Builder.GetInsertBlock())) + return nullptr; + + for (auto &PN : EB->phis()) { + if (!SE.isSCEVable(PN.getType()) || PN.getType() != S->getType()) + continue; + auto *ExitV = SE.getSCEV(&PN); + if (S == ExitV) + return &PN; + } + + return nullptr; +} + Value 
*SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { // In canonical mode we compute the addrec as an expression of a canonical IV // using evaluateAtIteration and expand the resulting SCEV expression. This @@ -1262,6 +1280,11 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { return V; } + // If S is expanded outside the defining loop, check if there is a + // matching LCSSA phi node for it. + if (Value *V = tryToReuseLCSSAPhi(S)) + return V; + // {X,+,F} --> X + {0,+,F} if (!S->getStart()->isZero()) { if (isa(S->getType())) { diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index 2747895f06a7b..ce4270dc4b7fa 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -18,11 +18,9 @@ define void @reuse_lcssa_phi_for_add_rec1(ptr %head) { ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: br i1 [[EC_1]], label %[[PH:.*]], label %[[LOOP_1]] ; CHECK: [[PH]]: -; CHECK-NEXT: [[IV_2_LCSSA:%.*]] = phi i32 [ [[IV_2]], %[[LOOP_1]] ] ; CHECK-NEXT: [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP_1]] ] -; CHECK-NEXT: [[IV_2_NEXT_LCSSA:%.*]] = phi i32 [ [[IV_2_NEXT]], %[[LOOP_1]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[IV_2_NEXT]], %[[LOOP_1]] ] ; CHECK-NEXT: [[SRC_2:%.*]] = tail call noalias noundef dereferenceable_or_null(8) ptr @calloc(i64 1, i64 8) -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[IV_2_LCSSA]], 1 ; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[TMP0]], i32 1) ; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] ; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 From 7817163663b3bb662a46a73cf1903ec900ba6146 Mon Sep 17 00:00:00 2001 From: Jeremy Kun Date: Thu, 17 Jul 2025 08:01:58 -0700 Subject: [PATCH 190/813] [mlir] [presburger] Add IntegerRelation::rangeProduct (#148092) This is intended to match `isl::map`'s 
`flat_range_product`. --------- Co-authored-by: Jeremy Kun --- .../Analysis/Presburger/IntegerRelation.h | 13 +++ .../Analysis/Presburger/IntegerRelation.cpp | 38 ++++++++ .../Presburger/IntegerRelationTest.cpp | 94 +++++++++++++++++++ 3 files changed, 145 insertions(+) diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h index b68262f09f485..ee401cca8f552 100644 --- a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h @@ -707,6 +707,19 @@ class IntegerRelation { /// this for uniformity with `applyDomain`. void applyRange(const IntegerRelation &rel); + /// Let the relation `this` be R1, and the relation `rel` be R2. Requires + /// R1 and R2 to have the same domain. + /// + /// Let R3 be the rangeProduct of R1 and R2. Then x R3 (y, z) iff + /// (x R1 y and x R2 z). + /// + /// Example: + /// + /// R1: (i, j) -> k : f(i, j, k) = 0 + /// R2: (i, j) -> l : g(i, j, l) = 0 + /// R1.rangeProduct(R2): (i, j) -> (k, l) : f(i, j, k) = 0 and g(i, j, l) = 0 + IntegerRelation rangeProduct(const IntegerRelation &rel); + /// Given a relation `other: (A -> B)`, this operation merges the symbol and /// local variables and then takes the composition of `other` on `this: (B -> /// C)`. The resulting relation represents tuples of the form: `A -> C`. 
diff --git a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp index 17e48e0d069b7..5c4d4d13580a0 100644 --- a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp @@ -2481,6 +2481,44 @@ void IntegerRelation::applyDomain(const IntegerRelation &rel) { void IntegerRelation::applyRange(const IntegerRelation &rel) { compose(rel); } +IntegerRelation IntegerRelation::rangeProduct(const IntegerRelation &rel) { + /// R1: (i, j) -> k : f(i, j, k) = 0 + /// R2: (i, j) -> l : g(i, j, l) = 0 + /// R1.rangeProduct(R2): (i, j) -> (k, l) : f(i, j, k) = 0 and g(i, j, l) = 0 + assert(getNumDomainVars() == rel.getNumDomainVars() && + "Range product is only defined for relations with equal domains"); + + // explicit copy of `this` + IntegerRelation result = *this; + unsigned relRangeVarStart = rel.getVarKindOffset(VarKind::Range); + unsigned numThisRangeVars = getNumRangeVars(); + unsigned numNewSymbolVars = result.getNumSymbolVars() - getNumSymbolVars(); + + result.appendVar(VarKind::Range, rel.getNumRangeVars()); + + // Copy each equality from `rel` and update the copy to account for range + // variables from `this`. The `rel` equality is a list of coefficients of the + // variables from `rel`, and so the range variables need to be shifted right + // by the number of `this` range variables and symbols. 
+ for (unsigned i = 0; i < rel.getNumEqualities(); ++i) { + SmallVector copy = + SmallVector(rel.getEquality(i)); + copy.insert(copy.begin() + relRangeVarStart, + numThisRangeVars + numNewSymbolVars, DynamicAPInt(0)); + result.addEquality(copy); + } + + for (unsigned i = 0; i < rel.getNumInequalities(); ++i) { + SmallVector copy = + SmallVector(rel.getInequality(i)); + copy.insert(copy.begin() + relRangeVarStart, + numThisRangeVars + numNewSymbolVars, DynamicAPInt(0)); + result.addInequality(copy); + } + + return result; +} + void IntegerRelation::printSpace(raw_ostream &os) const { space.print(os); os << getNumConstraints() << " constraints\n"; diff --git a/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp index 7df500bc9568a..dd0b09f7f05d2 100644 --- a/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp +++ b/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp @@ -608,3 +608,97 @@ TEST(IntegerRelationTest, convertVarKindToLocal) { EXPECT_EQ(space.getId(VarKind::Symbol, 0), Identifier(&identifiers[3])); EXPECT_EQ(space.getId(VarKind::Symbol, 1), Identifier(&identifiers[4])); } + +TEST(IntegerRelationTest, rangeProduct) { + IntegerRelation r1 = parseRelationFromSet( + "(i, j, k) : (2*i + 3*k == 0, i >= 0, j >= 0, k >= 0)", 2); + IntegerRelation r2 = parseRelationFromSet( + "(i, j, l) : (4*i + 6*j + 9*l == 0, i >= 0, j >= 0, l >= 0)", 2); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, j, k, l) : (2*i + 3*k == 0, 4*i + 6*j + 9*l == " + "0, i >= 0, j >= 0, k >= 0, l >= 0)", + 2); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductMultdimRange) { + IntegerRelation r1 = + parseRelationFromSet("(i, k) : (2*i + 3*k == 0, i >= 0, k >= 0)", 1); + IntegerRelation r2 = parseRelationFromSet( + "(i, l, m) : (4*i + 6*m + 9*l == 0, i >= 0, l >= 0, m >= 0)", 1); + + IntegerRelation rangeProd = 
r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, k, l, m) : (2*i + 3*k == 0, 4*i + 6*m + 9*l == " + "0, i >= 0, k >= 0, l >= 0, m >= 0)", + 1); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductMultdimRangeSwapped) { + IntegerRelation r1 = parseRelationFromSet( + "(i, l, m) : (4*i + 6*m + 9*l == 0, i >= 0, l >= 0, m >= 0)", 1); + IntegerRelation r2 = + parseRelationFromSet("(i, k) : (2*i + 3*k == 0, i >= 0, k >= 0)", 1); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, l, m, k) : (2*i + 3*k == 0, 4*i + 6*m + 9*l == " + "0, i >= 0, k >= 0, l >= 0, m >= 0)", + 1); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductEmptyDomain) { + IntegerRelation r1 = + parseRelationFromSet("(i, j) : (4*i + 9*j == 0, i >= 0, j >= 0)", 0); + IntegerRelation r2 = + parseRelationFromSet("(k, l) : (2*k + 3*l == 0, k >= 0, l >= 0)", 0); + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, j, k, l) : (2*k + 3*l == 0, 4*i + 9*j == " + "0, i >= 0, j >= 0, k >= 0, l >= 0)", + 0); + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductEmptyRange) { + IntegerRelation r1 = + parseRelationFromSet("(i, j) : (4*i + 9*j == 0, i >= 0, j >= 0)", 2); + IntegerRelation r2 = + parseRelationFromSet("(i, j) : (2*i + 3*j == 0, i >= 0, j >= 0)", 2); + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = + parseRelationFromSet("(i, j) : (2*i + 3*j == 0, 4*i + 9*j == " + "0, i >= 0, j >= 0)", + 2); + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductEmptyDomainAndRange) { + IntegerRelation r1 = parseRelationFromSet("() : ()", 0); + IntegerRelation r2 = parseRelationFromSet("() : ()", 0); + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = parseRelationFromSet("() : ()", 
0); + EXPECT_TRUE(expected.isEqual(rangeProd)); +} + +TEST(IntegerRelationTest, rangeProductSymbols) { + IntegerRelation r1 = parseRelationFromSet( + "(i, j)[s] : (2*i + 3*j + s == 0, i >= 0, j >= 0)", 1); + IntegerRelation r2 = parseRelationFromSet( + "(i, l)[s] : (3*i + 4*l + s == 0, i >= 0, l >= 0)", 1); + + IntegerRelation rangeProd = r1.rangeProduct(r2); + IntegerRelation expected = parseRelationFromSet( + "(i, j, l)[s] : (2*i + 3*j + s == 0, 3*i + 4*l + s == " + "0, i >= 0, j >= 0, l >= 0)", + 1); + + EXPECT_TRUE(expected.isEqual(rangeProd)); +} From 4bf82aebc0da985cf6b2e70812714875e8fa78fa Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 17 Jul 2025 16:06:03 +0100 Subject: [PATCH 191/813] [lldb][test] Fix TestChildCountTruncation on Windows (#149322) By not forcing the DWARF debug info format. When left as the default, the tests pass. Test added by https://github.com/llvm/llvm-project/pull/149088. --- lldb/test/Shell/Settings/TestChildCountTruncation.test | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lldb/test/Shell/Settings/TestChildCountTruncation.test b/lldb/test/Shell/Settings/TestChildCountTruncation.test index 2660ccae1aa5b..da6436cb5ca20 100644 --- a/lldb/test/Shell/Settings/TestChildCountTruncation.test +++ b/lldb/test/Shell/Settings/TestChildCountTruncation.test @@ -1,11 +1,8 @@ # Test that we warn the user about truncated output # when target.max-children-count wasn't explicitly set. -# link.exe discards the DWARF information needed. 
-# UNSUPPORTED: system-windows - # RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clang_host -g %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=DWIM # From 149aa7679457e4c434374076fa3ad6d02efbe414 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Thu, 17 Jul 2025 18:38:05 +0300 Subject: [PATCH 192/813] [clang-tools-extra][NFC] Fix link to code review in README.txt (#148384) --- clang-tools-extra/README.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/README.txt b/clang-tools-extra/README.txt index 6891e4078997f..1195db9b468dd 100644 --- a/clang-tools-extra/README.txt +++ b/clang-tools-extra/README.txt @@ -8,12 +8,13 @@ Clang frontend. These tools are kept in a separate "extra" repository to allow lighter weight checkouts of the core Clang codebase. All discussion regarding Clang, Clang-based tools, and code in this repository -should be held using the standard Clang forum: +should be held using the standard Clang forums: https://discourse.llvm.org/c/clang + https://discourse.llvm.org/c/clang/clang-tidy/71 + https://discourse.llvm.org/c/clang/clangd/34 -Code review for this tree should take place on the standard Clang patch and -commit lists: - http://lists.llvm.org/mailman/listinfo/cfe-commits +Code review for this tree should take place on Github: + https://github.com/llvm/llvm-project/pulls?q=label%3Aclang-tools-extra If you find a bug in these tools, please file it in the LLVM bug tracker: https://github.com/llvm/llvm-project/issues/ From 84d65e9d19ab577027238d38d053e293ba656e32 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Thu, 17 Jul 2025 18:00:32 +0200 Subject: [PATCH 193/813] [CIR] Upstream builtin_conj for ComplexType (#149170) This change adds support for builtin_conj for ComplexType https://github.com/llvm/llvm-project/issues/141365 --- clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 14 +++++++- 
clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 3 +- clang/lib/CIR/CodeGen/CIRGenValue.h | 6 ++++ clang/test/CIR/CodeGen/complex-builtins.cpp | 36 +++++++++++++++++++++ 4 files changed, 56 insertions(+), 3 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index 476f994959285..61d1c54ee9ec9 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -125,7 +125,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, mlir::Value real = emitScalarExpr(e->getArg(0)); mlir::Value imag = emitScalarExpr(e->getArg(1)); mlir::Value complex = builder.createComplexCreate(loc, real, imag); - return RValue::get(complex); + return RValue::getComplex(complex); } case Builtin::BI__builtin_creal: @@ -150,6 +150,18 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return RValue::get(imag); } + case Builtin::BI__builtin_conj: + case Builtin::BI__builtin_conjf: + case Builtin::BI__builtin_conjl: + case Builtin::BIconj: + case Builtin::BIconjf: + case Builtin::BIconjl: { + mlir::Value complex = emitComplexExpr(e->getArg(0)); + mlir::Value conj = builder.createUnaryOp(getLoc(e->getExprLoc()), + cir::UnaryOpKind::Not, complex); + return RValue::getComplex(conj); + } + case Builtin::BI__builtin_clrsb: case Builtin::BI__builtin_clrsbl: case Builtin::BI__builtin_clrsbll: diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 6663f5ea1e758..9f36be5397ad8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -231,8 +231,7 @@ mlir::Value ComplexExprEmitter::VisitBinComma(const BinaryOperator *e) { mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) { if (e->getCallReturnType(cgf.getContext())->isReferenceType()) return emitLoadOfLValue(e); - - return cgf.emitCallExpr(e).getValue(); + return 
cgf.emitCallExpr(e).getComplexValue(); } mlir::Value ComplexExprEmitter::VisitCastExpr(CastExpr *e) { diff --git a/clang/lib/CIR/CodeGen/CIRGenValue.h b/clang/lib/CIR/CodeGen/CIRGenValue.h index 0a6dba5e80a62..0832c4141a10f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenValue.h +++ b/clang/lib/CIR/CodeGen/CIRGenValue.h @@ -58,6 +58,12 @@ class RValue { return value; } + /// Return the value of this complex value. + mlir::Value getComplexValue() const { + assert(isComplex() && "Not a complex!"); + return value; + } + /// Return the value of the address of the aggregate. Address getAggregateAddress() const { assert(isAggregate() && "Not an aggregate!"); diff --git a/clang/test/CIR/CodeGen/complex-builtins.cpp b/clang/test/CIR/CodeGen/complex-builtins.cpp index f0d12d0ef6663..811af47a704f5 100644 --- a/clang/test/CIR/CodeGen/complex-builtins.cpp +++ b/clang/test/CIR/CodeGen/complex-builtins.cpp @@ -83,3 +83,39 @@ void foo3() { // OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { double, double }, ptr %[[COMPLEX]], i32 0, i32 1 // OGCG: %[[A_IMAG:.*]] = load double, ptr %[[A_IMAG_PTR]], align 8 // OGCG: store double %[[A_IMAG]], ptr %[[INIT]], align 8 + +void foo4() { + float _Complex a; + float _Complex b = __builtin_conjf(a); +} + +// CIR: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float +// CIR: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex +// CIR: cir.store{{.*}} %[[RESULT_VAL]], %[[RESULT]] : !cir.complex, !cir.ptr> + +// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[RESULT:.*]] = alloca 
{ float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[COMPLEX]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[IMAG_MINUS:.*]] = fneg float %[[IMAG]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG_MINUS]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[RESULT]], align 4 + +// OGCG: %[[COMPLEX:.*]] = alloca { float, float }, align 4 +// OGCG: %[[RESULT:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_IMAG_MINUS:.*]] = fneg float %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 From a7f595efd840f7ed2210f2703048fad4d0027fac Mon Sep 17 00:00:00 2001 From: nvptm Date: Thu, 17 Jul 2025 09:04:44 -0700 Subject: [PATCH 194/813] [flang][acc] Create UseDeviceOp for both results of hlfir.declare (#148017) A sample such as ``` program test integer :: N = 100 real*8 :: b(-1:N) !$acc data copy(b) !$acc host_data use_device(b) call vadd(b) !$acc end host_data !$acc end data end ``` is lowered to ``` %13:2 = hlfir.declare %11(%12) {uniq_name = "_QFEb"} : (!fir.ref>, !fir.shapeshift<1>) -> (!fir.box>, !fir.ref>) %14 = acc.copyin 
var(%13#0 : !fir.box>) -> !fir.box> {dataClause = #acc, name = "b"} acc.data dataOperands(%14 : !fir.box>) { %15 = acc.use_device var(%13#0 : !fir.box>) -> !fir.box> {name = "b"} acc.host_data dataOperands(%15 : !fir.box>) { fir.call @_QPvadd(%13#1) fastmath : (!fir.ref>) -> () acc.terminator } acc.terminator } acc.copyout accVar(%14 : !fir.box>) to var(%13#0 : !fir.box>) {dataClause = #acc, name = "b"} ``` Note that while the use_device clause is applied to %13#0, the argument passed to vadd is %13#1. To avoid problems later in lowering, this change additionally applies the use_device clause to %13#1, so that the resulting MLIR is ``` %13:2 = hlfir.declare %11(%12) {uniq_name = "_QFEb"} : (!fir.ref>, !fir.shapeshift<1>) -> (!fir.box>, !fir.ref>) %14 = acc.copyin var(%13#0 : !fir.box>) -> !fir.box> {dataClause = #acc, name = "b"} acc.data dataOperands(%14 : !fir.box>) { %15 = acc.use_device var(%13#0 : !fir.box>) -> !fir.box> {name = "b"} %16 = acc.use_device varPtr(%13#1 : !fir.ref>) -> !fir.ref> {name = "b"} acc.host_data dataOperands(%15, %16 : !fir.box>, !fir.ref>) { fir.call @_QPvadd(%13#1) fastmath : (!fir.ref>) -> () acc.terminator } acc.terminator } acc.copyout accVar(%14 : !fir.box>) to var(%13#0 : !fir.box>) {dataClause = #acc, name = "b"} ``` --- flang/lib/Lower/OpenACC.cpp | 20 +++++- .../acc-host-data-unwrap-defaultbounds.f90 | 14 +++-- flang/test/Lower/OpenACC/acc-host-data.f90 | 21 ++++--- flang/test/Lower/OpenACC/acc-use-device.f90 | 61 +++++++++++++++++++ 4 files changed, 100 insertions(+), 16 deletions(-) create mode 100644 flang/test/Lower/OpenACC/acc-use-device.f90 diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 39e4444cde4e3..25682cba5620e 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -708,6 +708,7 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, bool setDeclareAttr = false) { fir::FirOpBuilder &builder = converter.getFirOpBuilder(); 
Fortran::evaluate::ExpressionAnalyzer ea{semanticsContext}; + const bool unwrapBoxAddr = true; for (const auto &accObject : objectList.v) { llvm::SmallVector bounds; std::stringstream asFortran; @@ -735,8 +736,25 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList, Op op = createDataEntryOp( builder, operandLocation, baseAddr, asFortran, bounds, structured, implicit, dataClause, baseAddr.getType(), async, asyncDeviceTypes, - asyncOnlyDeviceTypes, /*unwrapBoxAddr=*/true, info.isPresent); + asyncOnlyDeviceTypes, unwrapBoxAddr, info.isPresent); dataOperands.push_back(op.getAccVar()); + + // For UseDeviceOp, if operand is one of a pair resulting from a + // declare operation, create a UseDeviceOp for the other operand as well. + if constexpr (std::is_same_v) { + if (auto declareOp = + mlir::dyn_cast(baseAddr.getDefiningOp())) { + mlir::Value otherAddr = declareOp.getResult(1); + if (baseAddr != otherAddr) { + Op op = createDataEntryOp(builder, operandLocation, otherAddr, + asFortran, bounds, structured, implicit, + dataClause, otherAddr.getType(), async, + asyncDeviceTypes, asyncOnlyDeviceTypes, + unwrapBoxAddr, info.isPresent); + dataOperands.push_back(op.getAccVar()); + } + } + } } } diff --git a/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 b/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 index 164eb32a8f684..2de7cc5761a2b 100644 --- a/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 +++ b/flang/test/Lower/OpenACC/acc-host-data-unwrap-defaultbounds.f90 @@ -15,15 +15,17 @@ subroutine acc_host_data() !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! 
CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} + ! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>, !fir.ref>) !$acc host_data use_device(a) if_present !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) { +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>{{.*}}) { ! CHECK: } attributes {ifPresent} !$acc host_data use_device(a) if(ifCondition) @@ -33,14 +35,14 @@ subroutine acc_host_data() ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} ! CHECK: %[[LOAD_IFCOND:.*]] = fir.load %[[DECLIFCOND]]#0 : !fir.ref> ! CHECK: %[[IFCOND_I1:.*]] = fir.convert %[[LOAD_IFCOND]] : (!fir.logical<4>) -> i1 -! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.true.) !$acc end host_data ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%{{.*}} : index) upperbound(%{{.*}} : index) stride(%{{.*}} : index) startIdx(%{{.*}} : index) ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! 
CHECK: acc.host_data dataOperands(%[[DA]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.false.) a = 1.0 diff --git a/flang/test/Lower/OpenACC/acc-host-data.f90 b/flang/test/Lower/OpenACC/acc-host-data.f90 index 871eabd256ca6..4d09b25b983b9 100644 --- a/flang/test/Lower/OpenACC/acc-host-data.f90 +++ b/flang/test/Lower/OpenACC/acc-host-data.f90 @@ -14,34 +14,37 @@ subroutine acc_host_data() !$acc host_data use_device(a) !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>, !fir.ref>) !$acc host_data use_device(a) if_present !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) { +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: acc.host_data dataOperands(%[[DA0]], %[[DA1]] : !fir.ref>, !fir.ref>) ! CHECK: } attributes {ifPresent} - !$acc host_data use_device(a) if_present if_present + !$acc host_data use_device(a) if_present !$acc end host_data -! CHECK: acc.host_data dataOperands(%{{.*}} : !fir.ref>) { +! CHECK: acc.host_data dataOperands(%{{.*}}{{.*}} : !fir.ref>{{.*}}) { ! CHECK: } attributes {ifPresent} !$acc host_data use_device(a) if(ifCondition) !$acc end host_data -! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! CHECK: %[[DA0:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} +! 
CHECK: %[[DA1:.*]] = acc.use_device varPtr(%[[DECLA]]#1 : !fir.ref>) -> !fir.ref> {name = "a"} ! CHECK: %[[LOAD_IFCOND:.*]] = fir.load %[[DECLIFCOND]]#0 : !fir.ref> ! CHECK: %[[IFCOND_I1:.*]] = fir.convert %[[LOAD_IFCOND]] : (!fir.logical<4>) -> i1 -! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data if(%[[IFCOND_I1]]) dataOperands(%[[DA0]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.true.) !$acc end host_data ! CHECK: %[[DA:.*]] = acc.use_device varPtr(%[[DECLA]]#0 : !fir.ref>) -> !fir.ref> {name = "a"} -! CHECK: acc.host_data dataOperands(%[[DA]] : !fir.ref>) +! CHECK: acc.host_data dataOperands(%[[DA]]{{.*}} : !fir.ref>{{.*}}) !$acc host_data use_device(a) if(.false.) a = 1.0 diff --git a/flang/test/Lower/OpenACC/acc-use-device.f90 b/flang/test/Lower/OpenACC/acc-use-device.f90 new file mode 100644 index 0000000000000..081a6e317bfc9 --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-use-device.f90 @@ -0,0 +1,61 @@ +! This test checks whether the OpenACC use_device clause is applied on both results of hlfir.declare. + +! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s + +! Test for automatic variable appearing in use_device clause. +subroutine test() + integer :: N = 100 + real*8 :: b(-1:N) +! CHECK: %[[A0:.*]] = fir.alloca !fir.array, %{{.*}} {bindc_name = "b", uniq_name = "_QFtestEb"} +! CHECK: %[[A1:.*]] = fir.shape_shift {{.*}} : (index, index) -> !fir.shapeshift<1> +! CHECK: %[[A:.*]]:2 = hlfir.declare %[[A0]](%[[A1]]) {uniq_name = "_QFtestEb"} : (!fir.ref>, !fir.shapeshift<1>) -> (!fir.box>, !fir.ref>) + + !$acc data copy(b) +! CHECK: %[[B:.*]] = acc.copyin var(%[[A]]#0 : !fir.box>) -> !fir.box> {dataClause = #acc, name = "b"} +! CHECK: acc.data dataOperands(%[[B]] : !fir.box>) { + + !$acc host_data use_device(b) + call vadd(b) + !$acc end host_data +! CHECK: %[[C:.*]] = acc.use_device var(%[[A]]#0 : !fir.box>) -> !fir.box> {name = "b"} +! 
CHECK: %[[D:.*]] = acc.use_device varPtr(%[[A]]#1 : !fir.ref>) -> !fir.ref> {name = "b"} +! CHECK: acc.host_data dataOperands(%[[C]], %[[D]] : !fir.box>, !fir.ref>) { +! CHECK: fir.call @_QPvadd(%[[A]]#1) fastmath : (!fir.ref>) -> () + !$acc end data +! CHECK: acc.copyout accVar(%[[B]] : !fir.box>) to var(%[[A]]#0 : !fir.box>) {dataClause = #acc, name = "b"} +end + +! Test for allocatable, pointer and assumed-shape variables appearing in use_device clause. +subroutine test2(a, b, c) + integer :: N = 100 + real*8, allocatable :: a(:) + real*8, target, allocatable :: d(:) + real*8 :: b(:) + real*8, pointer :: c(:) + call allocate(a(N)) + call allocate(d(N)) + c => d +! CHECK: %[[DS:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[E:.*]]:2 = hlfir.declare %arg0 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest2Ea"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[F:.*]]:2 = hlfir.declare %arg1 dummy_scope %[[DS]] {uniq_name = "_QFtest2Eb"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +! CHECK: %[[G:.*]]:2 = hlfir.declare %arg2 dummy_scope %[[DS]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest2Ec"} : (!fir.ref>>>, !fir.dscope) -> (!fir.ref>>>, !fir.ref>>>) + + !$acc data copy(a,b,c,d) + !$acc host_data use_device(a,b,c) + call vadd2(a,b,c) + !$acc end host_data + +! CHECK: %[[H:.*]] = acc.use_device varPtr(%[[E]]#0 : !fir.ref>>>) -> !fir.ref>>> {name = "a"} +! CHECK: %[[I:.*]] = acc.use_device varPtr(%[[E]]#1 : !fir.ref>>>) -> !fir.ref>>> {name = "a"} +! CHECK: %[[J:.*]] = acc.use_device var(%[[F]]#0 : !fir.box>) -> !fir.box> {name = "b"} +! CHECK: %[[K:.*]] = acc.use_device var(%[[F]]#1 : !fir.box>) -> !fir.box> {name = "b"} +! CHECK: %[[L:.*]] = acc.use_device varPtr(%[[G]]#0 : !fir.ref>>>) -> !fir.ref>>> {name = "c"} +! CHECK: %[[M:.*]] = acc.use_device varPtr(%[[G]]#1 : !fir.ref>>>) -> !fir.ref>>> {name = "c"} +! 
CHECK: acc.host_data dataOperands(%[[H]], %[[I]], %[[J]], %[[K]], %[[L]], %[[M]] : !fir.ref>>>, !fir.ref>>>, !fir.box>, !fir.box>, !fir.ref>>>, !fir.ref>>>) { + + + + + !$acc end data + +end From 7caf12da0bb09d6b6992bf42afd256d453753dcb Mon Sep 17 00:00:00 2001 From: Jeremy Kun Date: Thu, 17 Jul 2025 09:09:12 -0700 Subject: [PATCH 195/813] [mlir][core] Add an MLIR "pattern catalog" generator (#146228) This PR adds a feature that attaches a listener to all RewritePatterns that logs information about the modified operations. When the MLIR test suite is run, these debug outputs can be filtered and combined into an index linking operations to the patterns that insert, modify, or replace them. This index is intended to be used to create a website that allows one to look up patterns from an operation name. The debug logs emitted can be viewed with --debug-only=generate-pattern-catalog, and the lit config is modified to do this when the env var MLIR_GENERATE_PATTERN_CATALOG is set. Example usage: ``` mkdir build && cd build cmake -G Ninja ../llvm \ -DLLVM_ENABLE_PROJECTS="mlir" \ -DLLVM_TARGETS_TO_BUILD="host" \ -DCMAKE_BUILD_TYPE=DEBUG ninja -j 24 check-mlir MLIR_GENERATE_PATTERN_CATALOG=1 bin/llvm-lit -j 24 -v -a tools/mlir/test | grep 'pattern-logging-listener' | sed 's/^# | [pattern-logging-listener] //g' | sort | uniq > pattern_catalog.txt ``` Sample pattern catalog output (that fits in a gist): https://gist.github.com/j2kun/02d1ab8d31c10d71027724984c89905a --------- Co-authored-by: Jeremy Kun Co-authored-by: Mehdi Amini --- mlir/include/mlir/IR/PatternMatch.h | 19 +++++++ mlir/lib/IR/CMakeLists.txt | 1 + mlir/lib/IR/PatternLoggingListener.cpp | 50 +++++++++++++++++++ mlir/lib/Rewrite/PatternApplicator.cpp | 16 +++++- .../IR/test-pattern-logging-listener.mlir | 17 +++++++ mlir/test/lit.cfg.py | 11 ++++ 6 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 mlir/lib/IR/PatternLoggingListener.cpp create mode 100644 
mlir/test/IR/test-pattern-logging-listener.mlir diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index afeb784b85a12..3a2dbd136b438 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -475,6 +475,25 @@ class RewriterBase : public OpBuilder { RewriterBase::Listener *rewriteListener; }; + /// A listener that logs notification events to llvm::dbgs() before + /// forwarding to the base listener. + struct PatternLoggingListener : public RewriterBase::ForwardingListener { + PatternLoggingListener(OpBuilder::Listener *listener, StringRef patternName) + : RewriterBase::ForwardingListener(listener), patternName(patternName) { + } + + void notifyOperationInserted(Operation *op, InsertPoint previous) override; + void notifyOperationModified(Operation *op) override; + void notifyOperationReplaced(Operation *op, Operation *newOp) override; + void notifyOperationReplaced(Operation *op, + ValueRange replacement) override; + void notifyOperationErased(Operation *op) override; + void notifyPatternBegin(const Pattern &pattern, Operation *op) override; + + private: + StringRef patternName; + }; + /// Move the blocks that belong to "region" before the given position in /// another region "parent". The two regions must be different. 
The caller /// is responsible for creating or updating the operation transferring flow diff --git a/mlir/lib/IR/CMakeLists.txt b/mlir/lib/IR/CMakeLists.txt index 4cabac185171c..3ef69cea18f0a 100644 --- a/mlir/lib/IR/CMakeLists.txt +++ b/mlir/lib/IR/CMakeLists.txt @@ -29,6 +29,7 @@ add_mlir_library(MLIRIR ODSSupport.cpp Operation.cpp OperationSupport.cpp + PatternLoggingListener.cpp PatternMatch.cpp Region.cpp RegionKindInterface.cpp diff --git a/mlir/lib/IR/PatternLoggingListener.cpp b/mlir/lib/IR/PatternLoggingListener.cpp new file mode 100644 index 0000000000000..ce2123ae1a19a --- /dev/null +++ b/mlir/lib/IR/PatternLoggingListener.cpp @@ -0,0 +1,50 @@ +#include "mlir/IR/PatternMatch.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "pattern-logging-listener" +#define DBGS() (llvm::dbgs() << "[" << DEBUG_TYPE << "] ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + +using namespace mlir; + +void RewriterBase::PatternLoggingListener::notifyOperationInserted( + Operation *op, InsertPoint previous) { + LDBG(patternName << " | notifyOperationInserted" + << " | " << op->getName()); + ForwardingListener::notifyOperationInserted(op, previous); +} + +void RewriterBase::PatternLoggingListener::notifyOperationModified( + Operation *op) { + LDBG(patternName << " | notifyOperationModified" + << " | " << op->getName()); + ForwardingListener::notifyOperationModified(op); +} + +void RewriterBase::PatternLoggingListener::notifyOperationReplaced( + Operation *op, Operation *newOp) { + LDBG(patternName << " | notifyOperationReplaced (with op)" + << " | " << op->getName() << " | " << newOp->getName()); + ForwardingListener::notifyOperationReplaced(op, newOp); +} + +void RewriterBase::PatternLoggingListener::notifyOperationReplaced( + Operation *op, ValueRange replacement) { + LDBG(patternName << " | notifyOperationReplaced (with values)" + << " | " << op->getName()); + ForwardingListener::notifyOperationReplaced(op, replacement); +} + +void 
RewriterBase::PatternLoggingListener::notifyOperationErased( + Operation *op) { + LDBG(patternName << " | notifyOperationErased" + << " | " << op->getName()); + ForwardingListener::notifyOperationErased(op); +} + +void RewriterBase::PatternLoggingListener::notifyPatternBegin( + const Pattern &pattern, Operation *op) { + LDBG(patternName << " | notifyPatternBegin" + << " | " << op->getName()); + ForwardingListener::notifyPatternBegin(pattern, op); +} diff --git a/mlir/lib/Rewrite/PatternApplicator.cpp b/mlir/lib/Rewrite/PatternApplicator.cpp index 4a12183492fd4..b2b372b7b1249 100644 --- a/mlir/lib/Rewrite/PatternApplicator.cpp +++ b/mlir/lib/Rewrite/PatternApplicator.cpp @@ -15,6 +15,10 @@ #include "ByteCode.h" #include "llvm/Support/Debug.h" +#ifndef NDEBUG +#include "llvm/ADT/ScopeExit.h" +#endif + #define DEBUG_TYPE "pattern-application" using namespace mlir; @@ -206,11 +210,19 @@ LogicalResult PatternApplicator::matchAndRewrite( } else { LLVM_DEBUG(llvm::dbgs() << "Trying to match \"" << bestPattern->getDebugName() << "\"\n"); - const auto *pattern = static_cast(bestPattern); - result = pattern->matchAndRewrite(op, rewriter); +#ifndef NDEBUG + OpBuilder::Listener *oldListener = rewriter.getListener(); + auto loggingListener = + std::make_unique( + oldListener, pattern->getDebugName()); + rewriter.setListener(loggingListener.get()); + auto resetListenerCallback = llvm::make_scope_exit( + [&] { rewriter.setListener(oldListener); }); +#endif + result = pattern->matchAndRewrite(op, rewriter); LLVM_DEBUG(llvm::dbgs() << "\"" << bestPattern->getDebugName() << "\" result " << succeeded(result) << "\n"); diff --git a/mlir/test/IR/test-pattern-logging-listener.mlir b/mlir/test/IR/test-pattern-logging-listener.mlir new file mode 100644 index 0000000000000..a1d27741a0723 --- /dev/null +++ b/mlir/test/IR/test-pattern-logging-listener.mlir @@ -0,0 +1,17 @@ +// RUN: mlir-opt %s --test-walk-pattern-rewrite-driver \ +// RUN: --allow-unregistered-dialect 
--debug-only=pattern-logging-listener 2>&1 | FileCheck %s + +// Check that when replacing an op with a new op, we get appropriate +// pattern-logging lines. The regex is because the anonymous namespace is +// printed differently on different platforms. + +// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationInserted | test.new_op +// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationReplaced (with values) | test.replace_with_new_op +// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationModified | arith.addi +// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationModified | arith.addi +// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationErased | test.replace_with_new_op +func.func @replace_with_new_op() -> i32 { + %a = "test.replace_with_new_op"() : () -> (i32) + %res = arith.addi %a, %a : i32 + return %res : i32 +} diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 9b5cadd62befc..233fef8ec4296 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -301,6 +301,17 @@ def find_real_python_interpreter(): ToolSubst("mlir-opt", "mlir-opt --verify-roundtrip", unresolved="fatal"), ] ) +elif "MLIR_GENERATE_PATTERN_CATALOG" in os.environ: + tools.extend( + [ + ToolSubst( + "mlir-opt", + "mlir-opt --debug-only=pattern-logging-listener --mlir-disable-threading", + unresolved="fatal", + ), + ToolSubst("FileCheck", "FileCheck --dump-input=always", unresolved="fatal"), + ] + ) else: tools.extend(["mlir-opt"]) From d97c224e8cbba9158ebda6f12f9a06b09534ae29 Mon Sep 17 00:00:00 2001 From: Connector Switch Date: Fri, 18 Jul 2025 00:10:07 +0800 Subject: [PATCH 196/813] [libc][NFC]: Correct some comments about SDCOMP-26094. 
(#149317) --- libc/test/src/math/cospif_test.cpp | 2 +- libc/test/src/math/sincosf_test.cpp | 2 +- libc/test/src/math/sinpif_test.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/test/src/math/cospif_test.cpp b/libc/test/src/math/cospif_test.cpp index cb88bfcade0dc..5c30fb7c8718f 100644 --- a/libc/test/src/math/cospif_test.cpp +++ b/libc/test/src/math/cospif_test.cpp @@ -100,7 +100,7 @@ TEST_F(LlvmLibcCospifTest, SmallValues) { LIBC_NAMESPACE::cospif(x), 0.5); } -// SDCOMP-26094: check sinfpi in the cases for which the range reducer +// SDCOMP-26094: check cospif in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. TEST_F(LlvmLibcCospifTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp index ad2155f329cd9..4aac1fabfbd62 100644 --- a/libc/test/src/math/sincosf_test.cpp +++ b/libc/test/src/math/sincosf_test.cpp @@ -164,7 +164,7 @@ TEST_F(LlvmLibcSinCosfTest, SpecialValues) { } } -// SDCOMP-26094: check sinf in the cases for which the range reducer +// SDCOMP-26094: check sincosf in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. TEST_F(LlvmLibcSinCosfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { diff --git a/libc/test/src/math/sinpif_test.cpp b/libc/test/src/math/sinpif_test.cpp index 986c676761f0e..94e3dbc4f07d4 100644 --- a/libc/test/src/math/sinpif_test.cpp +++ b/libc/test/src/math/sinpif_test.cpp @@ -100,7 +100,7 @@ TEST_F(LlvmLibcSinpifTest, SmallValues) { LIBC_NAMESPACE::sinpif(x), 0.5); } -// SDCOMP-26094: check sinfpi in the cases for which the range reducer +// SDCOMP-26094: check sinpif in the cases for which the range reducer // returns values furthest beyond its nominal upper bound of pi/4. 
TEST_F(LlvmLibcSinpifTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { From 011d38bdac95647a872a5faa339465e26535df35 Mon Sep 17 00:00:00 2001 From: erichkeane Date: Thu, 17 Jul 2025 07:43:14 -0700 Subject: [PATCH 197/813] [OpenACC] Update OpenACC macro, remove override macro As we are now Sema-complete for OpenACC 3.4 (and thus have a conforming implementation, in all modes), we can now set the _OPENACC macro correctly. Additionally, we remove the temporary 'override' functionality, which was intended to allow people to experiment with this. We aren't having a deprecation period as OpenACC support is still considered experimental. --- clang/include/clang/Basic/LangOptions.h | 5 ----- clang/include/clang/Driver/Options.td | 13 ------------- clang/lib/Driver/ToolChains/Clang.cpp | 9 --------- clang/lib/Frontend/CompilerInvocation.cpp | 12 ++---------- clang/lib/Frontend/InitPreprocessor.cpp | 12 ++---------- clang/test/Driver/openacc.c | 12 ------------ clang/test/Preprocessor/openacc.c | 8 ++------ 7 files changed, 6 insertions(+), 65 deletions(-) diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 937cbff4e3ea3..0407897359b5e 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -633,11 +633,6 @@ class LangOptions : public LangOptionsBase { // received as a result of a standard operator new (-fcheck-new) bool CheckNew = false; - // In OpenACC mode, contains a user provided override for the _OPENACC macro. - // This exists so that we can override the macro value and test our incomplete - // implementation on real-world examples. - std::string OpenACCMacroOverride; - /// The HLSL root signature version for dxil. 
llvm::dxbc::RootSignatureVersion HLSLRootSigVer = llvm::dxbc::RootSignatureVersion::V1_1; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index a8c1b5dd8ab3b..6c22f06b269fb 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1422,19 +1422,6 @@ def fno_hip_emit_relocatable : Flag<["-"], "fno-hip-emit-relocatable">, HelpText<"Do not override toolchain to compile HIP source to relocatable">; } -// Clang specific/exclusive options for OpenACC. -def openacc_macro_override - : Separate<["-"], "fexperimental-openacc-macro-override">, - Visibility<[ClangOption, CC1Option]>, - Group, - HelpText<"Overrides the _OPENACC macro value for experimental testing " - "during OpenACC support development">; -def openacc_macro_override_EQ - : Joined<["-"], "fexperimental-openacc-macro-override=">, - Alias; - -// End Clang specific/exclusive options for OpenACC. - def libomptarget_amdgpu_bc_path_EQ : Joined<["--"], "libomptarget-amdgpu-bc-path=">, Group, HelpText<"Path to libomptarget-amdgcn bitcode library">; def libomptarget_amdgcn_bc_path_EQ : Joined<["--"], "libomptarget-amdgcn-bc-path=">, Group, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 456bfe885f354..8880c9375143f 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3846,15 +3846,6 @@ static void RenderOpenACCOptions(const Driver &D, const ArgList &Args, return; CmdArgs.push_back("-fopenacc"); - - if (Arg *A = Args.getLastArg(options::OPT_openacc_macro_override)) { - StringRef Value = A->getValue(); - int Version; - if (!Value.getAsInteger(10, Version)) - A->renderAsInput(Args, CmdArgs); - else - D.Diag(diag::err_drv_clang_unsupported) << Value; - } } static void RenderBuiltinOptions(const ToolChain &TC, const llvm::Triple &T, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 
6ab36d8675966..3a36250da57a3 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3913,12 +3913,8 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, if (Opts.OpenMPCUDAMode) GenerateArg(Consumer, OPT_fopenmp_cuda_mode); - if (Opts.OpenACC) { + if (Opts.OpenACC) GenerateArg(Consumer, OPT_fopenacc); - if (!Opts.OpenACCMacroOverride.empty()) - GenerateArg(Consumer, OPT_openacc_macro_override, - Opts.OpenACCMacroOverride); - } // The arguments used to set Optimize, OptimizeSize and NoInlineDefine are // generated from CodeGenOptions. @@ -4424,13 +4420,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Args.hasArg(options::OPT_fopenmp_cuda_mode); // OpenACC Configuration. - if (Args.hasArg(options::OPT_fopenacc)) { + if (Args.hasArg(options::OPT_fopenacc)) Opts.OpenACC = true; - if (Arg *A = Args.getLastArg(options::OPT_openacc_macro_override)) - Opts.OpenACCMacroOverride = A->getValue(); - } - if (Arg *A = Args.getLastArg(OPT_ffp_contract)) { StringRef Val = A->getValue(); if (Val == "fast") diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 38b2e0cf1ca59..382ccd610946c 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -639,16 +639,8 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, } } - if (LangOpts.OpenACC) { - // FIXME: When we have full support for OpenACC, we should set this to the - // version we support. Until then, set as '1' by default, but provide a - // temporary mechanism for users to override this so real-world examples can - // be tested against. 
- if (!LangOpts.OpenACCMacroOverride.empty()) - Builder.defineMacro("_OPENACC", LangOpts.OpenACCMacroOverride); - else - Builder.defineMacro("_OPENACC", "1"); - } + if (LangOpts.OpenACC) + Builder.defineMacro("_OPENACC", "202506"); } /// Initialize the predefined C++ language feature test macros defined in diff --git a/clang/test/Driver/openacc.c b/clang/test/Driver/openacc.c index c7f1d2545bd03..f46e2a32bcab2 100644 --- a/clang/test/Driver/openacc.c +++ b/clang/test/Driver/openacc.c @@ -1,14 +1,2 @@ // RUN: %clang -S -### -fopenacc %s 2>&1 | FileCheck %s --check-prefix=CHECK-DRIVER // CHECK-DRIVER: "-cc1" {{.*}} "-fopenacc" - -// RUN: %clang -S -### -fopenacc -fexperimental-openacc-macro-override=202211 %s 2>&1 | FileCheck %s --check-prefix=CHECK-MACRO-OVERRIDE -// RUN: %clang -S -### -fopenacc -fexperimental-openacc-macro-override 202211 %s 2>&1 | FileCheck %s --check-prefix=CHECK-MACRO-OVERRIDE -// CHECK-MACRO-OVERRIDE: "-cc1"{{.*}} "-fexperimental-openacc-macro-override" "202211" - -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=202211L %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override 202211L %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=L202211 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override L202211 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override=2022L11 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// RUN: not %clang -S -fopenacc -fexperimental-openacc-macro-override 2022L11 %s 2>&1 | FileCheck %s --check-prefix=INVALID -// INVALID: error: the clang compiler does not support diff --git a/clang/test/Preprocessor/openacc.c b/clang/test/Preprocessor/openacc.c index be7052f00e0ce..283baa6c2fe4b 100644 --- a/clang/test/Preprocessor/openacc.c +++ 
b/clang/test/Preprocessor/openacc.c @@ -1,13 +1,9 @@ // RUN: %clang_cc1 -E -fopenacc %s | FileCheck %s --check-prefix=DEFAULT -// RUN: %clang_cc1 -E -fopenacc -fexperimental-openacc-macro-override 202211 %s | FileCheck %s --check-prefix=OVERRIDE -// DEFAULT: OpenACC:1: -// OVERRIDE: OpenACC:202211: +// DEFAULT: OpenACC:202506: OpenACC:_OPENACC: // RUN: %clang_cc1 -E -dM -fopenacc %s | FileCheck %s --check-prefix=MACRO_PRINT_DEF -// RUN: %clang_cc1 -E -dM -fopenacc -fexperimental-openacc-macro-override 202211 %s | FileCheck %s --check-prefix=MACRO_PRINT_OVR -// MACRO_PRINT_DEF: #define _OPENACC 1 -// MACRO_PRINT_OVR: #define _OPENACC 202211 +// MACRO_PRINT_DEF: #define _OPENACC 202506 From a6fb3b3c18fd48a2eaaa8c969edbc013b9276a09 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Thu, 17 Jul 2025 09:18:23 -0700 Subject: [PATCH 198/813] [LLDB] Process minidump better error messages (#149206) Prior, Process Minidump would return ``` Status::FromErrorString("could not parse memory info"); ``` For any unsuccessful memory read, with no differentiation between an error in LLDB and the data simply not being present. This lead to a lot of user confusion and overall pretty terrible user experience. To fix this I've refactored the APIs so we can pass an error back in an llvm expected. There were also no shell tests for memory read and process Minidump so I added one. 
--- .../Process/minidump/MinidumpParser.cpp | 34 ++++++++++++--- .../Plugins/Process/minidump/MinidumpParser.h | 3 +- .../Process/minidump/ProcessMinidump.cpp | 9 ++-- .../Shell/Minidump/missing-memory-region.yaml | 42 +++++++++++++++++++ .../Process/minidump/MinidumpParserTest.cpp | 23 +++++----- 5 files changed, 91 insertions(+), 20 deletions(-) create mode 100644 lldb/test/Shell/Minidump/missing-memory-region.yaml diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp index ef691b77193ce..58ebb7be11994 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.cpp @@ -108,13 +108,21 @@ MinidumpParser::GetThreadContext(const minidump::Thread &td) { llvm::ArrayRef MinidumpParser::GetThreadContextWow64(const minidump::Thread &td) { + Log *log = GetLog(LLDBLog::Process); // On Windows, a 32-bit process can run on a 64-bit machine under WOW64. If // the minidump was captured with a 64-bit debugger, then the CONTEXT we just // grabbed from the mini_dump_thread is the one for the 64-bit "native" // process rather than the 32-bit "guest" process we care about. In this // case, we can get the 32-bit CONTEXT from the TEB (Thread Environment // Block) of the 64-bit process. - auto teb_mem = GetMemory(td.EnvironmentBlock, sizeof(TEB64)); + auto teb_mem_maybe = GetMemory(td.EnvironmentBlock, sizeof(TEB64)); + if (!teb_mem_maybe) { + LLDB_LOG_ERROR(log, teb_mem_maybe.takeError(), + "Failed to read Thread Environment Block: {0}"); + return {}; + } + + auto teb_mem = *teb_mem_maybe; if (teb_mem.empty()) return {}; @@ -126,8 +134,16 @@ MinidumpParser::GetThreadContextWow64(const minidump::Thread &td) { // Slot 1 of the thread-local storage in the 64-bit TEB points to a structure // that includes the 32-bit CONTEXT (after a ULONG). 
See: // https://msdn.microsoft.com/en-us/library/ms681670.aspx - auto context = + auto context_maybe = GetMemory(wow64teb->tls_slots[1] + 4, sizeof(MinidumpContext_x86_32)); + if (!context_maybe) { + LLDB_LOG_ERROR(log, context_maybe.takeError(), + "Failed to read WOW Thread Context: {0}"); + return {}; + } + + auto context = *context_maybe; + if (context.size() < sizeof(MinidumpContext_x86_32)) return {}; @@ -478,11 +494,13 @@ void MinidumpParser::PopulateMemoryRanges() { m_memory_ranges.Sort(); } -llvm::ArrayRef MinidumpParser::GetMemory(lldb::addr_t addr, - size_t size) { +llvm::Expected> +MinidumpParser::GetMemory(lldb::addr_t addr, size_t size) { std::optional range = FindMemoryRange(addr); if (!range) - return {}; + return llvm::createStringError( + llvm::inconvertibleErrorCode(), + "No memory range found for address (0x%" PRIx64 ")", addr); // There's at least some overlap between the beginning of the desired range // (addr) and the current range. Figure out where the overlap begins and @@ -491,7 +509,11 @@ llvm::ArrayRef MinidumpParser::GetMemory(lldb::addr_t addr, const size_t offset = addr - range->start; if (addr < range->start || offset >= range->range_ref.size()) - return {}; + return llvm::createStringError( + llvm::inconvertibleErrorCode(), + "Address (0x%" PRIx64 ") is not in range [0x%" PRIx64 " - 0x%" PRIx64 + ")", + addr, range->start, range->start + range->range_ref.size()); const size_t overlap = std::min(size, range->range_ref.size() - offset); return range->range_ref.slice(offset, overlap); diff --git a/lldb/source/Plugins/Process/minidump/MinidumpParser.h b/lldb/source/Plugins/Process/minidump/MinidumpParser.h index 14599f8d572aa..3b7d33daca717 100644 --- a/lldb/source/Plugins/Process/minidump/MinidumpParser.h +++ b/lldb/source/Plugins/Process/minidump/MinidumpParser.h @@ -104,7 +104,8 @@ class MinidumpParser { std::optional FindMemoryRange(lldb::addr_t addr); - llvm::ArrayRef GetMemory(lldb::addr_t addr, size_t size); + llvm::Expected> 
GetMemory(lldb::addr_t addr, + size_t size); /// Returns a list of memory regions and a flag indicating whether the list is /// complete (includes all regions mapped into the process memory). diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp index ef3c00e2857df..17a421a722743 100644 --- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp +++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp @@ -322,12 +322,15 @@ size_t ProcessMinidump::ReadMemory(lldb::addr_t addr, void *buf, size_t size, size_t ProcessMinidump::DoReadMemory(lldb::addr_t addr, void *buf, size_t size, Status &error) { - llvm::ArrayRef mem = m_minidump_parser->GetMemory(addr, size); - if (mem.empty()) { - error = Status::FromErrorString("could not parse memory info"); + llvm::Expected> mem_maybe = + m_minidump_parser->GetMemory(addr, size); + if (!mem_maybe) { + error = Status::FromError(mem_maybe.takeError()); return 0; } + llvm::ArrayRef mem = *mem_maybe; + std::memcpy(buf, mem.data(), mem.size()); return mem.size(); } diff --git a/lldb/test/Shell/Minidump/missing-memory-region.yaml b/lldb/test/Shell/Minidump/missing-memory-region.yaml new file mode 100644 index 0000000000000..1784cacfaf1ba --- /dev/null +++ b/lldb/test/Shell/Minidump/missing-memory-region.yaml @@ -0,0 +1,42 @@ +# Check that looking up a memory region not present in the Minidump fails +# even if it's in the /proc//maps file. 
+ +# RUN: yaml2obj %s -o %t +# RUN: %lldb -c %t -o "memory read 0x5000" 2>&1 | FileCheck %s + +# CHECK-LABEL: (lldb) memory read 0x5000 +# CHECK-NEXT: error: No memory range found for address (0x5000) + +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: AMD64 + Processor Level: 6 + Processor Revision: 15876 + Number of Processors: 40 + Platform ID: Linux + CSD Version: 'Linux 3.13.0-91-generic #138-Ubuntu SMP Fri Jun 24 17:00:34 UTC 2016 x86_64' + CPU: + Vendor ID: GenuineIntel + Version Info: 0x00000000 + Feature Info: 0x00000000 + - Type: LinuxProcStatus + Text: | + Name: test-yaml + Umask: 0002 + State: t (tracing stop) + Pid: 8567 + - Type: LinuxMaps + Text: | + 0x1000-0x1100 r-xp 00000000 00:00 0 + 0x2000-0x2200 rw-p 00000000 00:00 0 + 0x4000-0x6000 rw-- 00000000 00:00 0 + - Type: Memory64List + Memory Ranges: + - Start of Memory Range: 0x1000 + Data Size: 0x100 + Content : '' + - Start of Memory Range: 0x2000 + Data Size: 0x200 + Content : '' +... diff --git a/lldb/unittests/Process/minidump/MinidumpParserTest.cpp b/lldb/unittests/Process/minidump/MinidumpParserTest.cpp index ee31c8e63644b..44f653c6fa135 100644 --- a/lldb/unittests/Process/minidump/MinidumpParserTest.cpp +++ b/lldb/unittests/Process/minidump/MinidumpParserTest.cpp @@ -308,16 +308,19 @@ TEST_F(MinidumpParserTest, GetMemory) { )"), llvm::Succeeded()); - EXPECT_EQ((llvm::ArrayRef{0x54}), parser->GetMemory(0x401d46, 1)); - EXPECT_EQ((llvm::ArrayRef{0x54, 0x21}), - parser->GetMemory(0x401d46, 4)); - - EXPECT_EQ((llvm::ArrayRef{0xc8, 0x4d, 0x04, 0xbc, 0xe9}), - parser->GetMemory(0x7ffceb34a000, 5)); - EXPECT_EQ((llvm::ArrayRef{0xc8, 0x4d, 0x04}), - parser->GetMemory(0x7ffceb34a000, 3)); - - EXPECT_EQ(llvm::ArrayRef(), parser->GetMemory(0x500000, 512)); + EXPECT_THAT_EXPECTED(parser->GetMemory(0x401d46, 1), + llvm::HasValue(llvm::ArrayRef{0x54})); + EXPECT_THAT_EXPECTED(parser->GetMemory(0x401d46, 4), + llvm::HasValue(llvm::ArrayRef{0x54, 0x21})); + EXPECT_THAT_EXPECTED( + 
parser->GetMemory(0x7ffceb34a000, 5), + llvm::HasValue(llvm::ArrayRef{0xc8, 0x4d, 0x04, 0xbc, 0xe9})); + EXPECT_THAT_EXPECTED( + parser->GetMemory(0x7ffceb34a000, 3), + llvm::HasValue(llvm::ArrayRef{0xc8, 0x4d, 0x04})); + EXPECT_THAT_EXPECTED( + parser->GetMemory(0x500000, 512), + llvm::FailedWithMessage("No memory range found for address (0x500000)")); } TEST_F(MinidumpParserTest, FindMemoryRangeWithFullMemoryMinidump) { From e8a891b0f9d2a742ac3904116aaec2c7c9231b24 Mon Sep 17 00:00:00 2001 From: Jonathan Cohen Date: Thu, 17 Jul 2025 19:21:57 +0300 Subject: [PATCH 199/813] [AArch64][Machine-Combiner] Split gather patterns into neon regs to multiple vectors (#142941) This changes optimizes gather-like sequences, where we load values separately into lanes of a neon vector. Since each load has serial dependency, when performing multiple i32 loads into a 128 bit vector for example, it is more profitable to load into separate vector registers and zip them. rdar://151851094 --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 265 +++++++++++++ llvm/lib/Target/AArch64/AArch64InstrInfo.h | 4 + .../AArch64/aarch64-combine-gather-lanes.mir | 364 ++++++++++++++++++ .../complex-deinterleaving-uniform-cases.ll | 134 +++---- llvm/test/CodeGen/AArch64/concat-vector.ll | 5 +- .../AArch64/fp-maximumnum-minimumnum.ll | 50 +-- llvm/test/CodeGen/AArch64/fsh.ll | 113 +++--- llvm/test/CodeGen/AArch64/llvm.frexp.ll | 14 +- llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 345 +++++++++-------- llvm/test/CodeGen/AArch64/nontemporal.ll | 48 +-- 10 files changed, 996 insertions(+), 346 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 996b0edd24200..bc57537ad5dfb 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,6 +20,7 @@ #include "Utils/AArch64BaseInfo.h" #include 
"llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" @@ -35,6 +36,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DebugInfoMetadata.h" @@ -7351,6 +7353,9 @@ bool AArch64InstrInfo::isThroughputPattern(unsigned Pattern) const { case AArch64MachineCombinerPattern::MULSUBv2i32_indexed_OP2: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP1: case AArch64MachineCombinerPattern::MULSUBv4i32_indexed_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return true; } // end switch (Pattern) return false; @@ -7391,11 +7396,252 @@ static bool getMiscPatterns(MachineInstr &Root, return false; } +static bool getGatherPattern(MachineInstr &Root, + SmallVectorImpl &Patterns, + unsigned LoadLaneOpCode, unsigned NumLanes) { + const MachineFunction *MF = Root.getMF(); + + // Early exit if optimizing for size. + if (MF->getFunction().hasMinSize()) + return false; + + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + // The root of the pattern must load into the last lane of the vector. + if (Root.getOperand(2).getImm() != NumLanes - 1) + return false; + + // Check that we have load into all lanes except lane 0. + // For each load we also want to check that: + // 1. It has a single non-debug use (since we will be replacing the virtual + // register) + // 2. That the addressing mode only uses a single offset register. 
+ auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + auto Range = llvm::seq(1, NumLanes - 1); + SmallSet RemainingLanes(Range.begin(), Range.end()); + while (!RemainingLanes.empty() && CurrInstr && + CurrInstr->getOpcode() == LoadLaneOpCode && + MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && + CurrInstr->getNumOperands() == 4) { + RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + if (!RemainingLanes.empty()) + return false; + + // Match the SUBREG_TO_REG sequence. + if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) + return false; + + // Verify that the subreg to reg loads an integer into the first lane. + auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); + unsigned SingleLaneSizeInBits = 128 / NumLanes; + if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) + return false; + + // Verify that it also has a single non debug use. + if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) + return false; + + switch (NumLanes) { + case 4: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); + break; + case 8: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); + break; + case 16: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); + break; + default: + llvm_unreachable("Got bad number of lanes for gather pattern."); + } + + return true; +} + +/// Search for patterns where we use LD1 instructions to load into +/// separate lanes of an 128 bit Neon register. We can increase Memory Level +/// Parallelism by loading into 2 Neon registers instead. +static bool getLoadPatterns(MachineInstr &Root, + SmallVectorImpl &Patterns) { + + // The pattern searches for loads into single lanes. 
+ switch (Root.getOpcode()) { + case AArch64::LD1i32: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 4); + case AArch64::LD1i16: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 8); + case AArch64::LD1i8: + return getGatherPattern(Root, Patterns, Root.getOpcode(), 16); + default: + return false; + } +} + +static void +generateGatherPattern(MachineInstr &Root, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg, + unsigned Pattern, unsigned NumLanes) { + + MachineFunction &MF = *Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + // Gather the initial load instructions to build the pattern + SmallVector LoadToLaneInstrs; + MachineInstr *CurrInstr = &Root; + for (unsigned i = 0; i < NumLanes - 1; ++i) { + LoadToLaneInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Sort the load instructions according to the lane. 
+ llvm::sort(LoadToLaneInstrs, + [](const MachineInstr *A, const MachineInstr *B) { + return A->getOperand(2).getImm() > B->getOperand(2).getImm(); + }); + + MachineInstr *SubregToReg = CurrInstr; + LoadToLaneInstrs.push_back( + MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); + auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); + + const TargetRegisterClass *FPR128RegClass = + MRI.getRegClass(Root.getOperand(0).getReg()); + + auto LoadLaneToRegister = [&](MachineInstr *OriginalInstr, + Register SrcRegister, unsigned Lane, + Register OffsetRegister) { + auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); + MachineInstrBuilder LoadIndexIntoRegister = + BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), + NewRegister) + .addReg(SrcRegister) + .addImm(Lane) + .addReg(OffsetRegister, getKillRegState(true)); + InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); + InsInstrs.push_back(LoadIndexIntoRegister); + return NewRegister; + }; + + // Helper to create load instruction based on opcode + auto CreateLoadInstruction = [&](unsigned NumLanes, Register DestReg, + Register OffsetReg) -> MachineInstrBuilder { + unsigned Opcode; + switch (NumLanes) { + case 4: + Opcode = AArch64::LDRSui; + break; + case 8: + Opcode = AArch64::LDRHui; + break; + case 16: + Opcode = AArch64::LDRBui; + break; + default: + llvm_unreachable( + "Got unsupported number of lanes in machine-combiner gather pattern"); + } + // Immediate offset load + return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) + .addReg(OffsetReg) + .addImm(0); // immediate offset + }; + + // Load the remaining lanes into register 0. 
+ auto LanesToLoadToReg0 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, + LoadToLaneInstrsAscending.begin() + NumLanes / 2); + auto PrevReg = SubregToReg->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg0 = PrevReg; + + // First load into register 1. Perform a LDRSui to zero out the upper lanes in + // a single instruction. + auto Lane0Load = *LoadToLaneInstrsAscending.begin(); + auto OriginalSplitLoad = + *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); + auto DestRegForMiddleIndex = MRI.createVirtualRegister( + MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + + MachineInstrBuilder MiddleIndexLoadInstr = + CreateLoadInstruction(NumLanes, DestRegForMiddleIndex, + OriginalSplitLoad->getOperand(3).getReg()); + + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); + InsInstrs.push_back(MiddleIndexLoadInstr); + DelInstrs.push_back(OriginalSplitLoad); + + // Subreg To Reg instruction for register 1. + auto DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + unsigned SubregType; + switch (NumLanes) { + case 4: + SubregType = AArch64::ssub; + break; + case 8: + SubregType = AArch64::hsub; + break; + case 16: + SubregType = AArch64::bsub; + break; + default: + llvm_unreachable( + "Got invalid NumLanes for machine-combiner gather pattern"); + } + + auto SubRegToRegInstr = + BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), + DestRegForSubregToReg) + .addImm(0) + .addReg(DestRegForMiddleIndex, getKillRegState(true)) + .addImm(SubregType); + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForSubregToReg, InsInstrs.size())); + InsInstrs.push_back(SubRegToRegInstr); + + // Load remaining lanes into register 1. 
+ auto LanesToLoadToReg1 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, + LoadToLaneInstrsAscending.end()); + PrevReg = SubRegToRegInstr->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + PrevReg = LoadLaneToRegister(LoadInstr, PrevReg, Index + 1, + LoadInstr->getOperand(3).getReg()); + if (Index == NumLanes / 2 - 2) { + break; + } + DelInstrs.push_back(LoadInstr); + } + auto LastLoadReg1 = PrevReg; + + // Create the final zip instruction to combine the results. + MachineInstrBuilder ZipInstr = + BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), + Root.getOperand(0).getReg()) + .addReg(LastLoadReg0) + .addReg(LastLoadReg1); + InsInstrs.push_back(ZipInstr); +} + CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7425,6 +7671,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; + // Load patterns + if (getLoadPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8680,6 +8930,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } + case AArch64MachineCombinerPattern::GATHER_LANE_i32: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 4); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i16: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 8); + break; + } + case 
AArch64MachineCombinerPattern::GATHER_LANE_i8: { + generateGatherPattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 16); + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 7c255da333e4b..02734866e7122 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, + + GATHER_LANE_i32, + GATHER_LANE_i16, + GATHER_LANE_i8 }; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir new file mode 100644 index 0000000000000..09eb18b0e3574 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir @@ -0,0 +1,364 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: split_loads_to_fpr128 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: split_loads_to_fpr128 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0 + ; CHECK-NEXT: 
[[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %2 + %8:fpr128 = LD1i32 %7, 2, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_ui +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: split_loads_to_fpr128_ui + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY3]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSui %0, 0 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + 
%7:fpr128 = LD1i32 %6, 1, killed %1 + %8:fpr128 = LD1i32 %7, 2, killed %2 + %9:fpr128 = LD1i32 %8, 3, killed %3 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i16 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i16 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY5]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY6]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY7]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY8]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common 
= COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:fpr16 = LDRHroX %0, killed %1, 0, 1 + %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub + %11:fpr128 = LD1i16 %10, 1, killed %2 + %12:fpr128 = LD1i16 %11, 2, killed %3 + %13:fpr128 = LD1i16 %12, 3, killed %4 + %14:fpr128 = LD1i16 %13, 4, killed %5 + %15:fpr128 = LD1i16 %14, 5, killed %6 + %16:fpr128 = LD1i16 %15, 6, killed %7 + %17:fpr128 = LD1i16 %16, 7, killed %8 + $q0 = COPY %17 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i16_ui +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i16_ui + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY3]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY4]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY5]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY6]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY7]] + ; CHECK-NEXT: 
[[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:fpr16 = LDRHui %0, 0 + %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub + %11:fpr128 = LD1i16 %10, 1, killed %1 + %12:fpr128 = LD1i16 %11, 2, killed %2 + %13:fpr128 = LD1i16 %12, 3, killed %3 + %14:fpr128 = LD1i16 %13, 4, killed %4 + %15:fpr128 = LD1i16 %14, 5, killed %5 + %16:fpr128 = LD1i16 %15, 6, killed %6 + %17:fpr128 = LD1i16 %16, 7, killed %7 + $q0 = COPY %17 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i8 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i8 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64common = COPY $x9 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gpr64common = COPY $x10 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr64common = COPY $x11 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr64common = COPY $x12 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr64common = COPY $x13 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gpr64common = COPY $x14 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gpr64common = COPY $x15 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gpr64common = COPY $x16 + ; CHECK-NEXT: 
[[LD_i8:%[0-9]+]]:fpr8 = LDRBroX [[COPY]], killed [[COPY1]], 0, 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i8]], %subreg.bsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i8 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i8 [[LD0_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i8 [[LD0_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: [[LD0_4:%[0-9]+]]:fpr128 = LD1i8 [[LD0_3]], 4, killed [[COPY5]] + ; CHECK-NEXT: [[LD0_5:%[0-9]+]]:fpr128 = LD1i8 [[LD0_4]], 5, killed [[COPY6]] + ; CHECK-NEXT: [[LD0_6:%[0-9]+]]:fpr128 = LD1i8 [[LD0_5]], 6, killed [[COPY7]] + ; CHECK-NEXT: [[LD0_7:%[0-9]+]]:fpr128 = LD1i8 [[LD0_6]], 7, killed [[COPY8]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr8 = LDRBui [[COPY9]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.bsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i8 [[SECOND_REG]], 1, killed [[COPY10]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i8 [[LD1_1]], 2, killed [[COPY11]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i8 [[LD1_2]], 3, killed [[COPY12]] + ; CHECK-NEXT: [[LD1_4:%[0-9]+]]:fpr128 = LD1i8 [[LD1_3]], 4, killed [[COPY13]] + ; CHECK-NEXT: [[LD1_5:%[0-9]+]]:fpr128 = LD1i8 [[LD1_4]], 5, killed [[COPY14]] + ; CHECK-NEXT: [[LD1_6:%[0-9]+]]:fpr128 = LD1i8 [[LD1_5]], 6, killed [[COPY15]] + ; CHECK-NEXT: [[LD1_7:%[0-9]+]]:fpr128 = LD1i8 [[LD1_6]], 7, killed [[COPY16]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_7]], [[LD1_7]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:gpr64common = COPY $x9 + %10:gpr64common = COPY $x10 + %11:gpr64common = COPY $x11 + %12:gpr64common = COPY $x12 + %13:gpr64common = COPY 
$x13 + %14:gpr64common = COPY $x14 + %15:gpr64common = COPY $x15 + %16:gpr64common = COPY $x16 + %17:fpr8 = LDRBroX %0, killed %1, 0, 0 + %18:fpr128 = SUBREG_TO_REG 0, killed %17, %subreg.bsub + %19:fpr128 = LD1i8 %18, 1, killed %2 + %20:fpr128 = LD1i8 %19, 2, killed %3 + %21:fpr128 = LD1i8 %20, 3, killed %4 + %22:fpr128 = LD1i8 %21, 4, killed %5 + %23:fpr128 = LD1i8 %22, 5, killed %6 + %24:fpr128 = LD1i8 %23, 6, killed %7 + %25:fpr128 = LD1i8 %24, 7, killed %8 + %26:fpr128 = LD1i8 %25, 8, killed %9 + %27:fpr128 = LD1i8 %26, 9, killed %10 + %28:fpr128 = LD1i8 %27, 10, killed %11 + %29:fpr128 = LD1i8 %28, 11, killed %12 + %30:fpr128 = LD1i8 %29, 12, killed %13 + %31:fpr128 = LD1i8 %30, 13, killed %14 + %32:fpr128 = LD1i8 %31, 14, killed %15 + %33:fpr128 = LD1i8 %32, 15, killed %16 + $q0 = COPY %33 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_missing_lanes +body: | + bb.0.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: negative_pattern_missing_lanes + ; CHECK: [[LD1:%.*]]:fpr128 = LDRQui $x1, 0 + ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = LD1i32 [[LD1]] + + %0:gpr64common = COPY $x0 + %1:fpr128 = LDRQui $x1, 0 + %2:fpr128 = LD1i32 %1, 3, %0 + $q0 = COPY %2 + RET_ReallyLR implicit $q0 + +--- +name: out_of_order_lanes +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: out_of_order_lanes + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY3]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = 
SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 2, killed %2 + %8:fpr128 = LD1i32 %7, 1, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_no_subreg_to_reg +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3 + + ; CHECK-LABEL: name: negative_pattern_no_subreg_to_reg + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[INITIAL_VEC:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[INITIAL_VEC]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY3]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:fpr128 = LDRQui %0, 0 + %5:fpr128 = LD1i32 %4, 1, killed %1 + %6:fpr128 = LD1i32 %5, 2, killed %2 + %7:fpr128 = LD1i32 %6, 3, killed %3 + $q0 = COPY %7 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_multiple_users +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: negative_pattern_multiple_users + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; 
CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: $q1 = COPY [[LD_LANE_2]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %2 + %8:fpr128 = LD1i32 %7, 2, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + $q1 = COPY %8 + RET_ReallyLR implicit $q0, implicit $q1 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index 7686740aec302..13434fabefa78 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -203,89 +203,93 @@ define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-NEXT: ldr s17, [sp, #40] -; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: ldr s17, [sp, #32] 
+; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 ; CHECK-NEXT: add x9, sp, #48 +; CHECK-NEXT: add x10, sp, #64 ; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: ldr s3, [sp, #32] -; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: ld1 { v17.s }[1], [x10] -; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-NEXT: ldr s16, [sp, #8] ; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 -; CHECK-NEXT: add x10, sp, #24 -; CHECK-NEXT: ld1 { v3.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #72 -; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: add x11, sp, #72 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ldr s18, [x10] +; CHECK-NEXT: add x9, sp, #80 +; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 +; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: ldr s16, [sp, #8] +; CHECK-NEXT: ldr s3, [sp, #96] +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #88 ; CHECK-NEXT: ldr s2, [sp] -; CHECK-NEXT: ld1 { v16.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #112 -; CHECK-NEXT: ldr s20, [sp, #136] ; CHECK-NEXT: mov v1.s[2], v5.s[0] -; CHECK-NEXT: ld1 { v17.s }[2], [x9] -; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ldr s5, [sp, #96] -; CHECK-NEXT: ld1 { v3.s }[2], [x9] +; CHECK-NEXT: ldr s5, [sp, #40] ; CHECK-NEXT: mov v0.s[2], v4.s[0] -; CHECK-NEXT: add x9, sp, #88 -; CHECK-NEXT: ldr s4, [sp, #104] -; CHECK-NEXT: ldr s19, [sp, #192] ; CHECK-NEXT: ld1 { v5.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #80 -; CHECK-NEXT: ld1 { v17.s }[3], [x9] -; CHECK-NEXT: mov v1.s[3], v7.s[0] -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: ld1 { v3.s }[3], [x10] -; CHECK-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-NEXT: ldr s7, [sp, #128] +; CHECK-NEXT: ldr s19, [x11] ; CHECK-NEXT: add x10, sp, #144 +; CHECK-NEXT: zip1 v4.2d, v17.2d, v18.2d +; CHECK-NEXT: add x11, sp, #160 +; CHECK-NEXT: ldr s18, [sp, #136] +; CHECK-NEXT: ld1 { v19.s }[1], [x9] ; CHECK-NEXT: mov v0.s[3], v6.s[0] -; 
CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ldr s6, [sp, #128] +; CHECK-NEXT: mov v1.s[3], v7.s[0] +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: ldr s7, [sp, #104] +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ld1 { v6.s }[1], [x10] +; CHECK-NEXT: zip1 v5.2d, v5.2d, v19.2d +; CHECK-NEXT: add x10, sp, #120 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: fmul v6.4s, v17.4s, v1.4s -; CHECK-NEXT: fmul v18.4s, v4.4s, v16.4s -; CHECK-NEXT: fmul v16.4s, v5.4s, v16.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add x10, sp, #208 -; CHECK-NEXT: ld1 { v7.s }[2], [x9] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ld1 { v19.s }[1], [x10] -; CHECK-NEXT: ld1 { v20.s }[1], [x9] +; CHECK-NEXT: ldr s17, [x11] ; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: add x10, sp, #184 -; CHECK-NEXT: fneg v6.4s, v6.4s -; CHECK-NEXT: fneg v18.4s, v18.4s -; CHECK-NEXT: fmla v16.4s, v2.4s, v4.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v17.4s -; CHECK-NEXT: ld1 { v7.s }[3], [x9] -; CHECK-NEXT: add x9, sp, #168 -; CHECK-NEXT: ld1 { v20.s }[2], [x9] -; CHECK-NEXT: ldr s4, [sp, #200] +; CHECK-NEXT: add x10, sp, #16 +; CHECK-NEXT: add x11, sp, #168 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ld1 { v2.s }[1], [x10] +; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: fmul v19.4s, v5.4s, v1.4s +; CHECK-NEXT: fmul v20.4s, v7.4s, v16.4s +; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s +; CHECK-NEXT: fmul v1.4s, v4.4s, v1.4s +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: ldr s21, [x11] +; CHECK-NEXT: zip1 v6.2d, v6.2d, v17.2d +; CHECK-NEXT: ldr s17, [sp, #192] +; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: add x10, sp, #208 +; CHECK-NEXT: ld1 { v21.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #216 -; CHECK-NEXT: fmla v6.4s, v0.4s, v3.4s -; CHECK-NEXT: fmla v18.4s, v2.4s, v5.4s -; CHECK-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-NEXT: fsub v0.4s, v7.4s, v1.4s -; 
CHECK-NEXT: fsub v1.4s, v19.4s, v16.4s -; CHECK-NEXT: ld1 { v20.s }[3], [x10] -; CHECK-NEXT: fadd v2.4s, v4.4s, v18.4s -; CHECK-NEXT: fadd v3.4s, v20.4s, v6.4s +; CHECK-NEXT: fneg v19.4s, v19.4s +; CHECK-NEXT: fneg v20.4s, v20.4s +; CHECK-NEXT: fmla v16.4s, v2.4s, v7.4s +; CHECK-NEXT: fmla v1.4s, v0.4s, v5.4s +; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: ldr s5, [sp, #200] +; CHECK-NEXT: zip1 v7.2d, v18.2d, v21.2d +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: fmla v19.4s, v0.4s, v4.4s +; CHECK-NEXT: fmla v20.4s, v2.4s, v3.4s +; CHECK-NEXT: fsub v0.4s, v6.4s, v1.4s +; CHECK-NEXT: fsub v1.4s, v17.4s, v16.4s +; CHECK-NEXT: fadd v2.4s, v7.4s, v19.4s +; CHECK-NEXT: fadd v3.4s, v5.4s, v20.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 -; CHECK-NEXT: ext v5.16b, v3.16b, v2.16b, #12 -; CHECK-NEXT: trn2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12 -; CHECK-NEXT: ext v5.16b, v3.16b, v5.16b, #8 +; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 ; CHECK-NEXT: rev64 v4.4s, v4.4s -; CHECK-NEXT: trn2 v2.4s, v4.4s, v5.4s -; CHECK-NEXT: zip2 v4.4s, v0.4s, v3.4s -; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8 -; CHECK-NEXT: mov v4.d[1], v2.d[0] +; CHECK-NEXT: trn2 v3.4s, v4.4s, v5.4s +; CHECK-NEXT: zip2 v4.4s, v0.4s, v2.4s +; CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ext v1.16b, v3.16b, v1.16b, #8 +; CHECK-NEXT: mov v4.d[1], v3.d[0] ; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: stp q4, q1, [x8, #16] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index acf15f1bd1178..e6f27b95d92c8 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -186,8 +186,9 @@ define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %p ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ld1 { v0.s 
}[1], [x1] -; CHECK-NEXT: ld1 { v0.s }[2], [x2] -; CHECK-NEXT: ld1 { v0.s }[3], [x3] +; CHECK-NEXT: ldr s1, [x2] +; CHECK-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %A = load <4 x i8>, ptr %ptrA %B = load <4 x i8>, ptr %ptrB diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index c6b8e41f9bdfd..4906e2e15e51c 100644 --- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll @@ -1431,6 +1431,7 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: add x9, sp, #16 ; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: add x10, sp, #40 ; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 @@ -1439,30 +1440,30 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] ; FULLFP16-NEXT: add x9, sp, #24 ; FULLFP16-NEXT: mov v0.h[2], v2.h[0] -; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #32 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: mov v0.h[3], v3.h[0] ; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] -; FULLFP16-NEXT: add x9, sp, #40 -; FULLFP16-NEXT: ldr h3, [sp, #72] -; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: ldr h2, [x10] ; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v2.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #56 ; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; FULLFP16-NEXT: mov v0.h[4], v4.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] -; FULLFP16-NEXT: add x9, sp, #56 -; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h -; FULLFP16-NEXT: mov v0.h[5], v5.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: ld1 { v2.h }[2], [x9] ; 
FULLFP16-NEXT: add x9, sp, #64 -; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v2.h }[3], [x9] +; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d +; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: mov v0.h[6], v6.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h ; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: str h2, [x8, #16] ; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h ; FULLFP16-NEXT: str q0, [x8] ; FULLFP16-NEXT: ret @@ -2012,6 +2013,7 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: add x9, sp, #16 ; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: add x10, sp, #40 ; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 @@ -2020,30 +2022,30 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] ; FULLFP16-NEXT: add x9, sp, #24 ; FULLFP16-NEXT: mov v0.h[2], v2.h[0] -; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #32 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: mov v0.h[3], v3.h[0] ; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] -; FULLFP16-NEXT: add x9, sp, #40 -; FULLFP16-NEXT: ldr h3, [sp, #72] -; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: ldr h2, [x10] ; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v2.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #56 ; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; FULLFP16-NEXT: mov v0.h[4], v4.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] -; FULLFP16-NEXT: add x9, sp, #56 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h -; FULLFP16-NEXT: mov v0.h[5], 
v5.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: ld1 { v2.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #64 -; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v2.h }[3], [x9] +; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d +; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: mov v0.h[6], v6.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h ; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: str h2, [x8, #16] ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h ; FULLFP16-NEXT: str q0, [x8] ; FULLFP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 4c28c90824028..ae2ef2649102e 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -2509,87 +2509,88 @@ define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) { ; ; CHECK-GI-LABEL: fshl_v7i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr s3, [sp, #48] -; CHECK-GI-NEXT: ldr s20, [sp, #56] -; CHECK-GI-NEXT: add x9, sp, #56 +; CHECK-GI-NEXT: ldr s17, [sp, #48] +; CHECK-GI-NEXT: add x8, sp, #56 +; CHECK-GI-NEXT: add x9, sp, #64 ; CHECK-GI-NEXT: ldr s4, [sp, #48] -; CHECK-GI-NEXT: ldr s7, [sp, #80] -; CHECK-GI-NEXT: mov w12, #-1 // =0xffffffff -; CHECK-GI-NEXT: ldr s21, [sp, #88] -; CHECK-GI-NEXT: mov v3.s[1], v20.s[0] -; CHECK-GI-NEXT: fmov s20, w12 -; CHECK-GI-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-GI-NEXT: ldr s17, [sp] -; CHECK-GI-NEXT: add x13, sp, #64 -; CHECK-GI-NEXT: mov v7.s[1], v21.s[0] +; CHECK-GI-NEXT: ldr s21, [sp, #56] +; CHECK-GI-NEXT: mov w10, #-1 // =0xffffffff +; CHECK-GI-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-GI-NEXT: ldr s20, [x9] +; CHECK-GI-NEXT: add x8, sp, #72 +; CHECK-GI-NEXT: mov v4.s[1], v21.s[0] ; CHECK-GI-NEXT: fmov s21, w7 +; CHECK-GI-NEXT: ldr s6, [sp] +; CHECK-GI-NEXT: ld1 { 
v20.s }[1], [x8] ; CHECK-GI-NEXT: ldr s19, [sp, #64] -; CHECK-GI-NEXT: mov w11, #31 // =0x1f -; CHECK-GI-NEXT: mov v20.s[1], w12 +; CHECK-GI-NEXT: ldr s7, [sp, #80] +; CHECK-GI-NEXT: ldr s22, [sp, #88] +; CHECK-GI-NEXT: mov w9, #31 // =0x1f +; CHECK-GI-NEXT: mov w11, #1 // =0x1 +; CHECK-GI-NEXT: mov v21.s[1], v6.s[0] +; CHECK-GI-NEXT: fmov s6, w9 ; CHECK-GI-NEXT: ldr s18, [sp, #96] -; CHECK-GI-NEXT: ld1 { v4.s }[2], [x13] -; CHECK-GI-NEXT: mov w13, #1 // =0x1 -; CHECK-GI-NEXT: mov v3.s[2], v19.s[0] -; CHECK-GI-NEXT: mov v21.s[1], v17.s[0] -; CHECK-GI-NEXT: fmov s17, w11 -; CHECK-GI-NEXT: fmov s19, w13 +; CHECK-GI-NEXT: zip1 v17.2d, v17.2d, v20.2d +; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: mov v7.s[1], v22.s[0] +; CHECK-GI-NEXT: mov v4.s[2], v19.s[0] +; CHECK-GI-NEXT: fmov s19, w11 ; CHECK-GI-NEXT: fmov s23, w0 -; CHECK-GI-NEXT: fmov s24, w11 -; CHECK-GI-NEXT: ldr s6, [sp, #8] +; CHECK-GI-NEXT: mov v6.s[1], w9 +; CHECK-GI-NEXT: fmov s24, w9 +; CHECK-GI-NEXT: ldr s2, [sp, #8] +; CHECK-GI-NEXT: mov v20.s[1], w10 ; CHECK-GI-NEXT: ldr s0, [sp, #24] ; CHECK-GI-NEXT: ldr s5, [sp, #32] +; CHECK-GI-NEXT: mov v19.s[1], w11 ; CHECK-GI-NEXT: mov v7.s[2], v18.s[0] -; CHECK-GI-NEXT: mov v17.s[1], w11 -; CHECK-GI-NEXT: mov v19.s[1], w13 -; CHECK-GI-NEXT: mov v20.s[2], w12 ; CHECK-GI-NEXT: ldr s16, [sp, #72] ; CHECK-GI-NEXT: mov v23.s[1], w1 ; CHECK-GI-NEXT: ldr s18, [sp, #80] -; CHECK-GI-NEXT: mov v21.s[2], v6.s[0] -; CHECK-GI-NEXT: mov v24.s[1], w11 +; CHECK-GI-NEXT: mov v21.s[2], v2.s[0] +; CHECK-GI-NEXT: mov v24.s[1], w9 ; CHECK-GI-NEXT: mov v0.s[1], v5.s[0] -; CHECK-GI-NEXT: fmov s6, w4 -; CHECK-GI-NEXT: add x10, sp, #88 +; CHECK-GI-NEXT: fmov s5, w4 +; CHECK-GI-NEXT: mov v20.s[2], w10 +; CHECK-GI-NEXT: add x8, sp, #88 ; CHECK-GI-NEXT: movi v22.4s, #31 -; CHECK-GI-NEXT: mov v3.s[3], v16.s[0] -; CHECK-GI-NEXT: mov v17.s[2], w11 -; CHECK-GI-NEXT: mov v19.s[2], w13 -; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: ldr s1, [sp, #40] -; CHECK-GI-NEXT: ld1 { v18.s }[1], 
[x10] -; CHECK-GI-NEXT: eor v5.16b, v7.16b, v20.16b +; CHECK-GI-NEXT: mov v4.s[3], v16.s[0] +; CHECK-GI-NEXT: mov v6.s[2], w9 +; CHECK-GI-NEXT: mov v19.s[2], w11 +; CHECK-GI-NEXT: ldr s1, [sp, #16] +; CHECK-GI-NEXT: ldr s3, [sp, #40] +; CHECK-GI-NEXT: ld1 { v18.s }[1], [x8] ; CHECK-GI-NEXT: mov v23.s[2], w2 -; CHECK-GI-NEXT: mov v6.s[1], w5 -; CHECK-GI-NEXT: add x8, sp, #72 -; CHECK-GI-NEXT: add x9, sp, #96 -; CHECK-GI-NEXT: mov v21.s[3], v2.s[0] -; CHECK-GI-NEXT: mov v24.s[2], w11 -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: ld1 { v4.s }[3], [x8] -; CHECK-GI-NEXT: bic v2.16b, v22.16b, v3.16b -; CHECK-GI-NEXT: ld1 { v18.s }[2], [x9] -; CHECK-GI-NEXT: and v1.16b, v5.16b, v17.16b +; CHECK-GI-NEXT: mov v5.s[1], w5 +; CHECK-GI-NEXT: add x8, sp, #96 +; CHECK-GI-NEXT: eor v2.16b, v7.16b, v20.16b +; CHECK-GI-NEXT: mov v21.s[3], v1.s[0] +; CHECK-GI-NEXT: mov v24.s[2], w9 +; CHECK-GI-NEXT: mov v0.s[2], v3.s[0] +; CHECK-GI-NEXT: bic v1.16b, v22.16b, v4.16b +; CHECK-GI-NEXT: ld1 { v18.s }[2], [x8] ; CHECK-GI-NEXT: neg v3.4s, v19.4s +; CHECK-GI-NEXT: and v4.16b, v17.16b, v22.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b ; CHECK-GI-NEXT: mov v23.s[3], w3 -; CHECK-GI-NEXT: mov v6.s[2], w6 -; CHECK-GI-NEXT: and v4.16b, v4.16b, v22.16b -; CHECK-GI-NEXT: ushr v5.4s, v21.4s, #1 -; CHECK-GI-NEXT: neg v2.4s, v2.4s -; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b +; CHECK-GI-NEXT: mov v5.s[2], w6 +; CHECK-GI-NEXT: ushr v6.4s, v21.4s, #1 ; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: neg v2.4s, v2.4s ; CHECK-GI-NEXT: ushl v3.4s, v23.4s, v4.4s -; CHECK-GI-NEXT: ushl v2.4s, v5.4s, v2.4s -; CHECK-GI-NEXT: ushl v4.4s, v6.4s, v7.4s -; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: orr v1.16b, v3.16b, v2.16b +; CHECK-GI-NEXT: ushl v1.4s, v6.4s, v1.4s +; CHECK-GI-NEXT: ushl v4.4s, v5.4s, v7.4s +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: orr v1.16b, v3.16b, 
v1.16b ; CHECK-GI-NEXT: orr v0.16b, v4.16b, v0.16b ; CHECK-GI-NEXT: mov s2, v1.s[1] ; CHECK-GI-NEXT: mov s3, v1.s[2] ; CHECK-GI-NEXT: mov s4, v1.s[3] +; CHECK-GI-NEXT: fmov w0, s1 ; CHECK-GI-NEXT: mov s5, v0.s[1] ; CHECK-GI-NEXT: mov s6, v0.s[2] -; CHECK-GI-NEXT: fmov w0, s1 ; CHECK-GI-NEXT: fmov w4, s0 ; CHECK-GI-NEXT: fmov w1, s2 ; CHECK-GI-NEXT: fmov w2, s3 diff --git a/llvm/test/CodeGen/AArch64/llvm.frexp.ll b/llvm/test/CodeGen/AArch64/llvm.frexp.ll index 2213aa1429dbd..4e1876db772ed 100644 --- a/llvm/test/CodeGen/AArch64/llvm.frexp.ll +++ b/llvm/test/CodeGen/AArch64/llvm.frexp.ll @@ -700,13 +700,14 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; CHECK-NEXT: ldr s1, [sp, #44] ; CHECK-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ld1 { v1.s }[1], [x19] ; CHECK-NEXT: mov v2.s[3], v0.s[0] -; CHECK-NEXT: ld1 { v1.s }[2], [x20] +; CHECK-NEXT: ld1 { v1.s }[1], [x19] +; CHECK-NEXT: ldr s0, [x20] +; CHECK-NEXT: ld1 { v0.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: ld1 { v1.s }[3], [x21] ; CHECK-NEXT: ldp x30, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: zip1 v1.2d, v1.2d, v0.2d +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret ; @@ -872,10 +873,11 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; CHECK-NEXT: bl frexpf ; CHECK-NEXT: ldr s0, [sp, #28] ; CHECK-NEXT: ld1 { v0.s }[1], [x19] -; CHECK-NEXT: ld1 { v0.s }[2], [x20] +; CHECK-NEXT: ldr s1, [x20] +; CHECK-NEXT: ld1 { v1.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ld1 { v0.s }[3], [x21] ; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll 
b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 4f0c4080aa0ce..9443004ea434b 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -6810,195 +6810,200 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-SD-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: .cfi_offset w29, -16 -; CHECK-SD-NEXT: ldr b5, [sp, #208] +; CHECK-SD-NEXT: ldr b0, [sp, #208] ; CHECK-SD-NEXT: add x8, sp, #216 -; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: add x9, sp, #272 +; CHECK-SD-NEXT: ldr b2, [sp, #80] ; CHECK-SD-NEXT: ldr b4, [sp, #976] -; CHECK-SD-NEXT: add x9, sp, #984 -; CHECK-SD-NEXT: add x12, sp, #328 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #224 -; CHECK-SD-NEXT: movi v1.16b, #1 -; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] -; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 -; CHECK-SD-NEXT: add x11, sp, #992 ; CHECK-SD-NEXT: ldr b6, [sp, #720] -; CHECK-SD-NEXT: ldr b7, [sp, #80] -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #224 +; CHECK-SD-NEXT: fmov s16, w0 +; CHECK-SD-NEXT: ldr b17, [sp, #848] +; CHECK-SD-NEXT: add x10, sp, #24 +; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 +; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #232 -; CHECK-SD-NEXT: add x13, sp, #88 -; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11] -; CHECK-SD-NEXT: ld1 { v7.b }[1], [x13] -; CHECK-SD-NEXT: add x13, sp, #856 -; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: add x14, sp, #1008 -; CHECK-SD-NEXT: add x15, sp, #872 -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-SD-NEXT: mov v16.b[1], w1 +; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #240 -; CHECK-SD-NEXT: add x16, sp, #888 -; CHECK-SD-NEXT: add x10, sp, #16 -; CHECK-SD-NEXT: add x9, sp, #24 -; CHECK-SD-NEXT: add x11, sp, #40 -; CHECK-SD-NEXT: 
movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-SD-NEXT: mov v16.b[2], w2 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: add x8, sp, #248 -; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-SD-NEXT: mov v16.b[3], w3 +; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #256 -; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #264 -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] -; CHECK-SD-NEXT: add x8, sp, #272 -; CHECK-SD-NEXT: ld1 { v5.b }[8], [x8] +; CHECK-SD-NEXT: mov v16.b[4], w4 +; CHECK-SD-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-SD-NEXT: ldr b1, [x9] ; CHECK-SD-NEXT: add x8, sp, #280 -; CHECK-SD-NEXT: mov v0.b[6], w6 -; CHECK-SD-NEXT: ld1 { v5.b }[9], [x8] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: mov v16.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #288 -; CHECK-SD-NEXT: mov v0.b[7], w7 -; CHECK-SD-NEXT: ld1 { v5.b }[10], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #296 -; CHECK-SD-NEXT: ld1 { v0.b }[8], [x10] -; CHECK-SD-NEXT: add x10, sp, #128 -; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] +; CHECK-SD-NEXT: mov v16.b[6], w6 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #304 -; CHECK-SD-NEXT: ld1 { v0.b }[9], [x9] -; CHECK-SD-NEXT: add x9, sp, #136 -; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8] +; CHECK-SD-NEXT: mov v16.b[7], w7 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] ; CHECK-SD-NEXT: add x8, sp, #312 -; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #320 -; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #32 -; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8] -; CHECK-SD-NEXT: add x8, sp, #144 -; CHECK-SD-NEXT: ld1 { v5.b }[15], [x12] -; CHECK-SD-NEXT: add x12, sp, #728 -; CHECK-SD-NEXT: ld1 { v6.b }[1], 
[x12] -; CHECK-SD-NEXT: add x12, sp, #1000 -; CHECK-SD-NEXT: ld1 { v0.b }[11], [x11] -; CHECK-SD-NEXT: ld1 { v4.b }[3], [x12] -; CHECK-SD-NEXT: add x12, sp, #736 -; CHECK-SD-NEXT: add x11, sp, #920 -; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b -; CHECK-SD-NEXT: ldr b5, [sp, #848] -; CHECK-SD-NEXT: ld1 { v6.b }[2], [x12] -; CHECK-SD-NEXT: add x12, sp, #48 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x13] -; CHECK-SD-NEXT: add x13, sp, #744 -; CHECK-SD-NEXT: ld1 { v4.b }[4], [x14] -; CHECK-SD-NEXT: add x14, sp, #96 -; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12] -; CHECK-SD-NEXT: ld1 { v6.b }[3], [x13] -; CHECK-SD-NEXT: add x13, sp, #864 -; CHECK-SD-NEXT: ld1 { v7.b }[2], [x14] -; CHECK-SD-NEXT: add x14, sp, #1016 -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13] -; CHECK-SD-NEXT: add x13, sp, #752 -; CHECK-SD-NEXT: ld1 { v4.b }[5], [x14] -; CHECK-SD-NEXT: add x14, sp, #104 -; CHECK-SD-NEXT: ld1 { v6.b }[4], [x13] -; CHECK-SD-NEXT: add x13, sp, #1024 -; CHECK-SD-NEXT: ld1 { v7.b }[3], [x14] -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x15] -; CHECK-SD-NEXT: add x15, sp, #760 -; CHECK-SD-NEXT: add x14, sp, #112 -; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13] -; CHECK-SD-NEXT: add x13, sp, #880 -; CHECK-SD-NEXT: ld1 { v6.b }[5], [x15] -; CHECK-SD-NEXT: add x15, sp, #1032 -; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14] -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x13] -; CHECK-SD-NEXT: add x14, sp, #768 -; CHECK-SD-NEXT: add x13, sp, #120 -; CHECK-SD-NEXT: ld1 { v4.b }[7], [x15] -; CHECK-SD-NEXT: add x15, sp, #1040 -; CHECK-SD-NEXT: ld1 { v6.b }[6], [x14] -; CHECK-SD-NEXT: ld1 { v7.b }[5], [x13] -; CHECK-SD-NEXT: add x13, sp, #776 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x16] -; CHECK-SD-NEXT: add x14, sp, #1048 -; CHECK-SD-NEXT: ld1 { v4.b }[8], [x15] -; CHECK-SD-NEXT: add x15, sp, #896 -; CHECK-SD-NEXT: ld1 { v6.b }[7], [x13] -; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-SD-NEXT: add x10, sp, #784 -; CHECK-SD-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-SD-NEXT: add x13, sp, #1056 -; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14] -; 
CHECK-SD-NEXT: add x14, sp, #904 -; CHECK-SD-NEXT: ld1 { v6.b }[8], [x10] -; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-SD-NEXT: add x9, sp, #792 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-SD-NEXT: add x10, sp, #1064 -; CHECK-SD-NEXT: ld1 { v4.b }[10], [x13] -; CHECK-SD-NEXT: add x13, sp, #912 -; CHECK-SD-NEXT: ld1 { v6.b }[9], [x9] -; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8] -; CHECK-SD-NEXT: add x9, sp, #800 -; CHECK-SD-NEXT: ld1 { v5.b }[8], [x13] +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #328 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #96 +; CHECK-SD-NEXT: add x9, sp, #144 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #104 +; CHECK-SD-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: movi v1.16b, #1 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #128 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #136 +; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] +; CHECK-SD-NEXT: ldr b3, [x9] ; CHECK-SD-NEXT: add x8, sp, #152 -; CHECK-SD-NEXT: ld1 { v4.b }[11], [x10] -; CHECK-SD-NEXT: add x10, sp, #1072 -; CHECK-SD-NEXT: ld1 { v6.b }[10], [x9] -; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8] -; CHECK-SD-NEXT: add x9, sp, #808 -; CHECK-SD-NEXT: ld1 { v5.b }[9], [x11] -; CHECK-SD-NEXT: add x8, sp, #56 -; CHECK-SD-NEXT: ld1 { v4.b }[12], [x10] -; CHECK-SD-NEXT: add x10, sp, #160 -; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[11], [x9] -; CHECK-SD-NEXT: add x9, sp, #928 -; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10] -; CHECK-SD-NEXT: add x10, sp, #1080 -; CHECK-SD-NEXT: ld1 { v5.b }[10], [x9] +; CHECK-SD-NEXT: add x9, sp, #984 +; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #160 +; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8] +; CHECK-SD-NEXT: add 
x8, sp, #168 +; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #176 +; CHECK-SD-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #184 +; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #192 +; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #200 +; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #992 +; CHECK-SD-NEXT: add x9, sp, #1040 +; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #1000 +; CHECK-SD-NEXT: zip1 v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #1008 +; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #1016 +; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #1024 +; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #1032 +; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-SD-NEXT: ldr b5, [x9] +; CHECK-SD-NEXT: add x8, sp, #1048 +; CHECK-SD-NEXT: add x9, sp, #728 +; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #1056 +; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #1064 +; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #1072 +; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #1080 +; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #1088 +; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #1096 +; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #736 +; CHECK-SD-NEXT: add x9, sp, #784 +; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #744 +; CHECK-SD-NEXT: zip1 v4.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #752 +; CHECK-SD-NEXT: sdot v19.4s, v4.16b, v1.16b +; CHECK-SD-NEXT: sdot v5.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v6.b 
}[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #760 +; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #768 +; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #776 +; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8] +; CHECK-SD-NEXT: ldr b7, [x9] +; CHECK-SD-NEXT: add x8, sp, #792 +; CHECK-SD-NEXT: add x9, sp, #856 +; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #800 +; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #808 +; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #816 -; CHECK-SD-NEXT: ld1 { v4.b }[13], [x10] -; CHECK-SD-NEXT: add x9, sp, #168 -; CHECK-SD-NEXT: add x10, sp, #176 -; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8] -; CHECK-SD-NEXT: add x8, sp, #936 -; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9] -; CHECK-SD-NEXT: add x9, sp, #1088 -; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] -; CHECK-SD-NEXT: add x8, sp, #64 -; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9] -; CHECK-SD-NEXT: add x9, sp, #824 -; CHECK-SD-NEXT: ld1 { v0.b }[14], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[13], [x9] -; CHECK-SD-NEXT: add x9, sp, #944 -; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10] -; CHECK-SD-NEXT: add x10, sp, #1096 -; CHECK-SD-NEXT: ld1 { v5.b }[12], [x9] +; CHECK-SD-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #824 +; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #832 -; CHECK-SD-NEXT: ld1 { v4.b }[15], [x10] -; CHECK-SD-NEXT: add x9, sp, #184 -; CHECK-SD-NEXT: add x10, sp, #72 -; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #952 -; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #840 -; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10] -; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b -; CHECK-SD-NEXT: add x9, sp, #192 -; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8] +; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v17.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #864 +; 
CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9] +; CHECK-SD-NEXT: add x9, sp, #912 +; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #872 +; CHECK-SD-NEXT: zip1 v0.2d, v6.2d, v7.2d +; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10] +; CHECK-SD-NEXT: ld1 { v17.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #880 +; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v17.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #888 +; CHECK-SD-NEXT: ld1 { v17.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #896 +; CHECK-SD-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #904 +; CHECK-SD-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-SD-NEXT: ldr b18, [x9] +; CHECK-SD-NEXT: add x8, sp, #920 +; CHECK-SD-NEXT: ld1 { v18.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 +; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8] +; CHECK-SD-NEXT: add x8, sp, #928 +; CHECK-SD-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8] +; CHECK-SD-NEXT: add x8, sp, #936 +; CHECK-SD-NEXT: ld1 { v18.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8] +; CHECK-SD-NEXT: add x8, sp, #944 +; CHECK-SD-NEXT: ld1 { v18.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8] +; CHECK-SD-NEXT: add x8, sp, #952 +; CHECK-SD-NEXT: ld1 { v18.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #64 +; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8] ; CHECK-SD-NEXT: add x8, sp, #960 -; CHECK-SD-NEXT: ld1 { v7.b }[14], [x9] -; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b -; CHECK-SD-NEXT: add x8, sp, #200 -; CHECK-SD-NEXT: add x9, sp, #968 -; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b -; CHECK-SD-NEXT: ld1 { v7.b }[15], [x8] -; CHECK-SD-NEXT: ld1 { v5.b }[15], [x9] -; CHECK-SD-NEXT: sdot v3.4s, v7.16b, v1.16b -; CHECK-SD-NEXT: sdot v2.4s, v5.16b, v1.16b -; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s +; CHECK-SD-NEXT: ld1 { v18.b }[6], [x8] +; CHECK-SD-NEXT: 
add x8, sp, #72 +; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8] +; CHECK-SD-NEXT: add x8, sp, #968 +; CHECK-SD-NEXT: ld1 { v18.b }[7], [x8] +; CHECK-SD-NEXT: sdot v5.4s, v16.16b, v1.16b +; CHECK-SD-NEXT: zip1 v0.2d, v17.2d, v18.2d +; CHECK-SD-NEXT: sdot v5.4s, v2.16b, v1.16b +; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: add v0.4s, v5.4s, v19.4s ; CHECK-SD-NEXT: addv s0, v0.4s ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll index f8ba150a0405f..f7a87ae340a73 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal.ll @@ -683,41 +683,43 @@ define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) { ; ; CHECK-BE-LABEL: test_stnp_v17f32: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 +; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-BE-NEXT: ldr s16, [sp, #36] +; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 ; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-BE-NEXT: ldr s17, [sp, #4] -; CHECK-BE-NEXT: add x8, sp, #44 -; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] +; CHECK-BE-NEXT: add x8, sp, #12 +; CHECK-BE-NEXT: add x9, sp, #20 +; CHECK-BE-NEXT: ldr s16, [sp, #36] ; CHECK-BE-NEXT: mov v0.s[1], v1.s[0] +; CHECK-BE-NEXT: ldr s1, [sp, #4] +; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] +; CHECK-BE-NEXT: add x10, sp, #52 ; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6 ; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-BE-NEXT: // kill: def $s7 killed $s7 def $q7 ; CHECK-BE-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-BE-NEXT: ldr s1, [sp, #68] -; CHECK-BE-NEXT: ld1 { v16.s }[1], [x8] -; CHECK-BE-NEXT: add x8, sp, #12 -; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] -; CHECK-BE-NEXT: add x8, sp, #52 -; CHECK-BE-NEXT: 
str s1, [x0, #64] -; CHECK-BE-NEXT: ld1 { v16.s }[2], [x8] -; CHECK-BE-NEXT: add x8, sp, #20 +; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8] +; CHECK-BE-NEXT: ldr s5, [x9] +; CHECK-BE-NEXT: add x8, sp, #28 +; CHECK-BE-NEXT: add x9, sp, #44 +; CHECK-BE-NEXT: ld1 { v5.s }[1], [x8] +; CHECK-BE-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-BE-NEXT: ldr s17, [x10] +; CHECK-BE-NEXT: add x8, sp, #60 ; CHECK-BE-NEXT: mov v4.s[2], v6.s[0] ; CHECK-BE-NEXT: mov v0.s[2], v2.s[0] -; CHECK-BE-NEXT: ld1 { v17.s }[2], [x8] -; CHECK-BE-NEXT: add x8, sp, #60 -; CHECK-BE-NEXT: ld1 { v16.s }[3], [x8] -; CHECK-BE-NEXT: add x8, sp, #28 -; CHECK-BE-NEXT: ld1 { v17.s }[3], [x8] +; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-BE-NEXT: ldr s2, [sp, #68] +; CHECK-BE-NEXT: add x8, x0, #32 +; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v5.2d +; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: str s2, [x0, #64] +; CHECK-BE-NEXT: zip1 v5.2d, v16.2d, v17.2d ; CHECK-BE-NEXT: mov v4.s[3], v7.s[0] -; CHECK-BE-NEXT: add x8, x0, #48 ; CHECK-BE-NEXT: mov v0.s[3], v3.s[0] -; CHECK-BE-NEXT: st1 { v16.4s }, [x8] -; CHECK-BE-NEXT: add x8, x0, #32 -; CHECK-BE-NEXT: st1 { v17.4s }, [x8] +; CHECK-BE-NEXT: st1 { v1.4s }, [x8] ; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: st1 { v5.4s }, [x9] ; CHECK-BE-NEXT: st1 { v4.4s }, [x8] ; CHECK-BE-NEXT: st1 { v0.4s }, [x0] ; CHECK-BE-NEXT: ret From 661cbd5a5254de22ba87a49e89f54b30e2874fb3 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 17 Jul 2025 11:27:47 -0500 Subject: [PATCH 200/813] [utils][TableGen] Make some non-bitmask enums iterable (#148647) Additionally, add sentinel values ::First_ and ::Last_ to each one of those enums. This will allow using `enum_seq_inclusive` to generate the list of enum-typed values of any generated scoped (non-bitmask) enum. 
--- llvm/test/TableGen/directive1.td | 25 +++++++++++++++++++ llvm/test/TableGen/directive2.td | 25 +++++++++++++++++++ .../OpenMPDirectiveNameParserTest.cpp | 21 +++++----------- .../utils/TableGen/Basic/DirectiveEmitter.cpp | 22 ++++++++++++++-- 4 files changed, 76 insertions(+), 17 deletions(-) diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td index 1d2bd51204e4f..3eda077eeabf7 100644 --- a/llvm/test/TableGen/directive1.td +++ b/llvm/test/TableGen/directive1.td @@ -53,6 +53,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: #include "llvm/ADT/ArrayRef.h" // CHECK-NEXT: #include "llvm/ADT/BitmaskEnum.h" +// CHECK-NEXT: #include "llvm/ADT/Sequence.h" // CHECK-NEXT: #include "llvm/ADT/StringRef.h" // CHECK-NEXT: #include "llvm/Frontend/Directive/Spelling.h" // CHECK-NEXT: #include "llvm/Support/Compiler.h" @@ -66,22 +67,26 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: enum class Association { // CHECK-NEXT: Block, +// CHECK-NEXT: First_ = Block, // CHECK-NEXT: Declaration, // CHECK-NEXT: Delimited, // CHECK-NEXT: Loop, // CHECK-NEXT: None, // CHECK-NEXT: Separating, +// CHECK-NEXT: Last_ = Separating, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Association_enumSize = 6; // CHECK-EMPTY: // CHECK-NEXT: enum class Category { // CHECK-NEXT: Declarative, +// CHECK-NEXT: First_ = Declarative, // CHECK-NEXT: Executable, // CHECK-NEXT: Informational, // CHECK-NEXT: Meta, // CHECK-NEXT: Subsidiary, // CHECK-NEXT: Utility, +// CHECK-NEXT: Last_ = Utility, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Category_enumSize = 6; @@ -96,6 +101,8 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: enum class Directive { // CHECK-NEXT: TDLD_dira, +// CHECK-NEXT: First_ = TDLD_dira, +// CHECK-NEXT: Last_ = TDLD_dira, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t 
Directive_enumSize = 1; @@ -104,8 +111,10 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: enum class Clause { // CHECK-NEXT: TDLC_clausea, +// CHECK-NEXT: First_ = TDLC_clausea, // CHECK-NEXT: TDLC_clauseb, // CHECK-NEXT: TDLC_clausec, +// CHECK-NEXT: Last_ = TDLC_clausec, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Clause_enumSize = 3; @@ -151,6 +160,22 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: LLVM_ABI StringRef getTdlAKindName(AKind x); // CHECK-EMPTY: // CHECK-NEXT: } // namespace tdl +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; // CHECK-NEXT: } // namespace llvm // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td index 3a64bb3900a31..a25197c3efd93 100644 --- a/llvm/test/TableGen/directive2.td +++ b/llvm/test/TableGen/directive2.td @@ -46,6 +46,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: #define LLVM_Tdl_INC // CHECK-EMPTY: // CHECK-NEXT: #include "llvm/ADT/ArrayRef.h" +// CHECK-NEXT: #include "llvm/ADT/Sequence.h" // CHECK-NEXT: #include "llvm/ADT/StringRef.h" // CHECK-NEXT: #include "llvm/Frontend/Directive/Spelling.h" // CHECK-NEXT: #include "llvm/Support/Compiler.h" @@ -57,22 +58,26 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: enum class Association { // CHECK-NEXT: Block, +// CHECK-NEXT: 
First_ = Block, // CHECK-NEXT: Declaration, // CHECK-NEXT: Delimited, // CHECK-NEXT: Loop, // CHECK-NEXT: None, // CHECK-NEXT: Separating, +// CHECK-NEXT: Last_ = Separating, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Association_enumSize = 6; // CHECK-EMPTY: // CHECK-NEXT: enum class Category { // CHECK-NEXT: Declarative, +// CHECK-NEXT: First_ = Declarative, // CHECK-NEXT: Executable, // CHECK-NEXT: Informational, // CHECK-NEXT: Meta, // CHECK-NEXT: Subsidiary, // CHECK-NEXT: Utility, +// CHECK-NEXT: Last_ = Utility, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Category_enumSize = 6; @@ -87,15 +92,19 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-EMPTY: // CHECK-NEXT: enum class Directive { // CHECK-NEXT: TDLD_dira, +// CHECK-NEXT: First_ = TDLD_dira, +// CHECK-NEXT: Last_ = TDLD_dira, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Directive_enumSize = 1; // CHECK-EMPTY: // CHECK-NEXT: enum class Clause { // CHECK-NEXT: TDLC_clausea, +// CHECK-NEXT: First_ = TDLC_clausea, // CHECK-NEXT: TDLC_clauseb, // CHECK-NEXT: TDLC_clausec, // CHECK-NEXT: TDLC_claused, +// CHECK-NEXT: Last_ = TDLC_claused, // CHECK-NEXT: }; // CHECK-EMPTY: // CHECK-NEXT: static constexpr std::size_t Clause_enumSize = 4; @@ -124,6 +133,22 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D); // CHECK-NEXT: LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D); // CHECK-NEXT: } // namespace tdl +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits { +// CHECK-NEXT: static constexpr bool 
is_iterable = true; +// CHECK-NEXT: }; +// CHECK-EMPTY: +// CHECK-NEXT: template <> struct enum_iteration_traits { +// CHECK-NEXT: static constexpr bool is_iterable = true; +// CHECK-NEXT: }; // CHECK-NEXT: } // namespace llvm // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp b/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp index 0363a08cc0f03..10329820bef76 100644 --- a/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp +++ b/llvm/unittests/Frontend/OpenMPDirectiveNameParserTest.cpp @@ -48,12 +48,6 @@ static std::string &prepareParamName(std::string &Name) { return Name; } -namespace llvm { -template <> struct enum_iteration_traits { - static constexpr bool is_iterable = true; -}; -} // namespace llvm - // Test tokenizing. class Tokenize : public testing::TestWithParam {}; @@ -87,12 +81,10 @@ getParamName1(const testing::TestParamInfo &Info) { return prepareParamName(Name); } -INSTANTIATE_TEST_SUITE_P( - DirectiveNameParserTest, Tokenize, - testing::ValuesIn( - llvm::enum_seq(static_cast(0), - static_cast(omp::Directive_enumSize))), - getParamName1); +INSTANTIATE_TEST_SUITE_P(DirectiveNameParserTest, Tokenize, + testing::ValuesIn(llvm::enum_seq_inclusive( + omp::Directive::First_, omp::Directive::Last_)), + getParamName1); // Test parsing of valid names. 
@@ -131,9 +123,8 @@ getParamName2(const testing::TestParamInfo &Info) { INSTANTIATE_TEST_SUITE_P( DirectiveNameParserTest, ParseValid, - testing::Combine(testing::ValuesIn(llvm::enum_seq( - static_cast(0), - static_cast(omp::Directive_enumSize))), + testing::Combine(testing::ValuesIn(llvm::enum_seq_inclusive( + omp::Directive::First_, omp::Directive::Last_)), testing::ValuesIn(omp::getOpenMPVersions())), getParamName2); diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp index 177eecebce9a5..f0e23690367db 100644 --- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp +++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp @@ -106,8 +106,16 @@ static void generateEnumClass(ArrayRef Records, raw_ostream &OS, bool ExportEnums) { OS << "\n"; OS << "enum class " << Enum << " {\n"; - for (const Record *R : Records) { - OS << " " << getIdentifierName(R, Prefix) << ",\n"; + if (!Records.empty()) { + std::string N; + for (auto [I, R] : llvm::enumerate(Records)) { + N = getIdentifierName(R, Prefix); + OS << " " << N << ",\n"; + // Make the sentinel names less likely to conflict with actual names... + if (I == 0) + OS << " First_ = " << N << ",\n"; + } + OS << " Last_ = " << N << ",\n"; } OS << "};\n"; OS << "\n"; @@ -282,6 +290,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { if (DirLang.hasEnableBitmaskEnumInNamespace()) OS << "#include \"llvm/ADT/BitmaskEnum.h\"\n"; + OS << "#include \"llvm/ADT/Sequence.h\"\n"; OS << "#include \"llvm/ADT/StringRef.h\"\n"; OS << "#include \"llvm/Frontend/Directive/Spelling.h\"\n"; OS << "#include \"llvm/Support/Compiler.h\"\n"; @@ -375,6 +384,15 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { for (auto Ns : reverse(Namespaces)) OS << "} // namespace " << Ns << "\n"; + // These specializations need to be in ::llvm. 
+ for (StringRef Enum : {"Association", "Category", "Directive", "Clause"}) { + OS << "\n"; + OS << "template <> struct enum_iteration_traits<" + << DirLang.getCppNamespace() << "::" << Enum << "> {\n"; + OS << " static constexpr bool is_iterable = true;\n"; + OS << "};\n"; + } + OS << "} // namespace llvm\n"; OS << "#endif // LLVM_" << Lang << "_INC\n"; From 0dae924c1f668f74370b642ba91f818b728aca40 Mon Sep 17 00:00:00 2001 From: delaram-talaashrafi Date: Thu, 17 Jul 2025 12:38:02 -0400 Subject: [PATCH 201/813] [openacc][flang] Support two type bindName representation in acc routine (#149147) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on the OpenACC specification — which states that if the bind name is given as an identifier it should be resolved according to the compiled language, and if given as a string it should be used unmodified — we introduce two distinct `bindName` representations for `acc routine` to handle each case appropriately: one as an array of `SymbolRefAttr` for identifiers and another as an array of `StringAttr` for strings. To ensure correct correspondence between bind names and devices, this patch also introduces two separate sets of device attributes. The routine operation is extended accordingly, along with the necessary updates to the OpenACC dialect and its lowering. 
--- flang/lib/Lower/OpenACC.cpp | 102 +++++++++++----- flang/test/Lower/OpenACC/acc-routine.f90 | 7 +- flang/test/Lower/OpenACC/acc-routine03.f90 | 2 +- mlir/include/mlir/Dialect/OpenACC/OpenACC.h | 1 + .../mlir/Dialect/OpenACC/OpenACCOps.td | 12 +- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 112 ++++++++++++++---- .../Dialect/OpenACC/OpenACCOpsTest.cpp | 44 +++++-- 7 files changed, 208 insertions(+), 72 deletions(-) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 25682cba5620e..51eb33dec186b 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -4414,10 +4414,34 @@ getAttributeValueByDeviceType(llvm::SmallVector &attributes, return std::nullopt; } +// Helper function to extract string value from bind name variant +static std::optional getBindNameStringValue( + const std::optional> + &bindNameValue) { + if (!bindNameValue.has_value()) + return std::nullopt; + + return std::visit( + [](const auto &attr) -> std::optional { + if constexpr (std::is_same_v, + mlir::StringAttr>) { + return attr.getValue(); + } else if constexpr (std::is_same_v, + mlir::SymbolRefAttr>) { + return attr.getLeafReference(); + } else { + return std::nullopt; + } + }, + bindNameValue.value()); +} + static bool compareDeviceTypeInfo( mlir::acc::RoutineOp op, - llvm::SmallVector &bindNameArrayAttr, - llvm::SmallVector &bindNameDeviceTypeArrayAttr, + llvm::SmallVector &bindIdNameArrayAttr, + llvm::SmallVector &bindStrNameArrayAttr, + llvm::SmallVector &bindIdNameDeviceTypeArrayAttr, + llvm::SmallVector &bindStrNameDeviceTypeArrayAttr, llvm::SmallVector &gangArrayAttr, llvm::SmallVector &gangDimArrayAttr, llvm::SmallVector &gangDimDeviceTypeArrayAttr, @@ -4427,9 +4451,13 @@ static bool compareDeviceTypeInfo( for (uint32_t dtypeInt = 0; dtypeInt != mlir::acc::getMaxEnumValForDeviceType(); ++dtypeInt) { auto dtype = static_cast(dtypeInt); - if (op.getBindNameValue(dtype) != - getAttributeValueByDeviceType( - bindNameArrayAttr, 
bindNameDeviceTypeArrayAttr, dtype)) + auto bindNameValue = getBindNameStringValue(op.getBindNameValue(dtype)); + if (bindNameValue != + getAttributeValueByDeviceType( + bindIdNameArrayAttr, bindIdNameDeviceTypeArrayAttr, dtype) && + bindNameValue != + getAttributeValueByDeviceType( + bindStrNameArrayAttr, bindStrNameDeviceTypeArrayAttr, dtype)) return false; if (op.hasGang(dtype) != hasDeviceType(gangArrayAttr, dtype)) return false; @@ -4476,8 +4504,10 @@ getArrayAttrOrNull(fir::FirOpBuilder &builder, void createOpenACCRoutineConstruct( Fortran::lower::AbstractConverter &converter, mlir::Location loc, mlir::ModuleOp mod, mlir::func::FuncOp funcOp, std::string funcName, - bool hasNohost, llvm::SmallVector &bindNames, - llvm::SmallVector &bindNameDeviceTypes, + bool hasNohost, llvm::SmallVector &bindIdNames, + llvm::SmallVector &bindStrNames, + llvm::SmallVector &bindIdNameDeviceTypes, + llvm::SmallVector &bindStrNameDeviceTypes, llvm::SmallVector &gangDeviceTypes, llvm::SmallVector &gangDimValues, llvm::SmallVector &gangDimDeviceTypes, @@ -4490,7 +4520,8 @@ void createOpenACCRoutineConstruct( 0) { // If the routine is already specified with the same clauses, just skip // the operation creation. 
- if (compareDeviceTypeInfo(routineOp, bindNames, bindNameDeviceTypes, + if (compareDeviceTypeInfo(routineOp, bindIdNames, bindStrNames, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, gangDeviceTypes, gangDimValues, gangDimDeviceTypes, seqDeviceTypes, workerDeviceTypes, vectorDeviceTypes) && @@ -4507,8 +4538,10 @@ void createOpenACCRoutineConstruct( modBuilder.create( loc, routineOpStr, mlir::SymbolRefAttr::get(builder.getContext(), funcName), - getArrayAttrOrNull(builder, bindNames), - getArrayAttrOrNull(builder, bindNameDeviceTypes), + getArrayAttrOrNull(builder, bindIdNames), + getArrayAttrOrNull(builder, bindStrNames), + getArrayAttrOrNull(builder, bindIdNameDeviceTypes), + getArrayAttrOrNull(builder, bindStrNameDeviceTypes), getArrayAttrOrNull(builder, workerDeviceTypes), getArrayAttrOrNull(builder, vectorDeviceTypes), getArrayAttrOrNull(builder, seqDeviceTypes), hasNohost, @@ -4525,8 +4558,10 @@ static void interpretRoutineDeviceInfo( llvm::SmallVector &seqDeviceTypes, llvm::SmallVector &vectorDeviceTypes, llvm::SmallVector &workerDeviceTypes, - llvm::SmallVector &bindNameDeviceTypes, - llvm::SmallVector &bindNames, + llvm::SmallVector &bindIdNameDeviceTypes, + llvm::SmallVector &bindStrNameDeviceTypes, + llvm::SmallVector &bindIdNames, + llvm::SmallVector &bindStrNames, llvm::SmallVector &gangDeviceTypes, llvm::SmallVector &gangDimValues, llvm::SmallVector &gangDimDeviceTypes) { @@ -4559,16 +4594,18 @@ static void interpretRoutineDeviceInfo( if (dinfo.bindNameOpt().has_value()) { const auto &bindName = dinfo.bindNameOpt().value(); mlir::Attribute bindNameAttr; - if (const auto &bindStr{std::get_if(&bindName)}) { + if (const auto &bindSym{ + std::get_if(&bindName)}) { + bindNameAttr = builder.getSymbolRefAttr(converter.mangleName(*bindSym)); + bindIdNames.push_back(bindNameAttr); + bindIdNameDeviceTypes.push_back(getDeviceTypeAttr()); + } else if (const auto &bindStr{std::get_if(&bindName)}) { bindNameAttr = builder.getStringAttr(*bindStr); - } else if 
(const auto &bindSym{ - std::get_if(&bindName)}) { - bindNameAttr = builder.getStringAttr(converter.mangleName(*bindSym)); + bindStrNames.push_back(bindNameAttr); + bindStrNameDeviceTypes.push_back(getDeviceTypeAttr()); } else { llvm_unreachable("Unsupported bind name type"); } - bindNames.push_back(bindNameAttr); - bindNameDeviceTypes.push_back(getDeviceTypeAttr()); } } @@ -4584,8 +4621,9 @@ void Fortran::lower::genOpenACCRoutineConstruct( bool hasNohost{false}; llvm::SmallVector seqDeviceTypes, vectorDeviceTypes, - workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimDeviceTypes, gangDimValues; + workerDeviceTypes, bindIdNameDeviceTypes, bindStrNameDeviceTypes, + bindIdNames, bindStrNames, gangDeviceTypes, gangDimDeviceTypes, + gangDimValues; for (const Fortran::semantics::OpenACCRoutineInfo &info : routineInfos) { // Device Independent Attributes @@ -4594,24 +4632,26 @@ void Fortran::lower::genOpenACCRoutineConstruct( } // Note: Device Independent Attributes are set to the // none device type in `info`. 
- interpretRoutineDeviceInfo(converter, info, seqDeviceTypes, - vectorDeviceTypes, workerDeviceTypes, - bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimValues, gangDimDeviceTypes); + interpretRoutineDeviceInfo( + converter, info, seqDeviceTypes, vectorDeviceTypes, workerDeviceTypes, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, bindIdNames, + bindStrNames, gangDeviceTypes, gangDimValues, gangDimDeviceTypes); // Device Dependent Attributes for (const Fortran::semantics::OpenACCRoutineDeviceTypeInfo &dinfo : info.deviceTypeInfos()) { - interpretRoutineDeviceInfo( - converter, dinfo, seqDeviceTypes, vectorDeviceTypes, - workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes, - gangDimValues, gangDimDeviceTypes); + interpretRoutineDeviceInfo(converter, dinfo, seqDeviceTypes, + vectorDeviceTypes, workerDeviceTypes, + bindIdNameDeviceTypes, bindStrNameDeviceTypes, + bindIdNames, bindStrNames, gangDeviceTypes, + gangDimValues, gangDimDeviceTypes); } } createOpenACCRoutineConstruct( - converter, loc, mod, funcOp, funcName, hasNohost, bindNames, - bindNameDeviceTypes, gangDeviceTypes, gangDimValues, gangDimDeviceTypes, - seqDeviceTypes, workerDeviceTypes, vectorDeviceTypes); + converter, loc, mod, funcOp, funcName, hasNohost, bindIdNames, + bindStrNames, bindIdNameDeviceTypes, bindStrNameDeviceTypes, + gangDeviceTypes, gangDimValues, gangDimDeviceTypes, seqDeviceTypes, + workerDeviceTypes, vectorDeviceTypes); } static void diff --git a/flang/test/Lower/OpenACC/acc-routine.f90 b/flang/test/Lower/OpenACC/acc-routine.f90 index 789f3a57e1f79..1a63b4120235c 100644 --- a/flang/test/Lower/OpenACC/acc-routine.f90 +++ b/flang/test/Lower/OpenACC/acc-routine.f90 @@ -2,13 +2,14 @@ ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s -! CHECK: acc.routine @[[r14:.*]] func(@_QPacc_routine19) bind("_QPacc_routine17" [#acc.device_type], "_QPacc_routine17" [#acc.device_type], "_QPacc_routine16" [#acc.device_type]) -! 
CHECK: acc.routine @[[r13:.*]] func(@_QPacc_routine18) bind("_QPacc_routine17" [#acc.device_type], "_QPacc_routine16" [#acc.device_type]) +! CHECK: acc.routine @[[r14:.*]] func(@_QPacc_routine19) bind(@_QPacc_routine17 [#acc.device_type], @_QPacc_routine17 +! [#acc.device_type], @_QPacc_routine16 [#acc.device_type]) +! CHECK: acc.routine @[[r13:.*]] func(@_QPacc_routine18) bind(@_QPacc_routine17 [#acc.device_type], @_QPacc_routine16 [#acc.device_type]) ! CHECK: acc.routine @[[r12:.*]] func(@_QPacc_routine17) worker ([#acc.device_type]) vector ([#acc.device_type]) ! CHECK: acc.routine @[[r11:.*]] func(@_QPacc_routine16) gang([#acc.device_type]) seq ([#acc.device_type]) ! CHECK: acc.routine @[[r10:.*]] func(@_QPacc_routine11) seq ! CHECK: acc.routine @[[r09:.*]] func(@_QPacc_routine10) seq -! CHECK: acc.routine @[[r08:.*]] func(@_QPacc_routine9) bind("_QPacc_routine9a") +! CHECK: acc.routine @[[r08:.*]] func(@_QPacc_routine9) bind(@_QPacc_routine9a) ! CHECK: acc.routine @[[r07:.*]] func(@_QPacc_routine8) bind("routine8_") ! CHECK: acc.routine @[[r06:.*]] func(@_QPacc_routine7) gang(dim: 1 : i64) ! CHECK: acc.routine @[[r05:.*]] func(@_QPacc_routine6) nohost diff --git a/flang/test/Lower/OpenACC/acc-routine03.f90 b/flang/test/Lower/OpenACC/acc-routine03.f90 index 85e4ef580f983..ddd6bda0367e4 100644 --- a/flang/test/Lower/OpenACC/acc-routine03.f90 +++ b/flang/test/Lower/OpenACC/acc-routine03.f90 @@ -30,6 +30,6 @@ subroutine sub2(a) end subroutine ! CHECK: acc.routine @acc_routine_1 func(@_QPsub2) worker nohost -! CHECK: acc.routine @acc_routine_0 func(@_QPsub1) bind("_QPsub2") worker +! CHECK: acc.routine @acc_routine_0 func(@_QPsub1) bind(@_QPsub2) worker ! CHECK: func.func @_QPsub1(%arg0: !fir.box> {fir.bindc_name = "a"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} ! 
CHECK: func.func @_QPsub2(%arg0: !fir.box> {fir.bindc_name = "a"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>} diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h index 4eb666239d4e4..8f87235fcd237 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h @@ -29,6 +29,7 @@ #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +#include #define GET_TYPEDEF_CLASSES #include "mlir/Dialect/OpenACC/OpenACCOpsTypes.h.inc" diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 66378f116784e..96b9adcc53b3c 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2772,8 +2772,10 @@ def OpenACC_RoutineOp : OpenACC_Op<"routine", [IsolatedFromAbove]> { }]; let arguments = (ins SymbolNameAttr:$sym_name, SymbolRefAttr:$func_name, - OptionalAttr:$bindName, - OptionalAttr:$bindNameDeviceType, + OptionalAttr:$bindIdName, + OptionalAttr:$bindStrName, + OptionalAttr:$bindIdNameDeviceType, + OptionalAttr:$bindStrNameDeviceType, OptionalAttr:$worker, OptionalAttr:$vector, OptionalAttr:$seq, UnitAttr:$nohost, @@ -2815,14 +2817,14 @@ def OpenACC_RoutineOp : OpenACC_Op<"routine", [IsolatedFromAbove]> { std::optional getGangDimValue(); std::optional getGangDimValue(mlir::acc::DeviceType deviceType); - std::optional getBindNameValue(); - std::optional getBindNameValue(mlir::acc::DeviceType deviceType); + std::optional<::std::variant> getBindNameValue(); + std::optional<::std::variant> getBindNameValue(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ $sym_name `func` `(` $func_name `)` oilist ( - `bind` `(` custom($bindName, $bindNameDeviceType) `)` + `bind` `(` custom($bindIdName, $bindStrName ,$bindIdNameDeviceType, $bindStrNameDeviceType) 
`)` | `gang` `` custom($gang, $gangDim, $gangDimDeviceType) | `worker` custom($worker) | `vector` custom($vector) diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index f2eab62b286af..fbc1f003ab648 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -21,6 +21,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/LogicalResult.h" +#include using namespace mlir; using namespace acc; @@ -3461,40 +3462,88 @@ LogicalResult acc::RoutineOp::verify() { return success(); } -static ParseResult parseBindName(OpAsmParser &parser, mlir::ArrayAttr &bindName, - mlir::ArrayAttr &deviceTypes) { - llvm::SmallVector bindNameAttrs; - llvm::SmallVector deviceTypeAttrs; +static ParseResult parseBindName(OpAsmParser &parser, + mlir::ArrayAttr &bindIdName, + mlir::ArrayAttr &bindStrName, + mlir::ArrayAttr &deviceIdTypes, + mlir::ArrayAttr &deviceStrTypes) { + llvm::SmallVector bindIdNameAttrs; + llvm::SmallVector bindStrNameAttrs; + llvm::SmallVector deviceIdTypeAttrs; + llvm::SmallVector deviceStrTypeAttrs; if (failed(parser.parseCommaSeparatedList([&]() { - if (parser.parseAttribute(bindNameAttrs.emplace_back())) + mlir::Attribute newAttr; + bool isSymbolRefAttr; + auto parseResult = parser.parseAttribute(newAttr); + if (auto symbolRefAttr = dyn_cast(newAttr)) { + bindIdNameAttrs.push_back(symbolRefAttr); + isSymbolRefAttr = true; + } else if (auto stringAttr = dyn_cast(newAttr)) { + bindStrNameAttrs.push_back(stringAttr); + isSymbolRefAttr = false; + } + if (parseResult) return failure(); if (failed(parser.parseOptionalLSquare())) { - deviceTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( - parser.getContext(), mlir::acc::DeviceType::None)); + if (isSymbolRefAttr) { + deviceIdTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } else { + 
deviceStrTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } } else { - if (parser.parseAttribute(deviceTypeAttrs.emplace_back()) || - parser.parseRSquare()) - return failure(); + if (isSymbolRefAttr) { + if (parser.parseAttribute(deviceIdTypeAttrs.emplace_back()) || + parser.parseRSquare()) + return failure(); + } else { + if (parser.parseAttribute(deviceStrTypeAttrs.emplace_back()) || + parser.parseRSquare()) + return failure(); + } } return success(); }))) return failure(); - bindName = ArrayAttr::get(parser.getContext(), bindNameAttrs); - deviceTypes = ArrayAttr::get(parser.getContext(), deviceTypeAttrs); + bindIdName = ArrayAttr::get(parser.getContext(), bindIdNameAttrs); + bindStrName = ArrayAttr::get(parser.getContext(), bindStrNameAttrs); + deviceIdTypes = ArrayAttr::get(parser.getContext(), deviceIdTypeAttrs); + deviceStrTypes = ArrayAttr::get(parser.getContext(), deviceStrTypeAttrs); return success(); } static void printBindName(mlir::OpAsmPrinter &p, mlir::Operation *op, - std::optional bindName, - std::optional deviceTypes) { - llvm::interleaveComma(llvm::zip(*bindName, *deviceTypes), p, - [&](const auto &pair) { - p << std::get<0>(pair); - printSingleDeviceType(p, std::get<1>(pair)); - }); + std::optional bindIdName, + std::optional bindStrName, + std::optional deviceIdTypes, + std::optional deviceStrTypes) { + // Create combined vectors for all bind names and device types + llvm::SmallVector allBindNames; + llvm::SmallVector allDeviceTypes; + + // Append bindIdName and deviceIdTypes + if (hasDeviceTypeValues(deviceIdTypes)) { + allBindNames.append(bindIdName->begin(), bindIdName->end()); + allDeviceTypes.append(deviceIdTypes->begin(), deviceIdTypes->end()); + } + + // Append bindStrName and deviceStrTypes + if (hasDeviceTypeValues(deviceStrTypes)) { + allBindNames.append(bindStrName->begin(), bindStrName->end()); + allDeviceTypes.append(deviceStrTypes->begin(), deviceStrTypes->end()); + } + + 
// Print the combined sequence + if (!allBindNames.empty()) + llvm::interleaveComma(llvm::zip(allBindNames, allDeviceTypes), p, + [&](const auto &pair) { + p << std::get<0>(pair); + printSingleDeviceType(p, std::get<1>(pair)); + }); } static ParseResult parseRoutineGangClause(OpAsmParser &parser, @@ -3654,19 +3703,32 @@ bool RoutineOp::hasSeq(mlir::acc::DeviceType deviceType) { return hasDeviceType(getSeq(), deviceType); } -std::optional RoutineOp::getBindNameValue() { +std::optional> +RoutineOp::getBindNameValue() { return getBindNameValue(mlir::acc::DeviceType::None); } -std::optional +std::optional> RoutineOp::getBindNameValue(mlir::acc::DeviceType deviceType) { - if (!hasDeviceTypeValues(getBindNameDeviceType())) + if (!hasDeviceTypeValues(getBindIdNameDeviceType()) && + !hasDeviceTypeValues(getBindStrNameDeviceType())) { return std::nullopt; - if (auto pos = findSegment(*getBindNameDeviceType(), deviceType)) { - auto attr = (*getBindName())[*pos]; + } + + if (auto pos = findSegment(*getBindIdNameDeviceType(), deviceType)) { + auto attr = (*getBindIdName())[*pos]; + auto symbolRefAttr = dyn_cast(attr); + assert(symbolRefAttr && "expected SymbolRef"); + return symbolRefAttr; + } + + if (auto pos = findSegment(*getBindStrNameDeviceType(), deviceType)) { + auto attr = (*getBindStrName())[*pos]; auto stringAttr = dyn_cast(attr); - return stringAttr.getValue(); + assert(stringAttr && "expected String"); + return stringAttr; } + return std::nullopt; } diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp index aa16421cbec51..836efdb307f97 100644 --- a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp +++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp @@ -519,14 +519,44 @@ TEST_F(OpenACCOpsTest, routineOpTest) { op->removeGangDimDeviceTypeAttr(); op->removeGangDimAttr(); - op->setBindNameDeviceTypeAttr(b.getArrayAttr({dtypeNone})); - op->setBindNameAttr(b.getArrayAttr({b.getStringAttr("fname")})); + 
op->setBindIdNameDeviceTypeAttr( + b.getArrayAttr({DeviceTypeAttr::get(&context, DeviceType::Host)})); + op->setBindStrNameDeviceTypeAttr(b.getArrayAttr({dtypeNone})); + op->setBindIdNameAttr( + b.getArrayAttr({SymbolRefAttr::get(&context, "test_symbol")})); + op->setBindStrNameAttr(b.getArrayAttr({b.getStringAttr("fname")})); EXPECT_TRUE(op->getBindNameValue().has_value()); - EXPECT_EQ(op->getBindNameValue().value(), "fname"); - for (auto d : dtypesWithoutNone) - EXPECT_FALSE(op->getBindNameValue(d).has_value()); - op->removeBindNameDeviceTypeAttr(); - op->removeBindNameAttr(); + EXPECT_TRUE(op->getBindNameValue(DeviceType::Host).has_value()); + EXPECT_EQ(std::visit( + [](const auto &attr) -> std::string { + if constexpr (std::is_same_v, + mlir::StringAttr>) { + return attr.str(); + } else { + return attr.getLeafReference().str(); + } + }, + op->getBindNameValue().value()), + "fname"); + EXPECT_EQ(std::visit( + [](const auto &attr) -> std::string { + if constexpr (std::is_same_v, + mlir::StringAttr>) { + return attr.str(); + } else { + return attr.getLeafReference().str(); + } + }, + op->getBindNameValue(DeviceType::Host).value()), + "test_symbol"); + for (auto d : dtypesWithoutNone) { + if (d != DeviceType::Host) + EXPECT_FALSE(op->getBindNameValue(d).has_value()); + } + op->removeBindIdNameDeviceTypeAttr(); + op->removeBindStrNameDeviceTypeAttr(); + op->removeBindIdNameAttr(); + op->removeBindStrNameAttr(); } template From b3a8d0efc907aae8198ff16e5bfb8dc48f08b6ca Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 17 Jul 2025 09:42:36 -0700 Subject: [PATCH 202/813] [RISCV] Add additional coverage for one hot interleave load cases [nfc] Add coverage for fixed vector vp.load, and the deinterleave intrinsic paths. 
--- .../rvv/fixed-vectors-interleaved-access.ll | 30 +++++++++++-- .../RISCV/rvv/vp-vector-interleaved-access.ll | 45 +++++++++++++++++++ 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 041aae229288f..019bbe2908a2c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -1718,6 +1718,28 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) { ret void } +define <4 x i32> @vp_load_factor3_one_active(ptr %ptr) { +; CHECK-LABEL: vp_load_factor3_one_active: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + ret <4 x i32> %v0 +} + +define <4 x i32> @vp_load_factor5_one_active(ptr %ptr) { +; CHECK-LABEL: vp_load_factor5_one_active: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg5e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = tail call <20 x i32> @llvm.vp.load.v20i32.p0(ptr %ptr, <20 x i1> splat (i1 true), i32 20) + %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + ret <4 x i32> %v0 +} + define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) { ; CHECK-LABEL: store_factor4_one_active: ; CHECK: # %bb.0: @@ -1804,8 +1826,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI51_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI51_0) +; RV32-NEXT: lui a1, %hi(.LCPI53_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI53_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, 
ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -1880,8 +1902,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI52_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI52_0) +; RV32-NEXT: lui a0, %hi(.LCPI54_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI54_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 diff --git a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll index 8cfa237858aca..23c0c826e85e3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll @@ -648,6 +648,51 @@ define void @masked_store_factor4_v2( %mask, ret void } +define @load_factor2_oneactive(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor2_oneactive: +; RV32: # %bb.0: +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: srli a1, a1, 1 +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg2e32.v v7, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor2_oneactive: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 34 +; RV64-NEXT: srli a1, a1, 33 +; RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg2e32.v v7, (a0) +; RV64-NEXT: ret + %rvl = mul nuw i32 %evl, 4 + %wide.masked.load = call @llvm.vp.load.nxv4i32.p0(ptr %ptr, splat (i1 true), i32 %rvl) + %deinterleaved.results = call { , } @llvm.vector.deinterleave2.nxv4i32( %wide.masked.load) + %t0 = extractvalue { , } %deinterleaved.results, 1 + ret %t0 +} + +define @load_factor5_oneactive(ptr %ptr, i32 %evl) { +; RV32-LABEL: load_factor5_oneactive: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV32-NEXT: vlseg5e32.v v5, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor5_oneactive: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a1, a1, 32 +; 
RV64-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; RV64-NEXT: vlseg5e32.v v5, (a0) +; RV64-NEXT: ret + %rvl = mul nuw i32 %evl, 5 + %wide.masked.load = call @llvm.vp.load(ptr %ptr, splat (i1 true), i32 %rvl) + %deinterleaved.results = call { , , , , } @llvm.vector.deinterleave5( %wide.masked.load) + %t3 = extractvalue { , , , , } %deinterleaved.results, 3 + ret %t3 +} + + ; Negative tests define {, } @not_same_mask( %mask0, %mask1, ptr %ptr, i32 %evl) { From e4a3541ff88af03c01007a94b6b5f5cea95ecf33 Mon Sep 17 00:00:00 2001 From: Akshay Khadse Date: Thu, 17 Jul 2025 12:57:04 -0400 Subject: [PATCH 203/813] [MLIR][Python] Support eliding large resource strings in PassManager (#149187) - Introduces a `large_resource_limit` parameter across Python bindings, enabling the eliding of resource strings exceeding a specified character limit during IR printing. - To maintain backward compatibilty, when using `operation.print()` API, if `large_resource_limit` is None and the `large_elements_limit` is set, the later will be used to elide the resource string as well. This change was introduced by https://github.com/llvm/llvm-project/pull/125738. - For printing using pass manager, the `large_resource_limit` and `large_elements_limit` are completely independent of each other. 
--- mlir/lib/Bindings/Python/IRCore.cpp | 22 +++++-- mlir/lib/Bindings/Python/IRModule.h | 14 ++--- mlir/lib/Bindings/Python/Pass.cpp | 12 +++- mlir/python/mlir/_mlir_libs/_mlir/ir.pyi | 7 +++ .../mlir/_mlir_libs/_mlir/passmanager.pyi | 1 + mlir/test/python/ir/operation.py | 9 +++ mlir/test/python/pass_manager.py | 57 +++++++++++++++++++ 7 files changed, 107 insertions(+), 15 deletions(-) diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index d961482885300..7b790e90e0d87 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -97,6 +97,10 @@ static const char kOperationPrintDocstring[] = binary: Whether to write bytes (True) or str (False). Defaults to False. large_elements_limit: Whether to elide elements attributes above this number of elements. Defaults to None (no limit). + large_resource_limit: Whether to elide resource attributes above this + number of characters. Defaults to None (no limit). If large_elements_limit + is set and this is None, the behavior will be to use large_elements_limit + as large_resource_limit. enable_debug_info: Whether to print debug/location information. Defaults to False. 
pretty_debug_info: Whether to format debug information for easier reading @@ -1303,6 +1307,7 @@ void PyOperation::checkValid() const { } void PyOperationBase::print(std::optional largeElementsLimit, + std::optional largeResourceLimit, bool enableDebugInfo, bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, bool useNameLocAsPrefix, bool assumeVerified, @@ -1314,10 +1319,10 @@ void PyOperationBase::print(std::optional largeElementsLimit, fileObject = nb::module_::import_("sys").attr("stdout"); MlirOpPrintingFlags flags = mlirOpPrintingFlagsCreate(); - if (largeElementsLimit) { + if (largeElementsLimit) mlirOpPrintingFlagsElideLargeElementsAttrs(flags, *largeElementsLimit); - mlirOpPrintingFlagsElideLargeResourceString(flags, *largeElementsLimit); - } + if (largeResourceLimit) + mlirOpPrintingFlagsElideLargeResourceString(flags, *largeResourceLimit); if (enableDebugInfo) mlirOpPrintingFlagsEnableDebugInfo(flags, /*enable=*/true, /*prettyForm=*/prettyDebugInfo); @@ -1405,6 +1410,7 @@ void PyOperationBase::walk( nb::object PyOperationBase::getAsm(bool binary, std::optional largeElementsLimit, + std::optional largeResourceLimit, bool enableDebugInfo, bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, bool useNameLocAsPrefix, bool assumeVerified, @@ -1416,6 +1422,7 @@ nb::object PyOperationBase::getAsm(bool binary, fileObject = nb::module_::import_("io").attr("StringIO")(); } print(/*largeElementsLimit=*/largeElementsLimit, + /*largeResourceLimit=*/largeResourceLimit, /*enableDebugInfo=*/enableDebugInfo, /*prettyDebugInfo=*/prettyDebugInfo, /*printGenericOpForm=*/printGenericOpForm, @@ -3348,6 +3355,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { [](PyOperationBase &self) { return self.getAsm(/*binary=*/false, /*largeElementsLimit=*/std::nullopt, + /*largeResourceLimit=*/std::nullopt, /*enableDebugInfo=*/false, /*prettyDebugInfo=*/false, /*printGenericOpForm=*/false, @@ -3363,11 +3371,12 @@ void 
mlir::python::populateIRCore(nb::module_ &m) { nb::arg("state"), nb::arg("file").none() = nb::none(), nb::arg("binary") = false, kOperationPrintStateDocstring) .def("print", - nb::overload_cast, bool, bool, bool, bool, - bool, bool, nb::object, bool, bool>( - &PyOperationBase::print), + nb::overload_cast, std::optional, + bool, bool, bool, bool, bool, bool, nb::object, + bool, bool>(&PyOperationBase::print), // Careful: Lots of arguments must match up with print method. nb::arg("large_elements_limit").none() = nb::none(), + nb::arg("large_resource_limit").none() = nb::none(), nb::arg("enable_debug_info") = false, nb::arg("pretty_debug_info") = false, nb::arg("print_generic_op_form") = false, @@ -3383,6 +3392,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { // Careful: Lots of arguments must match up with get_asm method. nb::arg("binary") = false, nb::arg("large_elements_limit").none() = nb::none(), + nb::arg("large_resource_limit").none() = nb::none(), nb::arg("enable_debug_info") = false, nb::arg("pretty_debug_info") = false, nb::arg("print_generic_op_form") = false, diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 9befcce725bb7..0fdd2d1a7eff6 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -599,18 +599,18 @@ class PyOperationBase { public: virtual ~PyOperationBase() = default; /// Implements the bound 'print' method and helps with others. 
- void print(std::optional largeElementsLimit, bool enableDebugInfo, + void print(std::optional largeElementsLimit, + std::optional largeResourceLimit, bool enableDebugInfo, bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, bool useNameLocAsPrefix, bool assumeVerified, nanobind::object fileObject, bool binary, bool skipRegions); void print(PyAsmState &state, nanobind::object fileObject, bool binary); - nanobind::object getAsm(bool binary, - std::optional largeElementsLimit, - bool enableDebugInfo, bool prettyDebugInfo, - bool printGenericOpForm, bool useLocalScope, - bool useNameLocAsPrefix, bool assumeVerified, - bool skipRegions); + nanobind::object + getAsm(bool binary, std::optional largeElementsLimit, + std::optional largeResourceLimit, bool enableDebugInfo, + bool prettyDebugInfo, bool printGenericOpForm, bool useLocalScope, + bool useNameLocAsPrefix, bool assumeVerified, bool skipRegions); // Implement the bound 'writeBytecode' method. void writeBytecode(const nanobind::object &fileObject, diff --git a/mlir/lib/Bindings/Python/Pass.cpp b/mlir/lib/Bindings/Python/Pass.cpp index 8d84864b9db4d..20017e25b69bb 100644 --- a/mlir/lib/Bindings/Python/Pass.cpp +++ b/mlir/lib/Bindings/Python/Pass.cpp @@ -78,12 +78,19 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) { [](PyPassManager &passManager, bool printBeforeAll, bool printAfterAll, bool printModuleScope, bool printAfterChange, bool printAfterFailure, std::optional largeElementsLimit, - bool enableDebugInfo, bool printGenericOpForm, + std::optional largeResourceLimit, bool enableDebugInfo, + bool printGenericOpForm, std::optional optionalTreePrintingPath) { MlirOpPrintingFlags flags = mlirOpPrintingFlagsCreate(); - if (largeElementsLimit) + if (largeElementsLimit) { mlirOpPrintingFlagsElideLargeElementsAttrs(flags, *largeElementsLimit); + mlirOpPrintingFlagsElideLargeResourceString(flags, + *largeElementsLimit); + } + if (largeResourceLimit) + 
mlirOpPrintingFlagsElideLargeResourceString(flags, + *largeResourceLimit); if (enableDebugInfo) mlirOpPrintingFlagsEnableDebugInfo(flags, /*enable=*/true, /*prettyForm=*/false); @@ -103,6 +110,7 @@ void mlir::python::populatePassManagerSubmodule(nb::module_ &m) { "print_module_scope"_a = false, "print_after_change"_a = false, "print_after_failure"_a = false, "large_elements_limit"_a.none() = nb::none(), + "large_resource_limit"_a.none() = nb::none(), "enable_debug_info"_a = false, "print_generic_op_form"_a = false, "tree_printing_dir_path"_a.none() = nb::none(), "Enable IR printing, default as mlir-print-ir-after-all.") diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi index ed476da28d6be..be71737e4b5b4 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi @@ -200,6 +200,7 @@ class _OperationBase: def get_asm( binary: Literal[True], large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, pretty_debug_info: bool = False, print_generic_op_form: bool = False, @@ -212,6 +213,7 @@ class _OperationBase: self, binary: bool = False, large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, pretty_debug_info: bool = False, print_generic_op_form: bool = False, @@ -253,6 +255,7 @@ class _OperationBase: def print( self, large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, pretty_debug_info: bool = False, print_generic_op_form: bool = False, @@ -270,6 +273,10 @@ class _OperationBase: binary: Whether to write bytes (True) or str (False). Defaults to False. large_elements_limit: Whether to elide elements attributes above this number of elements. Defaults to None (no limit). + large_resource_limit: Whether to elide resource strings above this + number of characters. Defaults to None (no limit). 
If large_elements_limit + is set and this is None, the behavior will be to use large_elements_limit + as large_resource_limit. enable_debug_info: Whether to print debug/location information. Defaults to False. pretty_debug_info: Whether to format debug information for easier reading diff --git a/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi b/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi index 0d2eaffe16d3e..1010daddae2aa 100644 --- a/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlir/passmanager.pyi @@ -23,6 +23,7 @@ class PassManager: print_after_change: bool = False, print_after_failure: bool = False, large_elements_limit: int | None = None, + large_resource_limit: int | None = None, enable_debug_info: bool = False, print_generic_op_form: bool = False, tree_printing_dir_path: str | None = None, diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py index b08fe98397fbc..ede1571f940f6 100644 --- a/mlir/test/python/ir/operation.py +++ b/mlir/test/python/ir/operation.py @@ -686,6 +686,15 @@ def testOperationPrint(): skip_regions=True, ) + # Test print with large_resource_limit. 
+ # CHECK: func.func @f1(%arg0: i32) -> i32 + # CHECK-NOT: resource1: "0x08 + module.operation.print(large_resource_limit=2) + + # Test large_elements_limit has no effect on resource string + # CHECK: func.func @f1(%arg0: i32) -> i32 + # CHECK: resource1: "0x08 + module.operation.print(large_elements_limit=2) # CHECK-LABEL: TEST: testKnownOpView @run diff --git a/mlir/test/python/pass_manager.py b/mlir/test/python/pass_manager.py index 85d2eb304882e..e26d42bb32913 100644 --- a/mlir/test/python/pass_manager.py +++ b/mlir/test/python/pass_manager.py @@ -363,6 +363,63 @@ def testPrintIrLargeLimitElements(): pm.run(module) +# CHECK-LABEL: TEST: testPrintIrLargeResourceLimit +@run +def testPrintIrLargeResourceLimit(): + with Context() as ctx: + module = ModuleOp.parse( + """ + module { + func.func @main() -> tensor<3xi64> { + %0 = arith.constant dense_resource : tensor<3xi64> + return %0 : tensor<3xi64> + } + } + {-# + dialect_resources: { + builtin: { + blob1: "0x010000000000000002000000000000000300000000000000" + } + } + #-} + """ + ) + pm = PassManager.parse("builtin.module(canonicalize)") + ctx.enable_multithreading(False) + pm.enable_ir_printing(large_resource_limit=4) + # CHECK-NOT: blob1: "0x01 + pm.run(module) + + +# CHECK-LABEL: TEST: testPrintIrLargeResourceLimitVsElementsLimit +@run +def testPrintIrLargeResourceLimitVsElementsLimit(): + """Test that large_elements_limit does not affect the printing of resources.""" + with Context() as ctx: + module = ModuleOp.parse( + """ + module { + func.func @main() -> tensor<3xi64> { + %0 = arith.constant dense_resource : tensor<3xi64> + return %0 : tensor<3xi64> + } + } + {-# + dialect_resources: { + builtin: { + blob1: "0x010000000000000002000000000000000300000000000000" + } + } + #-} + """ + ) + pm = PassManager.parse("builtin.module(canonicalize)") + ctx.enable_multithreading(False) + pm.enable_ir_printing(large_elements_limit=1) + # CHECK-NOT: blob1: "0x01 + pm.run(module) + + # CHECK-LABEL: TEST: testPrintIrTree 
@run def testPrintIrTree(): From d35931c49e5b37243ace2b79bec87463772b6c94 Mon Sep 17 00:00:00 2001 From: T0b1-iOS Date: Thu, 17 Jul 2025 18:57:32 +0200 Subject: [PATCH 204/813] [Clang][CodeGen][X86] don't coerce int128 into `{i64,i64}` for SysV-like ABIs (#135230) Currently, clang coerces (u)int128_t to two i64 IR parameters when they are passed in registers. This leads to broken debug info for them after applying SROA+InstCombine. SROA generates IR like this ([godbolt](https://godbolt.org/z/YrTa4chfc)): ```llvm define dso_local { i64, i64 } @add(i64 noundef %a.coerce0, i64 noundef %a.coerce1) { entry: %a.sroa.2.0.insert.ext = zext i64 %a.coerce1 to i128 %a.sroa.2.0.insert.shift = shl nuw i128 %a.sroa.2.0.insert.ext, 64 %a.sroa.0.0.insert.ext = zext i64 %a.coerce0 to i128 %a.sroa.0.0.insert.insert = or i128 %a.sroa.2.0.insert.shift, %a.sroa.0.0.insert.ext #dbg_value(i128 %a.sroa.0.0.insert.insert, !17, !DIExpression(), !18) // ... !17 = !DILocalVariable(name: "a", arg: 1, scope: !10, file: !11, line: 1, type: !14) // ... ``` and InstCombine then removes the `or`, moving it into the `DIExpression`, and the `shl` at which point the debug info salvaging in `Transforms/Local` replaces the arguments with `poison` as it does not allow constants larger than 64 bit in `DIExpression`s. I'm working under the assumption that there is interest in fixing this. If not, please tell me. By not coercing `int128_t`s into `{i64, i64}` but keeping them as `i128`, the debug info stays intact and SelectionDAG then generates two `DW_OP_LLVM_fragment` expressions for the two corresponding argument registers. Given that the ABI code for x64 seems to not coerce the argument when it is passed on the stack, it should not lead to any problems keeping it as an `i128` when it is passed in registers. Alternatively, this could be fixed by checking if a constant value fits in 64 bits in the debug info salvaging code and then extending the value on the expression stack to the necessary width. 
This fixes InstCombine breaking the debug info but then SelectionDAG removes the expression and that seems significantly more complex to debug. Another fix may be to generate `DW_OP_LLVM_fragment` expressions when removing the `or` as it gets marked as disjoint by InstCombine. However, I don't know if the KnownBits information is still available at the time the `or` gets removed and it would probably require refactoring of the debug info salvaging code as that currently only seems to replace single expressions and is not designed to support generating new debug records. Converting `(u)int128_t` arguments to `i128` in the IR seems like the simpler solution, if it doesn't cause any ABI issues. --- clang/lib/CodeGen/Targets/X86.cpp | 32 +++++++++---- clang/test/CodeGen/X86/i128-debuginfo.c | 10 ++++ clang/test/CodeGen/X86/x86_64-arguments.c | 39 +++++++++++++++ clang/test/CodeGen/alloc-align-attr.c | 58 +++++++---------------- clang/test/CodeGen/builtins.c | 18 ++----- clang/test/CodeGen/ext-int-cc.c | 4 +- clang/test/CodeGen/extend-arg-64.c | 2 +- 7 files changed, 96 insertions(+), 67 deletions(-) create mode 100644 clang/test/CodeGen/X86/i128-debuginfo.c diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp index 0b712ac2dabc4..abb91486e7ee6 100644 --- a/clang/lib/CodeGen/Targets/X86.cpp +++ b/clang/lib/CodeGen/Targets/X86.cpp @@ -2470,13 +2470,12 @@ GetSSETypeAtOffset(llvm::Type *IRType, unsigned IROffset, return llvm::Type::getDoubleTy(getVMContext()); } - /// GetINTEGERTypeAtOffset - The ABI specifies that a value should be passed in -/// an 8-byte GPR. This means that we either have a scalar or we are talking -/// about the high or low part of an up-to-16-byte struct. This routine picks -/// the best LLVM IR type to represent this, which may be i64 or may be anything -/// else that the backend will pass in a GPR that works better (e.g. i8, %foo*, -/// etc). +/// one or more 8-byte GPRs. 
This means that we either have a scalar or we are +/// talking about the high and/or low part of an up-to-16-byte struct. This +/// routine picks the best LLVM IR type to represent this, which may be i64 or +/// may be anything else that the backend will pass in GPRs that works better +/// (e.g. i8, %foo*, etc). /// /// PrefType is an LLVM IR type that corresponds to (part of) the IR type for /// the source type. IROffset is an offset in bytes into the LLVM IR type that @@ -2534,6 +2533,13 @@ GetINTEGERTypeAtOffset(llvm::Type *IRType, unsigned IROffset, SourceOffset); } + // if we have a 128-bit integer, we can pass it safely using an i128 + // so we return that + if (IRType->isIntegerTy(128)) { + assert(IROffset == 0); + return IRType; + } + // Okay, we don't have any better idea of what to pass, so we pass this in an // integer register that isn't too big to fit the rest of the struct. unsigned TySizeInBytes = @@ -2591,8 +2597,7 @@ GetX86_64ByValArgumentPair(llvm::Type *Lo, llvm::Type *Hi, return Result; } -ABIArgInfo X86_64ABIInfo:: -classifyReturnType(QualType RetTy) const { +ABIArgInfo X86_64ABIInfo::classifyReturnType(QualType RetTy) const { // AMD64-ABI 3.2.3p4: Rule 1. Classify the return type with the // classification algorithm. X86_64ABIInfo::Class Lo, Hi; @@ -2638,6 +2643,12 @@ classifyReturnType(QualType RetTy) const { isPromotableIntegerTypeForABI(RetTy)) return ABIArgInfo::getExtend(RetTy); } + + if (ResType->isIntegerTy(128)) { + // i128 are passed directly + assert(Hi == Integer); + return ABIArgInfo::getDirect(ResType); + } break; // AMD64-ABI 3.2.3p4: Rule 4. If the class is SSE, the next @@ -2783,6 +2794,11 @@ X86_64ABIInfo::classifyArgumentType(QualType Ty, unsigned freeIntRegs, return ABIArgInfo::getExtend(Ty, CGT.ConvertType(Ty)); } + if (ResType->isIntegerTy(128)) { + assert(Hi == Integer); + ++neededInt; + return ABIArgInfo::getDirect(ResType); + } break; // AMD64-ABI 3.2.3p3: Rule 3. 
If the class is SSE, the next diff --git a/clang/test/CodeGen/X86/i128-debuginfo.c b/clang/test/CodeGen/X86/i128-debuginfo.c new file mode 100644 index 0000000000000..4b865c1bed9f0 --- /dev/null +++ b/clang/test/CodeGen/X86/i128-debuginfo.c @@ -0,0 +1,10 @@ +// no autogeneration since update_cc_test_checks does not support -g +// RUN: %clang_cc1 -triple x86_64-pc-linux -O1 -debug-info-kind=limited -emit-llvm -o - %s | FileCheck %s + +// CHECK-LABEL: define{{.*}} i128 @add(i128 noundef %a) +// CHECK: #dbg_value(i128 %a, ![[DI:.*]], !DIExpression() +__int128_t add(__int128_t a) { + return a + a; +} + +// CHECK: ![[DI]] = !DILocalVariable(name: "a", arg: 1 diff --git a/clang/test/CodeGen/X86/x86_64-arguments.c b/clang/test/CodeGen/X86/x86_64-arguments.c index 82845f0a2b31f..580f9487395d3 100644 --- a/clang/test/CodeGen/X86/x86_64-arguments.c +++ b/clang/test/CodeGen/X86/x86_64-arguments.c @@ -551,6 +551,45 @@ struct s68 { void f68(struct s68 x) { } +// CHECK-LABEL: define{{.*}} i128 @f69(i128 noundef %a) +__int128_t f69(__int128_t a) { + return a; +} + +// CHECK-LABEL: define{{.*}} i128 @f70(i128 noundef %a) +__uint128_t f70(__uint128_t a) { + return a; +} + +// check that registers are correctly counted for (u)int128_t arguments +struct s71 { + long long a, b; +}; +// CHECK-LABEL: define{{.*}} void @f71(i128 noundef %a, i128 noundef %b, i64 noundef %c, ptr noundef byval(%struct.s71) align 8 %d) +void f71(__int128_t a, __int128_t b, long long c, struct s71 d) { +} +// CHECK-LABEL: define{{.*}} void @f72(i128 noundef %a, i128 noundef %b, i64 %d.coerce0, i64 %d.coerce1) +void f72(__int128_t a, __int128_t b, struct s71 d) { +} + +// check that structs containing (u)int128_t are passed correctly +struct s73 { + struct inner { + __uint128_t a; + }; + struct inner in; +}; +// CHECK-LABEL: define{{.*}} i128 @f73(i128 %a.coerce) +struct s73 f73(struct s73 a) { + return a; +} + +// check that _BitInt(128) is still passed correctly on the stack +// CHECK-LABEL: define{{.*}} 
i128 @f74(i128 noundef %b, i128 noundef %c, i128 noundef %d, i64 noundef %e, ptr noundef byval(i128) align 8 %0) +_BitInt(128) f74(__uint128_t b, __uint128_t c, __uint128_t d, long e, _BitInt(128) a) { + return a; +} + /// The synthesized __va_list_tag does not have file/line fields. // CHECK: = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "__va_list_tag", // CHECK-NOT: file: diff --git a/clang/test/CodeGen/alloc-align-attr.c b/clang/test/CodeGen/alloc-align-attr.c index 76e5d1041b19f..c4c4e76eaaa04 100644 --- a/clang/test/CodeGen/alloc-align-attr.c +++ b/clang/test/CodeGen/alloc-align-attr.c @@ -70,66 +70,42 @@ __INT32_TYPE__ test4(__SIZE_TYPE__ a) { struct Empty {}; struct MultiArgs { __INT64_TYPE__ a, b;}; -// Struct parameter doesn't take up an IR parameter, 'i' takes up 2. +// Struct parameter doesn't take up an IR parameter, 'i' takes up 1. // Truncation to i64 is permissible, since alignments of greater than 2^64 are insane. __INT32_TYPE__ *m3(struct Empty s, __int128_t i) __attribute__((alloc_align(2))); // CHECK-LABEL: @test5( // CHECK-NEXT: entry: -// CHECK-NEXT: [[A:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[E:%.*]] = alloca [[STRUCT_EMPTY:%.*]], align 1 -// CHECK-NEXT: [[COERCE:%.*]] = alloca i128, align 16 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 0 -// CHECK-NEXT: store i64 [[A_COERCE0:%.*]], ptr [[TMP0]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 1 -// CHECK-NEXT: store i64 [[A_COERCE1:%.*]], ptr [[TMP1]], align 8 -// CHECK-NEXT: [[A1:%.*]] = load i128, ptr [[A]], align 16 -// CHECK-NEXT: store i128 [[A1]], ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: store i128 [[TMP2]], ptr [[COERCE]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 0 -// 
CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 1 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = call ptr @m3(i64 noundef [[TMP4]], i64 noundef [[TMP6]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP2]] to i64 +// CHECK-NEXT: store i128 [[A:%.*]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[CALL:%.*]] = call ptr @m3(i128 noundef [[TMP0]]) +// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP0]] to i64 // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[CALL]], i64 [[CASTED_ALIGN]]) ] -// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[CALL]], align 4 -// CHECK-NEXT: ret i32 [[TMP7]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[CALL]], align 4 +// CHECK-NEXT: ret i32 [[TMP1]] // __INT32_TYPE__ test5(__int128_t a) { struct Empty e; return *m3(e, a); } -// Struct parameter takes up 2 parameters, 'i' takes up 2. +// Struct parameter takes up 2 parameters, 'i' takes up 1. 
__INT32_TYPE__ *m4(struct MultiArgs s, __int128_t i) __attribute__((alloc_align(2))); // CHECK-LABEL: @test6( // CHECK-NEXT: entry: -// CHECK-NEXT: [[A:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[A_ADDR:%.*]] = alloca i128, align 16 // CHECK-NEXT: [[E:%.*]] = alloca [[STRUCT_MULTIARGS:%.*]], align 8 -// CHECK-NEXT: [[COERCE:%.*]] = alloca i128, align 16 -// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 0 -// CHECK-NEXT: store i64 [[A_COERCE0:%.*]], ptr [[TMP0]], align 16 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[A]], i32 0, i32 1 -// CHECK-NEXT: store i64 [[A_COERCE1:%.*]], ptr [[TMP1]], align 8 -// CHECK-NEXT: [[A1:%.*]] = load i128, ptr [[A]], align 16 -// CHECK-NEXT: store i128 [[A1]], ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr [[A_ADDR]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 0 +// CHECK-NEXT: store i128 [[A:%.*]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load i128, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 1 // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[E]], i32 0, i32 1 -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP5]], align 8 -// CHECK-NEXT: store i128 [[TMP2]], ptr [[COERCE]], align 16 -// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 0 -// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw { i64, i64 }, ptr [[COERCE]], i32 0, i32 1 -// CHECK-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP9]], align 8 -// CHECK-NEXT: [[CALL:%.*]] = 
call ptr @m4(i64 [[TMP4]], i64 [[TMP6]], i64 noundef [[TMP8]], i64 noundef [[TMP10]]) -// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP2]] to i64 +// CHECK-NEXT: [[CALL:%.*]] = call ptr @m4(i64 [[TMP2]], i64 [[TMP4]], i128 noundef [[TMP0]]) +// CHECK-NEXT: [[CASTED_ALIGN:%.*]] = trunc i128 [[TMP0]] to i64 // CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[CALL]], i64 [[CASTED_ALIGN]]) ] -// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[CALL]], align 4 -// CHECK-NEXT: ret i32 [[TMP11]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[CALL]], align 4 +// CHECK-NEXT: ret i32 [[TMP5]] // __INT32_TYPE__ test6(__int128_t a) { struct MultiArgs e; diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c index eda6c67fdad00..aa9965b815983 100644 --- a/clang/test/CodeGen/builtins.c +++ b/clang/test/CodeGen/builtins.c @@ -956,36 +956,24 @@ void test_builtin_os_log_errno(void) { void test_builtin_os_log_long_double(void *buf, long double ld) { // CHECK: %[[BUF_ADDR:.*]] = alloca ptr, align 8 // CHECK: %[[LD_ADDR:.*]] = alloca x86_fp80, align 16 - // CHECK: %[[COERCE:.*]] = alloca i128, align 16 // CHECK: store ptr %[[BUF]], ptr %[[BUF_ADDR]], align 8 // CHECK: store x86_fp80 %[[LD]], ptr %[[LD_ADDR]], align 16 // CHECK: %[[V0:.*]] = load ptr, ptr %[[BUF_ADDR]], align 8 // CHECK: %[[V1:.*]] = load x86_fp80, ptr %[[LD_ADDR]], align 16 // CHECK: %[[V2:.*]] = bitcast x86_fp80 %[[V1]] to i80 // CHECK: %[[V3:.*]] = zext i80 %[[V2]] to i128 - // CHECK: store i128 %[[V3]], ptr %[[COERCE]], align 16 - // CHECK: %[[V5:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[COERCE]], i32 0, i32 0 - // CHECK: %[[V6:.*]] = load i64, ptr %[[V5]], align 16 - // CHECK: %[[V7:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[COERCE]], i32 0, i32 1 - // CHECK: %[[V8:.*]] = load i64, ptr %[[V7]], align 8 - // CHECK: call void @__os_log_helper_1_0_1_16_0(ptr noundef %[[V0]], i64 noundef %[[V6]], i64 noundef %[[V8]]) + // CHECK: call void 
@__os_log_helper_1_0_1_16_0(ptr noundef %[[V0]], i128 noundef %[[V3]]) __builtin_os_log_format(buf, "%Lf", ld); } // CHECK-LABEL: define linkonce_odr hidden void @__os_log_helper_1_0_1_16_0 -// CHECK: (ptr noundef %[[BUFFER:.*]], i64 noundef %[[ARG0_COERCE0:.*]], i64 noundef %[[ARG0_COERCE1:.*]]) +// CHECK: (ptr noundef %[[BUFFER:.*]], i128 noundef %[[ARG0:.*]]) -// CHECK: %[[ARG0:.*]] = alloca i128, align 16 // CHECK: %[[BUFFER_ADDR:.*]] = alloca ptr, align 8 // CHECK: %[[ARG0_ADDR:.*]] = alloca i128, align 16 -// CHECK: %[[V1:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[ARG0]], i32 0, i32 0 -// CHECK: store i64 %[[ARG0_COERCE0]], ptr %[[V1]], align 16 -// CHECK: %[[V2:.*]] = getelementptr inbounds nuw { i64, i64 }, ptr %[[ARG0]], i32 0, i32 1 -// CHECK: store i64 %[[ARG0_COERCE1]], ptr %[[V2]], align 8 -// CHECK: %[[ARG01:.*]] = load i128, ptr %[[ARG0]], align 16 // CHECK: store ptr %[[BUFFER]], ptr %[[BUFFER_ADDR]], align 8 -// CHECK: store i128 %[[ARG01]], ptr %[[ARG0_ADDR]], align 16 +// CHECK: store i128 %[[ARG0]], ptr %[[ARG0_ADDR]], align 16 // CHECK: %[[BUF:.*]] = load ptr, ptr %[[BUFFER_ADDR]], align 8 // CHECK: %[[SUMMARY:.*]] = getelementptr i8, ptr %[[BUF]], i64 0 // CHECK: store i8 0, ptr %[[SUMMARY]], align 1 diff --git a/clang/test/CodeGen/ext-int-cc.c b/clang/test/CodeGen/ext-int-cc.c index f31a4eb240c25..fdca4012ee4a4 100644 --- a/clang/test/CodeGen/ext-int-cc.c +++ b/clang/test/CodeGen/ext-int-cc.c @@ -32,7 +32,7 @@ // Make sure 128 and 64 bit versions are passed like integers. 
void ParamPassing(_BitInt(128) b, _BitInt(64) c) {} -// LIN64: define{{.*}} void @ParamPassing(i64 %{{.+}}, i64 %{{.+}}, i64 %{{.+}}) +// LIN64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // WIN64: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) // LIN32: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) // WIN32: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) @@ -251,7 +251,7 @@ _BitInt(127) ReturnPassing3(void) { return 0; } // LA32: define{{.*}} void @ReturnPassing3(ptr dead_on_unwind noalias writable sret _BitInt(128) ReturnPassing4(void) { return 0; } -// LIN64: define{{.*}} { i64, i64 } @ReturnPassing4( +// LIN64: define{{.*}} i128 @ReturnPassing4( // WIN64: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret // LIN32: define{{.*}} void @ReturnPassing4(ptr dead_on_unwind noalias writable sret // WIN32: define dso_local void @ReturnPassing4(ptr dead_on_unwind noalias writable sret diff --git a/clang/test/CodeGen/extend-arg-64.c b/clang/test/CodeGen/extend-arg-64.c index 2cb56d35af21d..8b99c01807ecc 100644 --- a/clang/test/CodeGen/extend-arg-64.c +++ b/clang/test/CodeGen/extend-arg-64.c @@ -84,7 +84,7 @@ int test(void) { #ifdef D128 knr(i128); // CHECKEXT: load i128 - // CHECKEXT: call{{.*}} void (i64, i64, ...) @knr + // CHECKEXT: call{{.*}} void (i128, ...) 
@knr #endif knr(u32, s32, u16, s16, u8, s8); From ff5784bb9094f6035851dc7abc4a5760fdc21e45 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 17 Jul 2025 12:11:12 -0500 Subject: [PATCH 205/813] [flang][OpenMP] Move extractOmpDirective to Utils.cpp, NFC (#148653) --- flang/lib/Lower/OpenMP/OpenMP.cpp | 84 ------------------------------- flang/lib/Lower/OpenMP/Utils.cpp | 84 +++++++++++++++++++++++++++++++ flang/lib/Lower/OpenMP/Utils.h | 3 ++ 3 files changed, 87 insertions(+), 84 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 4458f62eea95a..fcb20fdf187ff 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -372,90 +372,6 @@ extractMappedBaseValues(llvm::ArrayRef vars, }); } -/// Get the directive enumeration value corresponding to the given OpenMP -/// construct PFT node. -llvm::omp::Directive -extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) { - return common::visit( - common::visitors{ - [](const parser::OpenMPAllocatorsConstruct &c) { - return llvm::omp::OMPD_allocators; - }, - [](const parser::OpenMPAssumeConstruct &c) { - return llvm::omp::OMPD_assume; - }, - [](const parser::OpenMPAtomicConstruct &c) { - return llvm::omp::OMPD_atomic; - }, - [](const parser::OpenMPBlockConstruct &c) { - return std::get( - std::get(c.t).t) - .v; - }, - [](const parser::OpenMPCriticalConstruct &c) { - return llvm::omp::OMPD_critical; - }, - [](const parser::OpenMPDeclarativeAllocate &c) { - return llvm::omp::OMPD_allocate; - }, - [](const parser::OpenMPDispatchConstruct &c) { - return llvm::omp::OMPD_dispatch; - }, - [](const parser::OpenMPExecutableAllocate &c) { - return llvm::omp::OMPD_allocate; - }, - [](const parser::OpenMPLoopConstruct &c) { - return std::get( - std::get(c.t).t) - .v; - }, - [](const parser::OpenMPSectionConstruct &c) { - return llvm::omp::OMPD_section; - }, - [](const parser::OpenMPSectionsConstruct &c) { - return std::get( - 
std::get(c.t).t) - .v; - }, - [](const parser::OpenMPStandaloneConstruct &c) { - return common::visit( - common::visitors{ - [](const parser::OpenMPSimpleStandaloneConstruct &c) { - return c.v.DirId(); - }, - [](const parser::OpenMPFlushConstruct &c) { - return llvm::omp::OMPD_flush; - }, - [](const parser::OpenMPCancelConstruct &c) { - return llvm::omp::OMPD_cancel; - }, - [](const parser::OpenMPCancellationPointConstruct &c) { - return llvm::omp::OMPD_cancellation_point; - }, - [](const parser::OmpMetadirectiveDirective &c) { - return llvm::omp::OMPD_metadirective; - }, - [](const parser::OpenMPDepobjConstruct &c) { - return llvm::omp::OMPD_depobj; - }, - [](const parser::OpenMPInteropConstruct &c) { - return llvm::omp::OMPD_interop; - }}, - c.u); - }, - [](const parser::OpenMPUtilityConstruct &c) { - return common::visit( - common::visitors{[](const parser::OmpErrorDirective &c) { - return llvm::omp::OMPD_error; - }, - [](const parser::OmpNothingDirective &c) { - return llvm::omp::OMPD_nothing; - }}, - c.u); - }}, - ompConstruct.u); -} - /// Populate the global \see hostEvalInfo after processing clauses for the given /// \p eval OpenMP target construct, or nested constructs, if these must be /// evaluated outside of the target region per the spec. diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index 2e53f01f1da6a..b194150c0f7f0 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -661,6 +661,90 @@ bool collectLoopRelatedInfo( return found; } + +/// Get the directive enumeration value corresponding to the given OpenMP +/// construct PFT node. 
+llvm::omp::Directive +extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) { + return common::visit( + common::visitors{ + [](const parser::OpenMPAllocatorsConstruct &c) { + return llvm::omp::OMPD_allocators; + }, + [](const parser::OpenMPAssumeConstruct &c) { + return llvm::omp::OMPD_assume; + }, + [](const parser::OpenMPAtomicConstruct &c) { + return llvm::omp::OMPD_atomic; + }, + [](const parser::OpenMPBlockConstruct &c) { + return std::get( + std::get(c.t).t) + .v; + }, + [](const parser::OpenMPCriticalConstruct &c) { + return llvm::omp::OMPD_critical; + }, + [](const parser::OpenMPDeclarativeAllocate &c) { + return llvm::omp::OMPD_allocate; + }, + [](const parser::OpenMPDispatchConstruct &c) { + return llvm::omp::OMPD_dispatch; + }, + [](const parser::OpenMPExecutableAllocate &c) { + return llvm::omp::OMPD_allocate; + }, + [](const parser::OpenMPLoopConstruct &c) { + return std::get( + std::get(c.t).t) + .v; + }, + [](const parser::OpenMPSectionConstruct &c) { + return llvm::omp::OMPD_section; + }, + [](const parser::OpenMPSectionsConstruct &c) { + return std::get( + std::get(c.t).t) + .v; + }, + [](const parser::OpenMPStandaloneConstruct &c) { + return common::visit( + common::visitors{ + [](const parser::OpenMPSimpleStandaloneConstruct &c) { + return c.v.DirId(); + }, + [](const parser::OpenMPFlushConstruct &c) { + return llvm::omp::OMPD_flush; + }, + [](const parser::OpenMPCancelConstruct &c) { + return llvm::omp::OMPD_cancel; + }, + [](const parser::OpenMPCancellationPointConstruct &c) { + return llvm::omp::OMPD_cancellation_point; + }, + [](const parser::OmpMetadirectiveDirective &c) { + return llvm::omp::OMPD_metadirective; + }, + [](const parser::OpenMPDepobjConstruct &c) { + return llvm::omp::OMPD_depobj; + }, + [](const parser::OpenMPInteropConstruct &c) { + return llvm::omp::OMPD_interop; + }}, + c.u); + }, + [](const parser::OpenMPUtilityConstruct &c) { + return common::visit( + common::visitors{[](const parser::OmpErrorDirective &c) { 
+ return llvm::omp::OMPD_error; + }, + [](const parser::OmpNothingDirective &c) { + return llvm::omp::OMPD_nothing; + }}, + c.u); + }}, + ompConstruct.u); +} } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h index 1526bd4e90233..8e3ad5c3452e2 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/lib/Lower/OpenMP/Utils.h @@ -166,6 +166,9 @@ bool collectLoopRelatedInfo( lower::pft::Evaluation &eval, const omp::List &clauses, mlir::omp::LoopRelatedClauseOps &result, llvm::SmallVectorImpl &iv); + +llvm::omp::Directive +extractOmpDirective(const parser::OpenMPConstruct &ompConstruct); } // namespace omp } // namespace lower } // namespace Fortran From 871d65bfdd580cec349e63d687e6829e0ef62824 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 17 Jul 2025 10:13:43 -0700 Subject: [PATCH 206/813] [CI] Migrate monolithic-linux script to sccache This is in preparation for migrating to Google Cloud Storage (GCS) based caching soon which is only supported by sccache. Reviewers: Keenuts, gburgessiv, dschuff, lnihlen, cmtice Reviewed By: cmtice Pull Request: https://github.com/llvm/llvm-project/pull/149195 --- .ci/monolithic-linux.sh | 7 ++++--- .github/workflows/premerge.yaml | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 303b430c28e3f..d9f51ba9fd946 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -21,7 +21,7 @@ BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}" INSTALL_DIR="${BUILD_DIR}/install" rm -rf "${BUILD_DIR}" -ccache --zero-stats +sccache --zero-stats mkdir -p artifacts/reproducers @@ -31,7 +31,7 @@ export CLANG_CRASH_DIAGNOSTICS_DIR=`realpath artifacts/reproducers` function at-exit { retcode=$? 
- ccache --print-stats > artifacts/ccache_stats.txt + sccache --show-stats > artifacts/sccache_stats.txt cp "${BUILD_DIR}"/.ninja_log artifacts/.ninja_log cp "${BUILD_DIR}"/test-results.*.xml artifacts/ || : @@ -73,7 +73,8 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D LLVM_LIT_ARGS="${lit_args}" \ -D LLVM_ENABLE_LLD=ON \ -D CMAKE_CXX_FLAGS=-gmlt \ - -D LLVM_CCACHE_BUILD=ON \ + -D CMAKE_C_COMPILER_LAUNCHER=sccache \ + -D CMAKE_CXX_COMPILER_LAUNCHER=sccache \ -D LIBCXX_CXX_ABI=libcxxabi \ -D MLIR_ENABLE_BINDINGS_PYTHON=ON \ -D LLDB_ENABLE_PYTHON=ON \ diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index ff63355222065..7b5ecd62080f3 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -37,6 +37,7 @@ jobs: - name: Setup ccache uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 with: + variant: "sccache" max-size: "2000M" - name: Build and Test # Mark the job as a success even if the step fails so that people do From fcb7ed69d0b4b0eb6cf7802b660d92a29bca718f Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 17 Jul 2025 10:15:11 -0700 Subject: [PATCH 207/813] [CI][Github] Use newer sccache version in CI container I was having trouble with the version that ships in the ubuntu apt repository and GCS based caching. The newer version works, so reintroduce the infra that we had in 2c1d4b0404187f0162d3b2df64dae062e53c3c79 to download it. 
Reviewers: tstellar, lnihlen, gburgessiv, dschuff, cmtice, Keenuts Reviewed By: cmtice, Keenuts Pull Request: https://github.com/llvm/llvm-project/pull/149196 --- .../containers/github-action-ci/Dockerfile | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile index efe08ebc221c5..69c71f638e2ac 100644 --- a/.github/workflows/containers/github-action-ci/Dockerfile +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -63,11 +63,21 @@ RUN apt-get update && \ python3-pip \ ccache \ file \ - tzdata \ - sccache && \ + tzdata && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# We need sccache for caching. We cannot use the apt repository version because +# it is too old and has bugs related to features we require (particularly GCS +# caching), so we manually install it here. +# TODO(boomanaiden154): We should return to installing this from the apt +# repository once a version containing the necessary bug fixes is available. 
+RUN curl -L 'https://github.com/mozilla/sccache/releases/download/v0.10.0/sccache-v0.10.0-x86_64-unknown-linux-musl.tar.gz' > /tmp/sccache.tar.gz && \ + echo "1fbb35e135660d04a2d5e42b59c7874d39b3deb17de56330b25b713ec59f849b /tmp/sccache.tar.gz" | sha256sum -c && \ + tar xzf /tmp/sccache.tar.gz -O --wildcards '*/sccache' > '/usr/local/bin/sccache' && \ + rm /tmp/sccache.tar.gz && \ + chmod +x /usr/local/bin/sccache + ENV LLVM_SYSROOT=$LLVM_SYSROOT ENV PATH=${LLVM_SYSROOT}/bin:${PATH} From b84f72a7f51e2ea829feb12ecbb8be0cfc835e2c Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Thu, 17 Jul 2025 19:16:04 +0200 Subject: [PATCH 208/813] [CIR] Upstream Unary Inc/Dec for ComplexType (#149162) This change adds support for unary inc/dec operators for ComplexType https://github.com/llvm/llvm-project/issues/141365 --- clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 49 ++++ clang/lib/CIR/CodeGen/CIRGenFunction.h | 6 + .../Dialect/Transforms/LoweringPrepare.cpp | 3 +- clang/test/CIR/CodeGen/complex-unary.cpp | 270 +++++++++++++++--- 4 files changed, 290 insertions(+), 38 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 9f36be5397ad8..0a22771378ff1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -56,6 +56,26 @@ class ComplexExprEmitter : public StmtVisitor { mlir::Value VisitParenExpr(ParenExpr *e); mlir::Value VisitSubstNonTypeTemplateParmExpr(SubstNonTypeTemplateParmExpr *e); + + mlir::Value VisitPrePostIncDec(const UnaryOperator *e, bool isInc, + bool isPre); + + mlir::Value VisitUnaryPostDec(const UnaryOperator *e) { + return VisitPrePostIncDec(e, false, false); + } + + mlir::Value VisitUnaryPostInc(const UnaryOperator *e) { + return VisitPrePostIncDec(e, true, false); + } + + mlir::Value VisitUnaryPreDec(const UnaryOperator *e) { + return VisitPrePostIncDec(e, false, true); + } + + mlir::Value VisitUnaryPreInc(const UnaryOperator *e) { + 
return VisitPrePostIncDec(e, true, true); + } + mlir::Value VisitUnaryDeref(const Expr *e); mlir::Value VisitUnaryNot(const UnaryOperator *e); @@ -334,6 +354,12 @@ mlir::Value ComplexExprEmitter::VisitSubstNonTypeTemplateParmExpr( return Visit(e->getReplacement()); } +mlir::Value ComplexExprEmitter::VisitPrePostIncDec(const UnaryOperator *e, + bool isInc, bool isPre) { + LValue lv = cgf.emitLValue(e->getSubExpr()); + return cgf.emitComplexPrePostIncDec(e, lv, isInc, isPre); +} + mlir::Value ComplexExprEmitter::VisitUnaryDeref(const Expr *e) { return emitLoadOfLValue(e); } @@ -422,6 +448,29 @@ mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) { return ComplexExprEmitter(*this).Visit(const_cast(e)); } +mlir::Value CIRGenFunction::emitComplexPrePostIncDec(const UnaryOperator *e, + LValue lv, bool isInc, + bool isPre) { + mlir::Value inVal = emitLoadOfComplex(lv, e->getExprLoc()); + mlir::Location loc = getLoc(e->getExprLoc()); + auto opKind = isInc ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; + mlir::Value incVal = builder.createUnaryOp(loc, opKind, inVal); + + // Store the updated result through the lvalue. + emitStoreOfComplex(loc, incVal, lv, /*isInit=*/false); + + if (getLangOpts().OpenMP) + cgm.errorNYI(loc, "emitComplexPrePostIncDec OpenMP"); + + // If this is a postinc, return the value read from memory, otherwise use the + // updated value. + return isPre ? 
incVal : inVal; +} + +mlir::Value CIRGenFunction::emitLoadOfComplex(LValue src, SourceLocation loc) { + return ComplexExprEmitter(*this).emitLoadOfLValue(src, loc); +} + void CIRGenFunction::emitStoreOfComplex(mlir::Location loc, mlir::Value v, LValue dest, bool isInit) { ComplexExprEmitter(*this).emitStoreOfComplex(loc, v, dest, isInit); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 3baabba5adfe1..9541f4f0725eb 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -930,6 +930,9 @@ class CIRGenFunction : public CIRGenTypeCache { /// returning the result. mlir::Value emitComplexExpr(const Expr *e); + mlir::Value emitComplexPrePostIncDec(const UnaryOperator *e, LValue lv, + bool isInc, bool isPre); + LValue emitComplexAssignmentLValue(const BinaryOperator *e); void emitCompoundStmt(const clang::CompoundStmt &s); @@ -980,6 +983,9 @@ class CIRGenFunction : public CIRGenTypeCache { RValue emitLoadOfBitfieldLValue(LValue lv, SourceLocation loc); + /// Load a complex number from the specified l-value. + mlir::Value emitLoadOfComplex(LValue src, SourceLocation loc); + /// Given an expression that represents a value lvalue, this method emits /// the address of the lvalue, then loads the result as an rvalue, /// returning the rvalue. 
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index c708cf9d9fa61..8f848c7345610 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -50,7 +50,8 @@ void LoweringPreparePass::lowerUnaryOp(cir::UnaryOp op) { switch (opKind) { case cir::UnaryOpKind::Inc: case cir::UnaryOpKind::Dec: - llvm_unreachable("Complex unary Inc/Dec NYI"); + resultReal = builder.createUnaryOp(loc, opKind, operandReal); + resultImag = operandImag; break; case cir::UnaryOpKind::Plus: diff --git a/clang/test/CIR/CodeGen/complex-unary.cpp b/clang/test/CIR/CodeGen/complex-unary.cpp index 33f3c2fa895d3..676b5546d28e0 100644 --- a/clang/test/CIR/CodeGen/complex-unary.cpp +++ b/clang/test/CIR/CodeGen/complex-unary.cpp @@ -10,40 +10,40 @@ void foo() { int _Complex b = ~a; } -// CIR-BEFORE: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] -// CIR-BEFORE: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] -// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.unary(not, %[[TMP]]) : !cir.complex, !cir.complex -// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[RESULT]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[B_ADDR]] : !cir.complex, !cir.ptr> -// CIR-AFTER: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] -// CIR-AFTER: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] -// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca 
!cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !s32i // CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !s32i // CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !s32i, !s32i // CIR-AFTER: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !s32i -> !cir.complex -// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[RESULT]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[B_ADDR]] : !cir.complex, !cir.ptr> -// LLVM: %[[COMPLEX:.*]] = alloca { i32, i32 }, i64 1, align 4 -// LLVM: %[[RESULT:.*]] = alloca { i32, i32 }, i64 1, align 4 -// LLVM: %[[TMP:.*]] = load { i32, i32 }, ptr %[[COMPLEX]], align 4 +// LLVM: %[[A_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { i32, i32 }, ptr %[[A_ADDR]], align 4 // LLVM: %[[REAL:.*]] = extractvalue { i32, i32 } %[[TMP]], 0 // LLVM: %[[IMAG:.*]] = extractvalue { i32, i32 } %[[TMP]], 1 // LLVM: %[[IMAG_MINUS:.*]] = sub i32 0, %[[IMAG]] // LLVM: %[[RESULT_TMP:.*]] = insertvalue { i32, i32 } {{.*}}, i32 %[[REAL]], 0 // LLVM: %[[RESULT_VAL:.*]] = insertvalue { i32, i32 } %[[RESULT_TMP]], i32 %[[IMAG_MINUS]], 1 -// LLVM: store { i32, i32 } %[[RESULT_VAL]], ptr %[[RESULT]], align 4 +// LLVM: store { i32, i32 } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 -// OGCG: %[[COMPLEX:.*]] = alloca { i32, i32 }, align 4 -// OGCG: %[[RESULT:.*]] = alloca { i32, i32 }, align 4 -// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 0 +// OGCG: %[[A_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 0 // OGCG: %[[A_REAL:.*]] = load i32, ptr 
%[[A_REAL_PTR]], align 4 -// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 1 // OGCG: %[[A_IMAG:.*]] = load i32, ptr %[[A_IMAG_PTR]], align 4 // OGCG: %[[A_IMAG_MINUS:.*]] = sub i32 0, %[[A_IMAG]] -// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[RESULT]], i32 0, i32 0 -// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[RESULT]], i32 0, i32 1 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[B_ADDR]], i32 0, i32 1 // OGCG: store i32 %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 // OGCG: store i32 %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 @@ -52,39 +52,235 @@ void foo2() { float _Complex b = ~a; } -// CIR-BEFORE: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] -// CIR-BEFORE: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] -// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.unary(not, %[[TMP]]) : !cir.complex, !cir.complex -// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[RESULT]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[B_ADDR]] : !cir.complex, !cir.ptr> -// CIR-AFTER: %[[COMPLEX:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] -// CIR-AFTER: %[[RESULT:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] -// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[A_ADDR:.*]] = 
cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex // CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float // CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float // CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float // CIR-AFTER: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex -// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[RESULT]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[B_ADDR]] : !cir.complex, !cir.ptr> -// LLVM: %[[COMPLEX:.*]] = alloca { float, float }, i64 1, align 4 -// LLVM: %[[RESULT:.*]] = alloca { float, float }, i64 1, align 4 -// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[COMPLEX]], align 4 +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 // LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 // LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 // LLVM: %[[IMAG_MINUS:.*]] = fneg float %[[IMAG]] // LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0 // LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG_MINUS]], 1 -// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[RESULT]], align 4 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 -// OGCG: %[[COMPLEX:.*]] = alloca { float, float }, align 4 -// OGCG: %[[RESULT:.*]] = alloca { float, float }, align 4 -// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 0 +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { 
float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 // OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 -// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[COMPLEX]], i32 0, i32 1 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 // OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 -// OGCG: %[[A_IMAG_MINUS:.*]] = fneg float %[[A_IMAG]] -// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 0 -// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT]], i32 0, i32 1 +// OGCG: %[[A_IMAG_MINUS:.*]] = fneg float %[[A_IMAG]] +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 // OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 // OGCG: store float %[[A_IMAG_MINUS]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo3() { + float _Complex a; + float _Complex b = a++; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_INC:.*]] = cir.unary(inc, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: 
%[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[REAL_INC:.*]] = cir.unary(inc, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_INC]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_INC:.*]] = fadd float 1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_INC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[TMP]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_INC:.*]] = fadd float %[[A_REAL]], 1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 
1 +// OGCG: store float %[[A_REAL_INC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo4() { + float _Complex a; + float _Complex b = ++a; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_INC:.*]] = cir.unary(inc, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[REAL_INC:.*]] = cir.unary(inc, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_INC]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: 
%[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_INC:.*]] = fadd float 1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_INC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_INC:.*]] = fadd float %[[A_REAL]], 1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_INC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_INC]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo5() { + float _Complex a; + float _Complex b = a--; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// 
CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_DEC:.*]] = cir.unary(dec, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[REAL_DEC:.*]] = cir.unary(dec, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_DEC]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_DEC:.*]] = fadd float -1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_DEC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[TMP]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: 
%[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_DEC:.*]] = fadd float %[[A_REAL]], -1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_DEC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 + +void foo6() { + float _Complex a; + float _Complex b = --a; +} + +// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-BEFORE: %[[COMPLEX_DEC:.*]] = cir.unary(dec, %[[TMP]]) : !cir.complex, !cir.complex +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a"] +// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["b", init] +// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr>, !cir.complex +// CIR-AFTER: 
%[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex -> !cir.float +// CIR-AFTER: %[[REAL_DEC:.*]] = cir.unary(dec, %[[REAL]]) : !cir.float, !cir.float +// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_DEC]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex, !cir.ptr> +// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[B_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4 +// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0 +// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1 +// LLVM: %[[REAL_DEC:.*]] = fadd float -1.000000e+00, %[[REAL]] +// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL_DEC]], 0 +// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[A_ADDR]], align 4 +// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_REAL:.*]] = load float, ptr %[[A_REAL_PTR]], align 4 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: %[[A_IMAG:.*]] = load float, ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_DEC:.*]] = fadd float %[[A_REAL]], -1.000000e+00 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr 
%[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_DEC]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[A_IMAG_PTR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[B_ADDR]], i32 0, i32 1 +// OGCG: store float %[[A_REAL_DEC]], ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: store float %[[A_IMAG]], ptr %[[RESULT_IMAG_PTR]], align 4 From 46c059f925d18afc99db90dd9b9be989f7f62536 Mon Sep 17 00:00:00 2001 From: Robert Konicar Date: Thu, 17 Jul 2025 19:20:31 +0200 Subject: [PATCH 209/813] [mlir][LLVMIR] Add IFuncOp to LLVM dialect (#147697) Add IFunc to LLVM dialect and add support for lifting/exporting LLVMIR IFunc. --- .../include/mlir/Dialect/LLVMIR/LLVMDialect.h | 3 + mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 65 ++++++++++ .../include/mlir/Target/LLVMIR/ModuleImport.h | 5 + .../mlir/Target/LLVMIR/ModuleTranslation.h | 11 ++ mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 119 +++++++++++++++--- .../LLVMIR/LLVMToLLVMIRTranslation.cpp | 24 +++- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 48 ++++++- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 34 ++++- mlir/test/Dialect/LLVMIR/ifunc.mlir | 40 ++++++ mlir/test/Dialect/LLVMIR/invalid.mlir | 27 ++++ mlir/test/Target/LLVMIR/Import/ifunc.ll | 63 ++++++++++ mlir/test/Target/LLVMIR/ifunc.mlir | 70 +++++++++++ 12 files changed, 478 insertions(+), 31 deletions(-) create mode 100644 mlir/test/Dialect/LLVMIR/ifunc.mlir create mode 100644 mlir/test/Target/LLVMIR/Import/ifunc.ll create mode 100644 mlir/test/Target/LLVMIR/ifunc.mlir diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h index 63e007cdc335c..e355bb8f5ddae 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h @@ -223,6 +223,9 @@ Value createGlobalString(Location loc, 
OpBuilder &builder, StringRef name, /// function confirms that the Operation has the desired properties. bool satisfiesLLVMModule(Operation *op); +/// Lookup parent Module satisfying LLVM conditions on the Module Operation. +Operation *parentLLVMModule(Operation *op); + /// Convert an array of integer attributes to a vector of integers that can be /// used as indices in LLVM operations. template diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index f4c1640098320..4a9bc90e43d96 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -1285,6 +1285,10 @@ def LLVM_AddressOfOp : LLVM_Op<"mlir.addressof", /// Return the llvm.mlir.alias operation that defined the value referenced /// here. AliasOp getAlias(SymbolTableCollection &symbolTable); + + /// Return the llvm.mlir.ifunc operation that defined the value referenced + /// here. + IFuncOp getIFunc(SymbolTableCollection &symbolTable); }]; let assemblyFormat = "$global_name attr-dict `:` qualified(type($res))"; @@ -1601,6 +1605,67 @@ def LLVM_AliasOp : LLVM_Op<"mlir.alias", let hasRegionVerifier = 1; } +def LLVM_IFuncOp : LLVM_Op<"mlir.ifunc", + [IsolatedFromAbove, Symbol, DeclareOpInterfaceMethods]> { + let arguments = (ins + SymbolNameAttr:$sym_name, + TypeAttr:$i_func_type, + FlatSymbolRefAttr:$resolver, + TypeAttr:$resolver_type, + Linkage:$linkage, + UnitAttr:$dso_local, + DefaultValuedAttr, "0">:$address_space, + DefaultValuedAttr:$unnamed_addr, + DefaultValuedAttr:$visibility_ + ); + let summary = "LLVM dialect ifunc"; + let description = [{ + `llvm.mlir.ifunc` is a top level operation that defines a global ifunc. + It defines a new symbol and takes a symbol refering to a resolver function. + IFuncs can be called as regular functions. The function type is the same + as the IFuncType. The symbol is resolved at runtime by calling a resolver + function. 
+ + Examples: + + ```mlir + // IFuncs resolve a symbol at runtime using a resovler function. + llvm.mlir.ifunc external @foo: !llvm.func, !llvm.ptr @resolver + + llvm.func @foo_1(i64) -> f32 + llvm.func @foo_2(i64) -> f32 + + llvm.func @resolve_foo() -> !llvm.ptr attributes { + %0 = llvm.mlir.addressof @foo_2 : !llvm.ptr + %1 = llvm.mlir.addressof @foo_1 : !llvm.ptr + + // ... Logic selecting from foo_{1, 2} + + // Return function pointer to the selected function + llvm.return %7 : !llvm.ptr + } + + llvm.func @use_foo() { + // IFuncs are called as regular functions + %res = llvm.call @foo(%value) : i64 -> f32 + } + ``` + }]; + + let builders = [ + OpBuilder<(ins "StringRef":$name, "Type":$i_func_type, + "StringRef":$resolver, "Type":$resolver_type, + "Linkage":$linkage, "LLVM::Visibility":$visibility)> + ]; + + let assemblyFormat = [{ + custom($linkage) ($visibility_^)? ($unnamed_addr^)? + $sym_name `:` $i_func_type `,` $resolver_type $resolver attr-dict + }]; + let hasVerifier = 1; +} + + def LLVM_DSOLocalEquivalentOp : LLVM_Op<"dso_local_equivalent", [Pure, ConstantLike, DeclareOpInterfaceMethods]> { let arguments = (ins FlatSymbolRefAttr:$function_name); diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h index 9902c6bb15caf..c484072ffaa80 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h @@ -71,6 +71,9 @@ class ModuleImport { /// Converts all aliases of the LLVM module to MLIR variables. LogicalResult convertAliases(); + /// Converts all ifuncs of the LLVM module to MLIR variables. + LogicalResult convertIFuncs(); + /// Converts the data layout of the LLVM module to an MLIR data layout /// specification. LogicalResult convertDataLayout(); @@ -320,6 +323,8 @@ class ModuleImport { /// Converts an LLVM global alias variable into an MLIR LLVM dialect alias /// operation if a conversion exists. Otherwise, returns failure. 
LogicalResult convertAlias(llvm::GlobalAlias *alias); + // Converts an LLVM global ifunc into an MLIR LLVM dialect ifunc operation. + LogicalResult convertIFunc(llvm::GlobalIFunc *ifunc); /// Returns personality of `func` as a FlatSymbolRefAttr. FlatSymbolRefAttr getPersonalityAsAttr(llvm::Function *func); /// Imports `bb` into `block`, which must be initially empty. diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h index 5d52cf3f04b6a..f3f73f49f199a 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleTranslation.h @@ -260,6 +260,12 @@ class ModuleTranslation { return aliasesMapping.lookup(op); } + /// Finds an LLVM IR global value that corresponds to the given MLIR operation + /// defining an IFunc. + llvm::GlobalValue *lookupIFunc(Operation *op) { + return ifuncMapping.lookup(op); + } + /// Returns the OpenMP IR builder associated with the LLVM IR module being /// constructed. llvm::OpenMPIRBuilder *getOpenMPBuilder(); @@ -345,6 +351,7 @@ class ModuleTranslation { bool recordInsertions = false); LogicalResult convertFunctionSignatures(); LogicalResult convertFunctions(); + LogicalResult convertIFuncs(); LogicalResult convertComdats(); LogicalResult convertUnresolvedBlockAddress(); @@ -406,6 +413,10 @@ class ModuleTranslation { /// aliases. DenseMap aliasesMapping; + /// Mappings between llvm.mlir.ifunc definitions and corresponding global + /// ifuncs. + DenseMap ifuncMapping; + /// A stateful object used to translate types. 
TypeToLLVMIRTranslator typeTranslator; diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 62dce32bc4531..4a1527cd0369f 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -130,6 +130,17 @@ static RetTy parseOptionalLLVMKeyword(OpAsmParser &parser, return static_cast(index); } +static void printLLVMLinkage(OpAsmPrinter &p, Operation *, LinkageAttr val) { + p << stringifyLinkage(val.getLinkage()); +} + +static ParseResult parseLLVMLinkage(OpAsmParser &p, LinkageAttr &val) { + val = LinkageAttr::get( + p.getContext(), + parseOptionalLLVMKeyword(p, LLVM::Linkage::External)); + return success(); +} + //===----------------------------------------------------------------------===// // Operand bundle helpers. //===----------------------------------------------------------------------===// @@ -1166,14 +1177,17 @@ LogicalResult CallOp::verifySymbolUses(SymbolTableCollection &symbolTable) { return emitOpError() << "'" << calleeName.getValue() << "' does not reference a symbol in the current scope"; - auto fn = dyn_cast(callee); - if (!fn) - return emitOpError() << "'" << calleeName.getValue() - << "' does not reference a valid LLVM function"; - - if (failed(verifyCallOpDebugInfo(*this, fn))) - return failure(); - fnType = fn.getFunctionType(); + if (auto fn = dyn_cast(callee)) { + if (failed(verifyCallOpDebugInfo(*this, fn))) + return failure(); + fnType = fn.getFunctionType(); + } else if (auto ifunc = dyn_cast(callee)) { + fnType = ifunc.getIFuncType(); + } else { + return emitOpError() + << "'" << calleeName.getValue() + << "' does not reference a valid LLVM function or IFunc"; + } } LLVMFunctionType funcType = llvm::dyn_cast(fnType); @@ -2029,14 +2043,6 @@ LogicalResult ReturnOp::verify() { // LLVM::AddressOfOp. 
//===----------------------------------------------------------------------===// -static Operation *parentLLVMModule(Operation *op) { - Operation *module = op->getParentOp(); - while (module && !satisfiesLLVMModule(module)) - module = module->getParentOp(); - assert(module && "unexpected operation outside of a module"); - return module; -} - GlobalOp AddressOfOp::getGlobal(SymbolTableCollection &symbolTable) { return dyn_cast_or_null( symbolTable.lookupSymbolIn(parentLLVMModule(*this), getGlobalNameAttr())); @@ -2052,6 +2058,11 @@ AliasOp AddressOfOp::getAlias(SymbolTableCollection &symbolTable) { symbolTable.lookupSymbolIn(parentLLVMModule(*this), getGlobalNameAttr())); } +IFuncOp AddressOfOp::getIFunc(SymbolTableCollection &symbolTable) { + return dyn_cast_or_null( + symbolTable.lookupSymbolIn(parentLLVMModule(*this), getGlobalNameAttr())); +} + LogicalResult AddressOfOp::verifySymbolUses(SymbolTableCollection &symbolTable) { Operation *symbol = @@ -2060,10 +2071,11 @@ AddressOfOp::verifySymbolUses(SymbolTableCollection &symbolTable) { auto global = dyn_cast_or_null(symbol); auto function = dyn_cast_or_null(symbol); auto alias = dyn_cast_or_null(symbol); + auto ifunc = dyn_cast_or_null(symbol); - if (!global && !function && !alias) + if (!global && !function && !alias && !ifunc) return emitOpError("must reference a global defined by 'llvm.mlir.global', " - "'llvm.mlir.alias' or 'llvm.func'"); + "'llvm.mlir.alias' or 'llvm.func' or 'llvm.mlir.ifunc'"); LLVMPointerType type = getType(); if ((global && global.getAddrSpace() != type.getAddressSpace()) || @@ -2673,6 +2685,69 @@ unsigned AliasOp::getAddrSpace() { return ptrTy.getAddressSpace(); } +//===----------------------------------------------------------------------===// +// IFuncOp +//===----------------------------------------------------------------------===// + +void IFuncOp::build(OpBuilder &builder, OperationState &result, StringRef name, + Type iFuncType, StringRef resolverName, Type resolverType, + 
Linkage linkage, LLVM::Visibility visibility) { + return build(builder, result, name, iFuncType, resolverName, resolverType, + linkage, /*dso_local=*/false, /*address_space=*/0, + UnnamedAddr::None, visibility); +} + +LogicalResult IFuncOp::verifySymbolUses(SymbolTableCollection &symbolTable) { + Operation *symbol = + symbolTable.lookupSymbolIn(parentLLVMModule(*this), getResolverAttr()); + // This matches LLVM IR verification logic, see llvm/lib/IR/Verifier.cpp + auto resolver = dyn_cast(symbol); + auto alias = dyn_cast(symbol); + while (alias) { + Block &initBlock = alias.getInitializerBlock(); + auto returnOp = cast(initBlock.getTerminator()); + auto addrOp = dyn_cast(returnOp.getArg().getDefiningOp()); + // FIXME: This is a best effort solution. The AliasOp body might be more + // complex and in that case we bail out with success. To completely match + // the LLVM IR logic it would be necessary to implement proper alias and + // cast stripping. + if (!addrOp) + return success(); + resolver = addrOp.getFunction(symbolTable); + alias = addrOp.getAlias(symbolTable); + } + if (!resolver) + return emitOpError("must have a function resolver"); + Linkage linkage = resolver.getLinkage(); + if (resolver.isExternal() || linkage == Linkage::AvailableExternally) + return emitOpError("resolver must be a definition"); + if (!isa(resolver.getFunctionType().getReturnType())) + return emitOpError("resolver must return a pointer"); + auto resolverPtr = dyn_cast(getResolverType()); + if (!resolverPtr || resolverPtr.getAddressSpace() != getAddressSpace()) + return emitOpError("resolver has incorrect type"); + return success(); +} + +LogicalResult IFuncOp::verify() { + switch (getLinkage()) { + case Linkage::External: + case Linkage::Internal: + case Linkage::Private: + case Linkage::Weak: + case Linkage::WeakODR: + case Linkage::Linkonce: + case Linkage::LinkonceODR: + break; + default: + return emitOpError() << "'" << stringifyLinkage(getLinkage()) + << "' linkage not supported 
in ifuncs, available " + "options: private, internal, linkonce, weak, " + "linkonce_odr, weak_odr, or external linkage"; + } + return success(); +} + //===----------------------------------------------------------------------===// // ShuffleVectorOp //===----------------------------------------------------------------------===// @@ -4320,3 +4395,11 @@ bool mlir::LLVM::satisfiesLLVMModule(Operation *op) { return op->hasTrait() && op->hasTrait(); } + +Operation *mlir::LLVM::parentLLVMModule(Operation *op) { + Operation *module = op->getParentOp(); + while (module && !satisfiesLLVMModule(module)) + module = module->getParentOp(); + assert(module && "unexpected operation outside of a module"); + return module; +} diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 70029d7e15a90..ff34a0825215c 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -422,9 +422,18 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder, ArrayRef operandsRef(operands); llvm::CallInst *call; if (auto attr = callOp.getCalleeAttr()) { - call = - builder.CreateCall(moduleTranslation.lookupFunction(attr.getValue()), - operandsRef, opBundles); + if (llvm::Function *function = + moduleTranslation.lookupFunction(attr.getValue())) { + call = builder.CreateCall(function, operandsRef, opBundles); + } else { + Operation *moduleOp = parentLLVMModule(&opInst); + Operation *ifuncOp = + moduleTranslation.symbolTable().lookupSymbolIn(moduleOp, attr); + llvm::GlobalValue *ifunc = moduleTranslation.lookupIFunc(ifuncOp); + llvm::FunctionType *calleeType = llvm::cast( + moduleTranslation.convertType(callOp.getCalleeFunctionType())); + call = builder.CreateCall(calleeType, ifunc, operandsRef, opBundles); + } } else { llvm::FunctionType *calleeType = llvm::cast( 
moduleTranslation.convertType(callOp.getCalleeFunctionType())); @@ -648,18 +657,21 @@ convertOperationImpl(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::LLVMFuncOp function = addressOfOp.getFunction(moduleTranslation.symbolTable()); LLVM::AliasOp alias = addressOfOp.getAlias(moduleTranslation.symbolTable()); + LLVM::IFuncOp ifunc = addressOfOp.getIFunc(moduleTranslation.symbolTable()); // The verifier should not have allowed this. - assert((global || function || alias) && - "referencing an undefined global, function, or alias"); + assert((global || function || alias || ifunc) && + "referencing an undefined global, function, alias, or ifunc"); llvm::Value *llvmValue = nullptr; if (global) llvmValue = moduleTranslation.lookupGlobal(global); else if (alias) llvmValue = moduleTranslation.lookupAlias(alias); - else + else if (function) llvmValue = moduleTranslation.lookupFunction(function.getName()); + else + llvmValue = moduleTranslation.lookupIFunc(ifunc); moduleTranslation.mapValue(addressOfOp.getResult(), llvmValue); return success(); diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index bfda223fe0f5f..c807985756539 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1031,6 +1031,16 @@ LogicalResult ModuleImport::convertAliases() { return success(); } +LogicalResult ModuleImport::convertIFuncs() { + for (llvm::GlobalIFunc &ifunc : llvmModule->ifuncs()) { + if (failed(convertIFunc(&ifunc))) { + return emitError(UnknownLoc::get(context)) + << "unhandled global ifunc: " << diag(ifunc); + } + } + return success(); +} + LogicalResult ModuleImport::convertDataLayout() { Location loc = mlirModule.getLoc(); DataLayoutImporter dataLayoutImporter(context, llvmModule->getDataLayout()); @@ -1369,6 +1379,21 @@ LogicalResult ModuleImport::convertAlias(llvm::GlobalAlias *alias) { return success(); } +LogicalResult ModuleImport::convertIFunc(llvm::GlobalIFunc *ifunc) { + 
OpBuilder::InsertionGuard guard = setGlobalInsertionPoint(); + + Type type = convertType(ifunc->getValueType()); + llvm::Constant *resolver = ifunc->getResolver(); + Type resolverType = convertType(resolver->getType()); + builder.create(mlirModule.getLoc(), ifunc->getName(), type, + resolver->getName(), resolverType, + convertLinkageFromLLVM(ifunc->getLinkage()), + ifunc->isDSOLocal(), ifunc->getAddressSpace(), + convertUnnamedAddrFromLLVM(ifunc->getUnnamedAddr()), + convertVisibilityFromLLVM(ifunc->getVisibility())); + return success(); +} + LogicalResult ModuleImport::convertGlobal(llvm::GlobalVariable *globalVar) { // Insert the global after the last one or at the start of the module. OpBuilder::InsertionGuard guard = setGlobalInsertionPoint(); @@ -1973,8 +1998,9 @@ ModuleImport::convertCallOperands(llvm::CallBase *callInst, // treated as indirect calls to constant operands that need to be converted. // Skip the callee operand if it's inline assembly, as it's handled separately // in InlineAsmOp. - if (!isa(callInst->getCalledOperand()) && !isInlineAsm) { - FailureOr called = convertValue(callInst->getCalledOperand()); + llvm::Value *calleeOperand = callInst->getCalledOperand(); + if (!isa(calleeOperand) && !isInlineAsm) { + FailureOr called = convertValue(calleeOperand); if (failed(called)) return failure(); operands.push_back(*called); @@ -2035,12 +2061,20 @@ ModuleImport::convertFunctionType(llvm::CallBase *callInst, if (failed(callType)) return failure(); auto *callee = dyn_cast(calledOperand); + + llvm::FunctionType *origCalleeType = nullptr; + if (callee) { + origCalleeType = callee->getFunctionType(); + } else if (auto *ifunc = dyn_cast(calledOperand)) { + origCalleeType = cast(ifunc->getValueType()); + } + // For indirect calls, return the type of the call itself. 
- if (!callee) + if (!origCalleeType) return callType; FailureOr calleeType = - castOrFailure(convertType(callee->getFunctionType())); + castOrFailure(convertType(origCalleeType)); if (failed(calleeType)) return failure(); @@ -2059,8 +2093,8 @@ ModuleImport::convertFunctionType(llvm::CallBase *callInst, FlatSymbolRefAttr ModuleImport::convertCalleeName(llvm::CallBase *callInst) { llvm::Value *calledOperand = callInst->getCalledOperand(); - if (auto *callee = dyn_cast(calledOperand)) - return SymbolRefAttr::get(context, callee->getName()); + if (isa(calledOperand)) + return SymbolRefAttr::get(context, calledOperand->getName()); return {}; } @@ -3162,6 +3196,8 @@ OwningOpRef mlir::translateLLVMIRToModule( return {}; if (failed(moduleImport.convertAliases())) return {}; + if (failed(moduleImport.convertIFuncs())) + return {}; moduleImport.convertTargetTriple(); return module; } diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 8908703cc1368..b997e559885e2 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -791,6 +791,8 @@ void ModuleTranslation::forgetMapping(Region ®ion) { globalsMapping.erase(&op); if (isa(op)) aliasesMapping.erase(&op); + if (isa(op)) + ifuncMapping.erase(&op); if (isa(op)) callMapping.erase(&op); llvm::append_range( @@ -1868,6 +1870,33 @@ LogicalResult ModuleTranslation::convertFunctions() { return success(); } +LogicalResult ModuleTranslation::convertIFuncs() { + for (auto op : getModuleBody(mlirModule).getOps()) { + llvm::Type *type = convertType(op.getIFuncType()); + llvm::GlobalValue::LinkageTypes linkage = + convertLinkageToLLVM(op.getLinkage()); + llvm::Constant *resolver; + if (auto *resolverFn = lookupFunction(op.getResolver())) { + resolver = cast(resolverFn); + } else { + Operation *aliasOp = symbolTable().lookupSymbolIn(parentLLVMModule(op), + op.getResolverAttr()); + resolver = cast(lookupAlias(aliasOp)); + } + + 
auto *ifunc = + llvm::GlobalIFunc::create(type, op.getAddressSpace(), linkage, + op.getSymName(), resolver, llvmModule.get()); + addRuntimePreemptionSpecifier(op.getDsoLocal(), ifunc); + ifunc->setUnnamedAddr(convertUnnamedAddrToLLVM(op.getUnnamedAddr())); + ifunc->setVisibility(convertVisibilityToLLVM(op.getVisibility_())); + + ifuncMapping.try_emplace(op, ifunc); + } + + return success(); +} + LogicalResult ModuleTranslation::convertComdats() { for (auto comdatOp : getModuleBody(mlirModule).getOps()) { for (auto selectorOp : comdatOp.getOps()) { @@ -2284,6 +2313,8 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, return nullptr; if (failed(translator.convertGlobalsAndAliases())) return nullptr; + if (failed(translator.convertIFuncs())) + return nullptr; if (failed(translator.createTBAAMetadata())) return nullptr; if (failed(translator.createIdentMetadata())) @@ -2296,7 +2327,8 @@ mlir::translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, // Convert other top-level operations if possible. 
for (Operation &o : getModuleBody(module).getOperations()) { if (!isa(&o) && + LLVM::GlobalCtorsOp, LLVM::GlobalDtorsOp, LLVM::ComdatOp, + LLVM::IFuncOp>(&o) && !o.hasTrait() && failed(translator.convertOperation(o, llvmBuilder))) { return nullptr; diff --git a/mlir/test/Dialect/LLVMIR/ifunc.mlir b/mlir/test/Dialect/LLVMIR/ifunc.mlir new file mode 100644 index 0000000000000..33e21bab0d51b --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/ifunc.mlir @@ -0,0 +1,40 @@ +// RUN: mlir-opt %s -split-input-file --verify-roundtrip | FileCheck %s + +// CHECK: llvm.mlir.ifunc external @ifunc : !llvm.func, !llvm.ptr @resolver +llvm.mlir.ifunc @ifunc : !llvm.func, !llvm.ptr @resolver +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: llvm.mlir.ifunc linkonce_odr hidden @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.mlir.ifunc linkonce_odr hidden @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: llvm.mlir.ifunc private @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.mlir.ifunc private @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: llvm.mlir.ifunc weak @ifunc : !llvm.func, !llvm.ptr @resolver +llvm.mlir.ifunc weak @ifunc : !llvm.func, !llvm.ptr @resolver +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index bd1106e304c60..7f2c8c72e5cf9 100644 --- 
a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -1931,3 +1931,30 @@ llvm.func @invalid_xevm_matrix_3(%a: !llvm.ptr<1>, %base_width_a: i32, %base_hei llvm.return %loaded_a : vector<8xi16> } +// ----- + +llvm.func external @resolve_foo() -> !llvm.ptr attributes {dso_local} +// expected-error@+1 {{'llvm.mlir.ifunc' op resolver must be a definition}} +llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @resolve_foo {dso_local} + +// ----- + +llvm.mlir.global external @resolve_foo() : !llvm.ptr +// expected-error@+1 {{'llvm.mlir.ifunc' op must have a function resolver}} +llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @resolve_foo {dso_local} + +// ----- + +llvm.func external @resolve_foo() -> !llvm.ptr +// expected-error@+1 {{'llvm.mlir.ifunc' op 'common' linkage not supported in ifuncs}} +llvm.mlir.ifunc common @foo : !llvm.func, !llvm.ptr @resolve_foo {dso_local} + +// ----- + +llvm.mlir.global external @resolve_foo() : !llvm.ptr +llvm.mlir.alias external @alias_resolver : !llvm.ptr { + %0 = llvm.mlir.addressof @resolve_foo : !llvm.ptr + llvm.return %0 : !llvm.ptr +} +// expected-error@+1 {{'llvm.mlir.ifunc' op must have a function resolver}} +llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @alias_resolver {dso_local} diff --git a/mlir/test/Target/LLVMIR/Import/ifunc.ll b/mlir/test/Target/LLVMIR/Import/ifunc.ll new file mode 100644 index 0000000000000..0cec205dfce68 --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/ifunc.ll @@ -0,0 +1,63 @@ +; RUN: mlir-translate --import-llvm %s --split-input-file | FileCheck %s + +; CHECK: llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @resolve_foo {dso_local} +@foo = dso_local ifunc void (ptr, i32), ptr @resolve_foo + +define dso_local void @call_foo(ptr noundef %0, i32 noundef %1) { +; CHECK: llvm.call @foo + call void @foo(ptr noundef %0, i32 noundef %1) + ret void +} + +define dso_local ptr @foo_fptr() { +; CHECK: [[FPTR:%[0-9]+]] = llvm.mlir.addressof @foo +; CHECK: 
llvm.return [[FPTR]] + ret ptr @foo +} + +define internal ptr @resolve_foo() { + ret ptr @foo_1 +} + +declare void @foo_1(ptr noundef, i32 noundef) + +; // ----- + +define ptr @resolver() { + ret ptr inttoptr (i64 333 to ptr) +} + +@resolver_alias = alias ptr (), ptr @resolver +@resolver_alias_alias = alias ptr (), ptr @resolver_alias + +; CHECK-DAG: llvm.mlir.ifunc external @ifunc : !llvm.func, !llvm.ptr @resolver_alias +@ifunc = ifunc float (i64), ptr @resolver_alias +; CHECK-DAG: llvm.mlir.ifunc external @ifunc2 : !llvm.func, !llvm.ptr @resolver_alias_alias +@ifunc2 = ifunc float (i64), ptr @resolver_alias_alias + +; // ----- + +define ptr @resolver() { + ret ptr inttoptr (i64 333 to ptr) +} + +; CHECK: llvm.mlir.ifunc linkonce_odr hidden @ifunc +@ifunc = linkonce_odr hidden ifunc float (i64), ptr @resolver + +; // ----- + +define ptr @resolver() { + ret ptr inttoptr (i64 333 to ptr) +} + +; CHECK: llvm.mlir.ifunc private @ifunc {{.*}} {dso_local} +@ifunc = private dso_local ifunc float (i64), ptr @resolver + +; // ----- + +define ptr @resolver() { + ret ptr inttoptr (i64 333 to ptr) +} + +; CHECK: llvm.mlir.ifunc weak @ifunc +@ifunc = weak ifunc float (i64), ptr @resolver diff --git a/mlir/test/Target/LLVMIR/ifunc.mlir b/mlir/test/Target/LLVMIR/ifunc.mlir new file mode 100644 index 0000000000000..bba306c1e1ab3 --- /dev/null +++ b/mlir/test/Target/LLVMIR/ifunc.mlir @@ -0,0 +1,70 @@ +// RUN: mlir-translate -mlir-to-llvmir %s --split-input-file | FileCheck %s + +// CHECK: @foo = dso_local ifunc void (ptr, i32), ptr @resolve_foo +llvm.mlir.ifunc external @foo : !llvm.func, !llvm.ptr @resolve_foo {dso_local} +llvm.func @call_foo(%arg0: !llvm.ptr {llvm.noundef}, %arg1: i32 {llvm.noundef}) attributes {dso_local} { +// CHECK: call void @foo + llvm.call @foo(%arg0, %arg1) : (!llvm.ptr {llvm.noundef}, i32 {llvm.noundef}) -> () + llvm.return +} +llvm.func @foo_fptr() -> !llvm.ptr attributes {dso_local} { + %1 = llvm.mlir.addressof @foo : !llvm.ptr +// CHECK: ret ptr @foo 
+ llvm.return %1 : !llvm.ptr +} +llvm.func internal @resolve_foo() -> !llvm.ptr attributes {dso_local} { + %0 = llvm.mlir.addressof @foo_1 : !llvm.ptr + llvm.return %0 : !llvm.ptr +} +llvm.func @foo_1(!llvm.ptr {llvm.noundef}, i32 {llvm.noundef}) + +// ----- + +llvm.mlir.alias external @resolver_alias : !llvm.func { + %0 = llvm.mlir.addressof @resolver : !llvm.ptr + llvm.return %0 : !llvm.ptr +} +llvm.mlir.alias external @resolver_alias_alias : !llvm.func { + %0 = llvm.mlir.addressof @resolver_alias : !llvm.ptr + llvm.return %0 : !llvm.ptr +} + +// CHECK-DAG: @ifunc = ifunc float (i64), ptr @resolver_alias +// CHECK-DAG: @ifunc2 = ifunc float (i64), ptr @resolver_alias_alias +llvm.mlir.ifunc external @ifunc2 : !llvm.func, !llvm.ptr @resolver_alias_alias +llvm.mlir.ifunc external @ifunc : !llvm.func, !llvm.ptr @resolver_alias +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: @ifunc = linkonce_odr hidden ifunc +llvm.mlir.ifunc linkonce_odr hidden @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: @ifunc = private ifunc +llvm.mlir.ifunc private @ifunc : !llvm.func, !llvm.ptr @resolver {dso_local} +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} + +// ----- + +// CHECK: @ifunc = weak ifunc +llvm.mlir.ifunc weak @ifunc : !llvm.func, !llvm.ptr @resolver +llvm.func @resolver() -> !llvm.ptr { + %0 = llvm.mlir.constant(333 : i64) : i64 + %1 = llvm.inttoptr %0 : i64 to !llvm.ptr + llvm.return %1 : !llvm.ptr +} From b8bc3ff9bedf0b8f1d38273f7920cb0bba1a5a9e Mon Sep 17 00:00:00 2001 From: Muhammad Bassiouni <60100307+bassiounix@users.noreply.github.com> Date: Thu, 
17 Jul 2025 20:25:00 +0300 Subject: [PATCH 210/813] [libc][math] Refactor exp10f implementation to header-only in src/__support/math folder. (#148405) Part of #147386 in preparation for: https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450 --- libc/shared/math.h | 1 + libc/shared/math/exp10f.h | 23 +++ libc/src/__support/math/CMakeLists.txt | 28 +++ .../exp10f_impl.h => __support/math/exp10f.h} | 17 +- libc/src/__support/math/exp10f_utils.h | 157 +++++++++++++++ libc/src/math/generic/CMakeLists.txt | 37 +--- libc/src/math/generic/atanhf.cpp | 1 + libc/src/math/generic/coshf.cpp | 2 +- libc/src/math/generic/exp10f.cpp | 7 +- libc/src/math/generic/explogxf.cpp | 75 -------- libc/src/math/generic/explogxf.h | 180 +----------------- libc/src/math/generic/powf.cpp | 7 +- libc/src/math/generic/sinhf.cpp | 1 + libc/test/src/math/explogxf_test.cpp | 5 - .../llvm-project-overlay/libc/BUILD.bazel | 54 ++++-- 15 files changed, 267 insertions(+), 328 deletions(-) create mode 100644 libc/shared/math/exp10f.h rename libc/src/{math/generic/exp10f_impl.h => __support/math/exp10f.h} (91%) create mode 100644 libc/src/__support/math/exp10f_utils.h delete mode 100644 libc/src/math/generic/explogxf.cpp diff --git a/libc/shared/math.h b/libc/shared/math.h index b37aa46820523..2ae7c1d58ae10 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -13,6 +13,7 @@ #include "math/exp.h" #include "math/exp10.h" +#include "math/exp10f.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp10f.h b/libc/shared/math/exp10f.h new file mode 100644 index 0000000000000..cd2ba54e6f4f2 --- /dev/null +++ b/libc/shared/math/exp10f.h @@ -0,0 +1,23 @@ +//===-- Shared exp10f function ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP10F_H +#define LLVM_LIBC_SHARED_MATH_EXP10F_H + +#include "shared/libc_common.h" +#include "src/__support/math/exp10f.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp10f; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_EXP10F_H diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 0bfc996c44fc8..ad36679409f89 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -170,3 +170,31 @@ add_header_library( libc.src.__support.integer_literals libc.src.__support.macros.optimization ) + +add_header_library( + exp10f_utils + HDRS + exp10f_utils.h + DEPENDS + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.common + libc.src.__support.math.exp_utils +) + +add_header_library( + exp10f + HDRS + exp10f.h + DEPENDS + .exp10f_utils + libc.src.__support.macros.config + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization +) diff --git a/libc/src/math/generic/exp10f_impl.h b/libc/src/__support/math/exp10f.h similarity index 91% rename from libc/src/math/generic/exp10f_impl.h rename to libc/src/__support/math/exp10f.h index 975fd01a0a25c..807b4f0d6c109 100644 --- a/libc/src/math/generic/exp10f_impl.h +++ b/libc/src/__support/math/exp10f.h @@ -1,4 +1,4 @@ -//===-- Single-precision 10^x function ------------------------------------===// +//===-- Implementation header for 
exp10f ------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,22 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H -#define LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_H -#include "explogxf.h" +#include "exp10f_utils.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY namespace LIBC_NAMESPACE_DECL { -namespace generic { +namespace math { -LIBC_INLINE float exp10f(float x) { +static constexpr float exp10f(float x) { using FPBits = typename fputil::FPBits; FPBits xbits(x); @@ -132,7 +131,7 @@ LIBC_INLINE float exp10f(float x) { return static_cast(multiply_add(p, lo2 * rr.mh, c0 * rr.mh)); } -} // namespace generic +} // namespace math } // namespace LIBC_NAMESPACE_DECL -#endif // LLVM_LIBC_SRC_MATH_GENERIC_EXP10F_IMPL_H +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_H diff --git a/libc/src/__support/math/exp10f_utils.h b/libc/src/__support/math/exp10f_utils.h new file mode 100644 index 0000000000000..0493e1b993e0c --- /dev/null +++ b/libc/src/__support/math/exp10f_utils.h @@ -0,0 +1,157 @@ +//===-- Common utils for exp10f ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H + +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +struct ExpBase { + // Base = e + static constexpr int MID_BITS = 5; + static constexpr int MID_MASK = (1 << MID_BITS) - 1; + // log2(e) * 2^5 + static constexpr double LOG2_B = 0x1.71547652b82fep+0 * (1 << MID_BITS); + // High and low parts of -log(2) * 2^(-5) + static constexpr double M_LOGB_2_HI = -0x1.62e42fefa0000p-1 / (1 << MID_BITS); + static constexpr double M_LOGB_2_LO = + -0x1.cf79abc9e3b3ap-40 / (1 << MID_BITS); + // Look up table for bit fields of 2^(i/32) for i = 0..31, generated by Sollya + // with: + // > for i from 0 to 31 do printdouble(round(2^(i/32), D, RN)); + static constexpr int64_t EXP_2_MID[1 << MID_BITS] = { + 0x3ff0000000000000, 0x3ff059b0d3158574, 0x3ff0b5586cf9890f, + 0x3ff11301d0125b51, 0x3ff172b83c7d517b, 0x3ff1d4873168b9aa, + 0x3ff2387a6e756238, 0x3ff29e9df51fdee1, 0x3ff306fe0a31b715, + 0x3ff371a7373aa9cb, 0x3ff3dea64c123422, 0x3ff44e086061892d, + 0x3ff4bfdad5362a27, 0x3ff5342b569d4f82, 0x3ff5ab07dd485429, + 0x3ff6247eb03a5585, 0x3ff6a09e667f3bcd, 0x3ff71f75e8ec5f74, + 0x3ff7a11473eb0187, 0x3ff82589994cce13, 0x3ff8ace5422aa0db, + 0x3ff93737b0cdc5e5, 0x3ff9c49182a3f090, 0x3ffa5503b23e255d, + 0x3ffae89f995ad3ad, 0x3ffb7f76f2fb5e47, 0x3ffc199bdd85529c, + 0x3ffcb720dcef9069, 0x3ffd5818dcfba487, 0x3ffdfc97337b9b5f, + 0x3ffea4afa2a490da, 0x3fff50765b6e4540, + }; + + // Approximating e^dx with degree-5 minimax polynomial generated by Sollya: + // > Q = fpminimax(expm1(x)/x, 4, [|1, D...|], [-log(2)/64, log(2)/64]); + // Then: + // e^dx ~ P(dx) = 1 + dx + COEFFS[0] 
* dx^2 + ... + COEFFS[3] * dx^5. + static constexpr double COEFFS[4] = { + 0x1.ffffffffe5bc8p-2, 0x1.555555555cd67p-3, 0x1.5555c2a9b48b4p-5, + 0x1.11112a0e34bdbp-7}; + + LIBC_INLINE static double powb_lo(double dx) { + using fputil::multiply_add; + double dx2 = dx * dx; + double c0 = 1.0 + dx; + // c1 = COEFFS[0] + COEFFS[1] * dx + double c1 = multiply_add(dx, ExpBase::COEFFS[1], ExpBase::COEFFS[0]); + // c2 = COEFFS[2] + COEFFS[3] * dx + double c2 = multiply_add(dx, ExpBase::COEFFS[3], ExpBase::COEFFS[2]); + // r = c4 + c5 * dx^4 + // = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[5] * dx^7 + return fputil::polyeval(dx2, c0, c1, c2); + } +}; + +struct Exp10Base : public ExpBase { + // log2(10) * 2^5 + static constexpr double LOG2_B = 0x1.a934f0979a371p1 * (1 << MID_BITS); + // High and low parts of -log10(2) * 2^(-5). + // Notice that since |x * log2(10)| < 150: + // |k| = |round(x * log2(10) * 2^5)| < 2^8 * 2^5 = 2^13 + // So when the FMA instructions are not available, in order for the product + // k * M_LOGB_2_HI + // to be exact, we only store the high part of log10(2) up to 38 bits + // (= 53 - 15) of precision. + // It is generated by Sollya with: + // > round(log10(2), 44, RN); + static constexpr double M_LOGB_2_HI = -0x1.34413509f8p-2 / (1 << MID_BITS); + // > round(log10(2) - 0x1.34413509f8p-2, D, RN); + static constexpr double M_LOGB_2_LO = 0x1.80433b83b532ap-44 / (1 << MID_BITS); + + // Approximating 10^dx with degree-5 minimax polynomial generated by Sollya: + // > Q = fpminimax((10^x - 1)/x, 4, [|D...|], [-log10(2)/2^6, log10(2)/2^6]); + // Then: + // 10^dx ~ P(dx) = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5. 
+ static constexpr double COEFFS[5] = {0x1.26bb1bbb55515p1, 0x1.53524c73bd3eap1, + 0x1.0470591dff149p1, 0x1.2bd7c0a9fbc4dp0, + 0x1.1429e74a98f43p-1}; + + static double powb_lo(double dx) { + using fputil::multiply_add; + double dx2 = dx * dx; + // c0 = 1 + COEFFS[0] * dx + double c0 = multiply_add(dx, Exp10Base::COEFFS[0], 1.0); + // c1 = COEFFS[1] + COEFFS[2] * dx + double c1 = multiply_add(dx, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]); + // c2 = COEFFS[3] + COEFFS[4] * dx + double c2 = multiply_add(dx, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]); + // r = c0 + dx^2 * (c1 + c2 * dx^2) + // = c0 + c1 * dx^2 + c2 * dx^4 + // = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5. + return fputil::polyeval(dx2, c0, c1, c2); + } +}; + +// Output of range reduction for exp_b: (2^(mid + hi), lo) +// where: +// b^x = 2^(mid + hi) * b^lo +struct exp_b_reduc_t { + double mh; // 2^(mid + hi) + double lo; +}; + +// The function correctly calculates b^x value with at least float precision +// in a limited range. 
+// Range reduction: +// b^x = 2^(hi + mid) * b^lo +// where: +// x = (hi + mid) * log_b(2) + lo +// hi is an integer, +// 0 <= mid * 2^MID_BITS < 2^MID_BITS is an integer +// -2^(-MID_BITS - 1) <= lo * log2(b) <= 2^(-MID_BITS - 1) +// Base class needs to provide the following constants: +// - MID_BITS : number of bits after decimal points used for mid +// - MID_MASK : 2^MID_BITS - 1, mask to extract mid bits +// - LOG2_B : log2(b) * 2^MID_BITS for scaling +// - M_LOGB_2_HI : high part of -log_b(2) * 2^(-MID_BITS) +// - M_LOGB_2_LO : low part of -log_b(2) * 2^(-MID_BITS) +// - EXP_2_MID : look up table for bit fields of 2^mid +// Return: +// { 2^(hi + mid), lo } +template +LIBC_INLINE static constexpr exp_b_reduc_t exp_b_range_reduc(float x) { + double xd = static_cast(x); + // kd = round((hi + mid) * log2(b) * 2^MID_BITS) + double kd = fputil::nearest_integer(Base::LOG2_B * xd); + // k = round((hi + mid) * log2(b) * 2^MID_BITS) + int k = static_cast(kd); + // hi = floor(kd * 2^(-MID_BITS)) + // exp_hi = shift hi to the exponent field of double precision. 
+ uint64_t exp_hi = static_cast(k >> Base::MID_BITS) + << fputil::FPBits::FRACTION_LEN; + // mh = 2^hi * 2^mid + // mh_bits = bit field of mh + uint64_t mh_bits = Base::EXP_2_MID[k & Base::MID_MASK] + exp_hi; + double mh = fputil::FPBits(mh_bits).get_val(); + // dx = lo = x - (hi + mid) * log(2) + double dx = fputil::multiply_add( + kd, Base::M_LOGB_2_LO, fputil::multiply_add(kd, Base::M_LOGB_2_HI, xd)); + return {mh, dx}; +} + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 352c2ad4ab22a..99db743315d43 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -358,7 +358,6 @@ add_entrypoint_object( libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.except_value_utils libc.src.__support.FPUtil.fma - libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.polyeval libc.src.__support.macros.optimization ) @@ -448,7 +447,6 @@ add_entrypoint_object( libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.fma - libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.polyeval libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization @@ -1461,21 +1459,6 @@ add_entrypoint_object( libc.src.errno.errno ) -add_header_library( - exp10f_impl - HDRS - exp10f_impl.h - DEPENDS - .explogxf - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.macros.optimization - libc.src.__support.common - libc.src.errno.errno -) - add_entrypoint_object( exp10f SRCS @@ -1483,7 +1466,8 @@ add_entrypoint_object( HDRS ../exp10f.h DEPENDS - .exp10f_impl + libc.src.__support.math.exp10f + libc.src.errno.errno ) add_entrypoint_object( @@ -1620,17 +1604,15 @@ add_entrypoint_object( ../powf.h DEPENDS 
.common_constants - .exp10f_impl .exp2f_impl .explogxf + libc.src.__support.math.exp10f libc.src.__support.CPP.bit - libc.src.__support.CPP.optional libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode libc.src.__support.FPUtil.sqrt libc.src.__support.FPUtil.triple_double libc.src.__support.macros.optimization @@ -3784,21 +3766,15 @@ add_entrypoint_object( ) #TODO: Add errno include to the hyperbolic functions. -add_object_library( +add_header_library( explogxf HDRS explogxf.h - SRCS - explogxf.cpp DEPENDS .common_constants - libc.src.__support.FPUtil.basic_operations - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.common libc.src.__support.math.exp_utils + libc.src.__support.math.exp10f_utils + libc.src.__support.macros.properties.cpu_features libc.src.errno.errno ) @@ -3981,6 +3957,7 @@ add_entrypoint_object( DEPENDS .explogxf libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.fenv_impl libc.src.__support.macros.optimization ) diff --git a/libc/src/math/generic/atanhf.cpp b/libc/src/math/generic/atanhf.cpp index 2149314d2f676..f6fde766ef785 100644 --- a/libc/src/math/generic/atanhf.cpp +++ b/libc/src/math/generic/atanhf.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/math/atanhf.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY diff --git a/libc/src/math/generic/coshf.cpp b/libc/src/math/generic/coshf.cpp index c869f7d9dec5f..9f87564d524a6 100644 --- a/libc/src/math/generic/coshf.cpp +++ b/libc/src/math/generic/coshf.cpp @@ -7,8 +7,8 @@ 
//===----------------------------------------------------------------------===// #include "src/math/coshf.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY diff --git a/libc/src/math/generic/exp10f.cpp b/libc/src/math/generic/exp10f.cpp index 5284c380f52ec..b2d4f097bc7ce 100644 --- a/libc/src/math/generic/exp10f.cpp +++ b/libc/src/math/generic/exp10f.cpp @@ -7,12 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/math/exp10f.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/math/generic/exp10f_impl.h" + +#include "src/__support/math/exp10f.h" namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return generic::exp10f(x); } +LLVM_LIBC_FUNCTION(float, exp10f, (float x)) { return math::exp10f(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/explogxf.cpp b/libc/src/math/generic/explogxf.cpp deleted file mode 100644 index d38efa0269693..0000000000000 --- a/libc/src/math/generic/explogxf.cpp +++ /dev/null @@ -1,75 +0,0 @@ -//===-- Single-precision general exp/log functions ------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "explogxf.h" -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { - -// N[Table[Log[2, 1 + x], {x, 0/64, 63/64, 1/64}], 40] -alignas(8) const double LOG_P1_LOG2[LOG_P1_SIZE] = { - 0x0.0000000000000p+0, 0x1.6e79685c2d22ap-6, 0x1.6bad3758efd87p-5, - 0x1.0eb389fa29f9bp-4, 0x1.663f6fac91316p-4, 0x1.bc84240adabbap-4, - 0x1.08c588cda79e4p-3, 0x1.32ae9e278ae1ap-3, 0x1.5c01a39fbd688p-3, - 0x1.84c2bd02f03b3p-3, 0x1.acf5e2db4ec94p-3, 0x1.d49ee4c325970p-3, - 0x1.fbc16b902680ap-3, 0x1.11307dad30b76p-2, 0x1.24407ab0e073ap-2, - 0x1.37124cea4cdedp-2, 0x1.49a784bcd1b8bp-2, 0x1.5c01a39fbd688p-2, - 0x1.6e221cd9d0cdep-2, 0x1.800a563161c54p-2, 0x1.91bba891f1709p-2, - 0x1.a33760a7f6051p-2, 0x1.b47ebf73882a1p-2, 0x1.c592fad295b56p-2, - 0x1.d6753e032ea0fp-2, 0x1.e726aa1e754d2p-2, 0x1.f7a8568cb06cfp-2, - 0x1.03fda8b97997fp-1, 0x1.0c10500d63aa6p-1, 0x1.140c9faa1e544p-1, - 0x1.1bf311e95d00ep-1, 0x1.23c41d42727c8p-1, 0x1.2b803473f7ad1p-1, - 0x1.3327c6ab49ca7p-1, 0x1.3abb3faa02167p-1, 0x1.423b07e986aa9p-1, - 0x1.49a784bcd1b8bp-1, 0x1.510118708a8f9p-1, 0x1.5848226989d34p-1, - 0x1.5f7cff41e09afp-1, 0x1.66a008e4788ccp-1, 0x1.6db196a76194ap-1, - 0x1.74b1fd64e0754p-1, 0x1.7ba18f93502e4p-1, 0x1.82809d5be7073p-1, - 0x1.894f74b06ef8bp-1, 0x1.900e6160002cdp-1, 0x1.96bdad2acb5f6p-1, - 0x1.9d5d9fd5010b3p-1, 0x1.a3ee7f38e181fp-1, 0x1.aa708f58014d3p-1, - 0x1.b0e4126bcc86cp-1, 0x1.b74948f5532dap-1, 0x1.bda071cc67e6ep-1, - 0x1.c3e9ca2e1a055p-1, 0x1.ca258dca93316p-1, 0x1.d053f6d260896p-1, - 0x1.d6753e032ea0fp-1, 0x1.dc899ab3ff56cp-1, 0x1.e29142e0e0140p-1, - 0x1.e88c6b3626a73p-1, 0x1.ee7b471b3a950p-1, 0x1.f45e08bcf0655p-1, - 0x1.fa34e1177c233p-1, -}; - -// N[Table[1/(1 + x), {x, 0/64, 63/64, 1/64}], 40] -alignas(8) const double LOG_P1_1_OVER[LOG_P1_SIZE] = { - 0x1.0000000000000p+0, 0x1.f81f81f81f820p-1, 0x1.f07c1f07c1f08p-1, - 
0x1.e9131abf0b767p-1, 0x1.e1e1e1e1e1e1ep-1, 0x1.dae6076b981dbp-1, - 0x1.d41d41d41d41dp-1, 0x1.cd85689039b0bp-1, 0x1.c71c71c71c71cp-1, - 0x1.c0e070381c0e0p-1, 0x1.bacf914c1bad0p-1, 0x1.b4e81b4e81b4fp-1, - 0x1.af286bca1af28p-1, 0x1.a98ef606a63bep-1, 0x1.a41a41a41a41ap-1, - 0x1.9ec8e951033d9p-1, 0x1.999999999999ap-1, 0x1.948b0fcd6e9e0p-1, - 0x1.8f9c18f9c18fap-1, 0x1.8acb90f6bf3aap-1, 0x1.8618618618618p-1, - 0x1.8181818181818p-1, 0x1.7d05f417d05f4p-1, 0x1.78a4c8178a4c8p-1, - 0x1.745d1745d1746p-1, 0x1.702e05c0b8170p-1, 0x1.6c16c16c16c17p-1, - 0x1.6816816816817p-1, 0x1.642c8590b2164p-1, 0x1.6058160581606p-1, - 0x1.5c9882b931057p-1, 0x1.58ed2308158edp-1, 0x1.5555555555555p-1, - 0x1.51d07eae2f815p-1, 0x1.4e5e0a72f0539p-1, 0x1.4afd6a052bf5bp-1, - 0x1.47ae147ae147bp-1, 0x1.446f86562d9fbp-1, 0x1.4141414141414p-1, - 0x1.3e22cbce4a902p-1, 0x1.3b13b13b13b14p-1, 0x1.3813813813814p-1, - 0x1.3521cfb2b78c1p-1, 0x1.323e34a2b10bfp-1, 0x1.2f684bda12f68p-1, - 0x1.2c9fb4d812ca0p-1, 0x1.29e4129e4129ep-1, 0x1.27350b8812735p-1, - 0x1.2492492492492p-1, 0x1.21fb78121fb78p-1, 0x1.1f7047dc11f70p-1, - 0x1.1cf06ada2811dp-1, 0x1.1a7b9611a7b96p-1, 0x1.1811811811812p-1, - 0x1.15b1e5f75270dp-1, 0x1.135c81135c811p-1, 0x1.1111111111111p-1, - 0x1.0ecf56be69c90p-1, 0x1.0c9714fbcda3bp-1, 0x1.0a6810a6810a7p-1, - 0x1.0842108421084p-1, 0x1.0624dd2f1a9fcp-1, 0x1.0410410410410p-1, - 0x1.0204081020408p-1}; - -// Taylos series expansion for Log[2, 1 + x] splitted to EVEN AND ODD numbers -// K_LOG2_ODD starts from x^3 -alignas(8) const - double K_LOG2_ODD[4] = {0x1.ec709dc3a03fdp-2, 0x1.2776c50ef9bfep-2, - 0x1.a61762a7aded9p-3, 0x1.484b13d7c02a9p-3}; - -alignas(8) const - double K_LOG2_EVEN[4] = {-0x1.71547652b82fep-1, -0x1.71547652b82fep-2, - -0x1.ec709dc3a03fdp-3, -0x1.2776c50ef9bfep-3}; - -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/explogxf.h b/libc/src/math/generic/explogxf.h index 5ae1457ca780e..be4328a4f48b5 100644 --- a/libc/src/math/generic/explogxf.h +++ 
b/libc/src/math/generic/explogxf.h @@ -10,166 +10,17 @@ #define LLVM_LIBC_SRC_MATH_GENERIC_EXPLOGXF_H #include "common_constants.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/nearest_integer.h" + #include "src/__support/common.h" -#include "src/__support/macros/config.h" #include "src/__support/macros/properties/cpu_features.h" - +#include "src/__support/math/exp10f_utils.h" #include "src/__support/math/exp_utils.h" namespace LIBC_NAMESPACE_DECL { -struct ExpBase { - // Base = e - static constexpr int MID_BITS = 5; - static constexpr int MID_MASK = (1 << MID_BITS) - 1; - // log2(e) * 2^5 - static constexpr double LOG2_B = 0x1.71547652b82fep+0 * (1 << MID_BITS); - // High and low parts of -log(2) * 2^(-5) - static constexpr double M_LOGB_2_HI = -0x1.62e42fefa0000p-1 / (1 << MID_BITS); - static constexpr double M_LOGB_2_LO = - -0x1.cf79abc9e3b3ap-40 / (1 << MID_BITS); - // Look up table for bit fields of 2^(i/32) for i = 0..31, generated by Sollya - // with: - // > for i from 0 to 31 do printdouble(round(2^(i/32), D, RN)); - static constexpr int64_t EXP_2_MID[1 << MID_BITS] = { - 0x3ff0000000000000, 0x3ff059b0d3158574, 0x3ff0b5586cf9890f, - 0x3ff11301d0125b51, 0x3ff172b83c7d517b, 0x3ff1d4873168b9aa, - 0x3ff2387a6e756238, 0x3ff29e9df51fdee1, 0x3ff306fe0a31b715, - 0x3ff371a7373aa9cb, 0x3ff3dea64c123422, 0x3ff44e086061892d, - 0x3ff4bfdad5362a27, 0x3ff5342b569d4f82, 0x3ff5ab07dd485429, - 0x3ff6247eb03a5585, 0x3ff6a09e667f3bcd, 0x3ff71f75e8ec5f74, - 0x3ff7a11473eb0187, 0x3ff82589994cce13, 0x3ff8ace5422aa0db, - 0x3ff93737b0cdc5e5, 0x3ff9c49182a3f090, 0x3ffa5503b23e255d, - 0x3ffae89f995ad3ad, 0x3ffb7f76f2fb5e47, 0x3ffc199bdd85529c, - 0x3ffcb720dcef9069, 0x3ffd5818dcfba487, 0x3ffdfc97337b9b5f, - 0x3ffea4afa2a490da, 0x3fff50765b6e4540, - }; - - // Approximating e^dx with degree-5 minimax polynomial generated by Sollya: - // > Q = fpminimax(expm1(x)/x, 4, [|1, D...|], [-log(2)/64, log(2)/64]); - // 
Then: - // e^dx ~ P(dx) = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[3] * dx^5. - static constexpr double COEFFS[4] = { - 0x1.ffffffffe5bc8p-2, 0x1.555555555cd67p-3, 0x1.5555c2a9b48b4p-5, - 0x1.11112a0e34bdbp-7}; - - LIBC_INLINE static double powb_lo(double dx) { - using fputil::multiply_add; - double dx2 = dx * dx; - double c0 = 1.0 + dx; - // c1 = COEFFS[0] + COEFFS[1] * dx - double c1 = multiply_add(dx, ExpBase::COEFFS[1], ExpBase::COEFFS[0]); - // c2 = COEFFS[2] + COEFFS[3] * dx - double c2 = multiply_add(dx, ExpBase::COEFFS[3], ExpBase::COEFFS[2]); - // r = c4 + c5 * dx^4 - // = 1 + dx + COEFFS[0] * dx^2 + ... + COEFFS[5] * dx^7 - return fputil::polyeval(dx2, c0, c1, c2); - } -}; - -struct Exp10Base : public ExpBase { - // log2(10) * 2^5 - static constexpr double LOG2_B = 0x1.a934f0979a371p1 * (1 << MID_BITS); - // High and low parts of -log10(2) * 2^(-5). - // Notice that since |x * log2(10)| < 150: - // |k| = |round(x * log2(10) * 2^5)| < 2^8 * 2^5 = 2^13 - // So when the FMA instructions are not available, in order for the product - // k * M_LOGB_2_HI - // to be exact, we only store the high part of log10(2) up to 38 bits - // (= 53 - 15) of precision. - // It is generated by Sollya with: - // > round(log10(2), 44, RN); - static constexpr double M_LOGB_2_HI = -0x1.34413509f8p-2 / (1 << MID_BITS); - // > round(log10(2) - 0x1.34413509f8p-2, D, RN); - static constexpr double M_LOGB_2_LO = 0x1.80433b83b532ap-44 / (1 << MID_BITS); - - // Approximating 10^dx with degree-5 minimax polynomial generated by Sollya: - // > Q = fpminimax((10^x - 1)/x, 4, [|D...|], [-log10(2)/2^6, log10(2)/2^6]); - // Then: - // 10^dx ~ P(dx) = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5. 
- static constexpr double COEFFS[5] = {0x1.26bb1bbb55515p1, 0x1.53524c73bd3eap1, - 0x1.0470591dff149p1, 0x1.2bd7c0a9fbc4dp0, - 0x1.1429e74a98f43p-1}; - - static double powb_lo(double dx) { - using fputil::multiply_add; - double dx2 = dx * dx; - // c0 = 1 + COEFFS[0] * dx - double c0 = multiply_add(dx, Exp10Base::COEFFS[0], 1.0); - // c1 = COEFFS[1] + COEFFS[2] * dx - double c1 = multiply_add(dx, Exp10Base::COEFFS[2], Exp10Base::COEFFS[1]); - // c2 = COEFFS[3] + COEFFS[4] * dx - double c2 = multiply_add(dx, Exp10Base::COEFFS[4], Exp10Base::COEFFS[3]); - // r = c0 + dx^2 * (c1 + c2 * dx^2) - // = c0 + c1 * dx^2 + c2 * dx^4 - // = 1 + COEFFS[0] * dx + ... + COEFFS[4] * dx^5. - return fputil::polyeval(dx2, c0, c1, c2); - } -}; - constexpr int LOG_P1_BITS = 6; constexpr int LOG_P1_SIZE = 1 << LOG_P1_BITS; -// N[Table[Log[2, 1 + x], {x, 0/64, 63/64, 1/64}], 40] -extern const double LOG_P1_LOG2[LOG_P1_SIZE]; - -// N[Table[1/(1 + x), {x, 0/64, 63/64, 1/64}], 40] -extern const double LOG_P1_1_OVER[LOG_P1_SIZE]; - -// Taylor series expansion for Log[2, 1 + x] splitted to EVEN AND ODD numbers -// K_LOG2_ODD starts from x^3 -extern const double K_LOG2_ODD[4]; -extern const double K_LOG2_EVEN[4]; - -// Output of range reduction for exp_b: (2^(mid + hi), lo) -// where: -// b^x = 2^(mid + hi) * b^lo -struct exp_b_reduc_t { - double mh; // 2^(mid + hi) - double lo; -}; - -// The function correctly calculates b^x value with at least float precision -// in a limited range. 
-// Range reduction: -// b^x = 2^(hi + mid) * b^lo -// where: -// x = (hi + mid) * log_b(2) + lo -// hi is an integer, -// 0 <= mid * 2^MID_BITS < 2^MID_BITS is an integer -// -2^(-MID_BITS - 1) <= lo * log2(b) <= 2^(-MID_BITS - 1) -// Base class needs to provide the following constants: -// - MID_BITS : number of bits after decimal points used for mid -// - MID_MASK : 2^MID_BITS - 1, mask to extract mid bits -// - LOG2_B : log2(b) * 2^MID_BITS for scaling -// - M_LOGB_2_HI : high part of -log_b(2) * 2^(-MID_BITS) -// - M_LOGB_2_LO : low part of -log_b(2) * 2^(-MID_BITS) -// - EXP_2_MID : look up table for bit fields of 2^mid -// Return: -// { 2^(hi + mid), lo } -template LIBC_INLINE exp_b_reduc_t exp_b_range_reduc(float x) { - double xd = static_cast(x); - // kd = round((hi + mid) * log2(b) * 2^MID_BITS) - double kd = fputil::nearest_integer(Base::LOG2_B * xd); - // k = round((hi + mid) * log2(b) * 2^MID_BITS) - int k = static_cast(kd); - // hi = floor(kd * 2^(-MID_BITS)) - // exp_hi = shift hi to the exponent field of double precision. - uint64_t exp_hi = static_cast(k >> Base::MID_BITS) - << fputil::FPBits::FRACTION_LEN; - // mh = 2^hi * 2^mid - // mh_bits = bit field of mh - uint64_t mh_bits = Base::EXP_2_MID[k & Base::MID_MASK] + exp_hi; - double mh = fputil::FPBits(mh_bits).get_val(); - // dx = lo = x - (hi + mid) * log(2) - double dx = fputil::multiply_add( - kd, Base::M_LOGB_2_LO, fputil::multiply_add(kd, Base::M_LOGB_2_HI, xd)); - return {mh, dx}; -} - // The function correctly calculates sinh(x) and cosh(x) by calculating exp(x) // and exp(-x) simultaneously. 
// To compute e^x, we perform the following range @@ -269,33 +120,6 @@ template LIBC_INLINE double exp_pm_eval(float x) { return r; } -// x should be positive, normal finite value -LIBC_INLINE static double log2_eval(double x) { - using FPB = fputil::FPBits; - FPB bs(x); - - double result = 0; - result += bs.get_exponent(); - - int p1 = (bs.get_mantissa() >> (FPB::FRACTION_LEN - LOG_P1_BITS)) & - (LOG_P1_SIZE - 1); - - bs.set_uintval(bs.uintval() & (FPB::FRACTION_MASK >> LOG_P1_BITS)); - bs.set_biased_exponent(FPB::EXP_BIAS); - double dx = (bs.get_val() - 1.0) * LOG_P1_1_OVER[p1]; - - // Taylor series for log(2,1+x) - double c1 = fputil::multiply_add(dx, K_LOG2_ODD[0], K_LOG2_EVEN[0]); - double c2 = fputil::multiply_add(dx, K_LOG2_ODD[1], K_LOG2_EVEN[1]); - double c3 = fputil::multiply_add(dx, K_LOG2_ODD[2], K_LOG2_EVEN[2]); - double c4 = fputil::multiply_add(dx, K_LOG2_ODD[3], K_LOG2_EVEN[3]); - - // c0 = dx * (1.0 / ln(2)) + LOG_P1_LOG2[p1] - double c0 = fputil::multiply_add(dx, 0x1.71547652b82fep+0, LOG_P1_LOG2[p1]); - result += LIBC_NAMESPACE::fputil::polyeval(dx * dx, c0, c1, c2, c3, c4); - return result; -} - // x should be positive, normal finite value // TODO: Simplify range reduction and polynomial degree for float16. // See issue #137190. diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp index dfdfd5d6d5760..a45ef511c9bad 100644 --- a/libc/src/math/generic/powf.cpp +++ b/libc/src/math/generic/powf.cpp @@ -9,20 +9,17 @@ #include "src/math/powf.h" #include "common_constants.h" // Lookup tables EXP_M1 and EXP_M2. 
#include "src/__support/CPP/bit.h" -#include "src/__support/CPP/optional.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" #include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/except_value_utils.h" #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/FPUtil/sqrt.h" // Speedup for powf(x, 1/2) = sqrtf(x) #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/math/exp10f.h" // Speedup for powf(10, y) = exp10f(y) -#include "exp10f_impl.h" // Speedup for powf(10, y) = exp10f(y) #include "exp2f_impl.h" // Speedup for powf(2, y) = exp2f(y) namespace LIBC_NAMESPACE_DECL { @@ -781,7 +778,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { return generic::exp2f(y); case 0x4120'0000: // x = 10.0f // pow(10, y) = exp10(y) - return generic::exp10f(y); + return math::exp10f(y); #endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS } diff --git a/libc/src/math/generic/sinhf.cpp b/libc/src/math/generic/sinhf.cpp index d6158fd302536..63111f84de141 100644 --- a/libc/src/math/generic/sinhf.cpp +++ b/libc/src/math/generic/sinhf.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/math/sinhf.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/macros/config.h" diff --git a/libc/test/src/math/explogxf_test.cpp b/libc/test/src/math/explogxf_test.cpp index 01197b835433f..ff1181e0c6fd0 100644 --- a/libc/test/src/math/explogxf_test.cpp +++ b/libc/test/src/math/explogxf_test.cpp @@ -43,11 +43,6 @@ TEST_F(LlvmLibcExplogfTest, ExpInFloatRange) { def_prec); } -TEST_F(LlvmLibcExplogfTest, Log2InFloatRange) { - CHECK_DATA(0.0f, inf, 
mpfr::Operation::Log2, LIBC_NAMESPACE::log2_eval, - f_normal, def_count, def_prec); -} - TEST_F(LlvmLibcExplogfTest, LogInFloatRange) { CHECK_DATA(0.0f, inf, mpfr::Operation::Log, LIBC_NAMESPACE::log_eval, f_normal, def_count, def_prec); diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 47464d448f997..e3d807a46fe6a 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1996,16 +1996,15 @@ libc_support_library( libc_support_library( name = "explogxf", - srcs = ["src/math/generic/explogxf.cpp"], hdrs = ["src/math/generic/explogxf.h"], deps = [ - ":__support_common", ":__support_fputil_fenv_impl", ":__support_fputil_fma", ":__support_fputil_multiply_add", ":__support_fputil_nearest_integer", - ":__support_fputil_polyeval", ":__support_math_exp_utils", + ":__support_math_exp10f_utils", + ":__support_macros_properties_cpu_features", ":common_constants", ], ) @@ -2050,19 +2049,6 @@ libc_support_library( ], ) -libc_support_library( - name = "exp10f_impl", - hdrs = ["src/math/generic/exp10f_impl.h"], - deps = [ - ":__support_fputil_fma", - ":__support_fputil_multiply_add", - ":__support_fputil_rounding_mode", - ":__support_macros_optimization", - ":common_constants", - ":explogxf", - ], -) - libc_support_library( name = "exp2f_impl", hdrs = ["src/math/generic/exp2f_impl.h"], @@ -2263,6 +2249,33 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_math_exp10f_utils", + hdrs = ["src/__support/math/exp10f_utils.h"], + deps = [ + ":__support_fputil_basic_operations", + ":__support_fputil_fenv_impl", + ":__support_fputil_multiply_add", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_common", + ":__support_math_exp_utils", + ], +) + +libc_support_library( + name = "__support_math_exp10f", + hdrs = ["src/__support/math/exp10f.h"], + deps = [ + ":__support_math_exp10f_utils", + 
":__support_fputil_fenv_impl", + ":__support_fputil_fp_bits", + ":__support_fputil_multiply_add", + ":__support_fputil_rounding_mode", + ":__support_macros_optimization", + ], +) + ############################### complex targets ################################ libc_function( @@ -2726,10 +2739,10 @@ libc_math_function( name = "cosf", additional_deps = [ ":__support_fputil_fma", - ":__support_fputil_multiply_add", ":__support_macros_optimization", ":__support_macros_properties_cpu_features", ":sincosf_utils", + ":errno", ], ) @@ -2875,7 +2888,8 @@ libc_math_function( libc_math_function( name = "exp10f", additional_deps = [ - ":exp10f_impl", + ":__support_math_exp10f", + ":errno", ], ) @@ -3724,14 +3738,13 @@ libc_math_function( ":__support_fputil_multiply_add", ":__support_fputil_nearest_integer", ":__support_fputil_polyeval", - ":__support_fputil_rounding_mode", ":__support_fputil_sqrt", ":__support_fputil_triple_double", ":__support_macros_optimization", + ":__support_math_exp10f", ":common_constants", ":explogxf", ":exp2f_impl", - ":exp10f_impl", ], ) @@ -3840,7 +3853,6 @@ libc_math_function( name = "sinf", additional_deps = [ ":__support_fputil_fma", - ":__support_fputil_multiply_add", ":__support_fputil_polyeval", ":__support_fputil_rounding_mode", ":__support_macros_optimization", From 13549fd90af45d2200159cac14a12cf01db56aa1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 17 Jul 2025 10:29:18 -0700 Subject: [PATCH 211/813] MCAssembler: Modify Contents when VarFixups is not empty When there is no VarFixup, VarContentStart is zero. 
`slice(F.VarContentStart - Contents.size(), F.getSize())` might lead to "runtime error: addition of unsigned offset to" in ubsan builds after #148544 --- llvm/lib/MC/MCAssembler.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index f1a82f6b08d31..3e96bdf5169d8 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -735,13 +735,17 @@ void MCAssembler::layout() { // In the variable part, fixup offsets are relative to the fixed part's // start. Extend the variable contents to the left to account for the // fixed part size. - Contents = MutableArrayRef(F.getParent()->ContentStorage) - .slice(F.VarContentStart - Contents.size(), F.getSize()); - for (MCFixup &Fixup : F.getVarFixups()) { - uint64_t FixedValue; - MCValue Target; - evaluateFixup(F, Fixup, Target, FixedValue, - /*RecordReloc=*/true, Contents); + auto VarFixups = F.getVarFixups(); + if (VarFixups.size()) { + Contents = + MutableArrayRef(F.getParent()->ContentStorage) + .slice(F.VarContentStart - Contents.size(), F.getSize()); + for (MCFixup &Fixup : VarFixups) { + uint64_t FixedValue; + MCValue Target; + evaluateFixup(F, Fixup, Target, FixedValue, + /*RecordReloc=*/true, Contents); + } } } else if (auto *AF = dyn_cast(&F)) { // For RISC-V linker relaxation, an alignment relocation might be From 6f28eec6dc68c64ebe108be3fdb7d0affb1e3349 Mon Sep 17 00:00:00 2001 From: Uzair Nawaz Date: Thu, 17 Jul 2025 10:32:11 -0700 Subject: [PATCH 212/813] [libc] Fixed StringConverter Error Edge Case (#149356) Fixed StringConverter edge case related to destination limit If we call pop() but there is no space in the dest array, we should always return the "no space in destination" error even if the following character is invalid (since we shouldn't really have to look at the next character) --- libc/src/__support/wchar/string_converter.h | 6 ++ .../__support/wchar/string_converter_test.cpp | 57 
+++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/libc/src/__support/wchar/string_converter.h b/libc/src/__support/wchar/string_converter.h index 0635bc57bf3e2..869ebdfc8b390 100644 --- a/libc/src/__support/wchar/string_converter.h +++ b/libc/src/__support/wchar/string_converter.h @@ -56,6 +56,9 @@ template class StringConverter { // TODO: following functions are almost identical // look into templating CharacterConverter pop functions ErrorOr popUTF32() { + if (num_to_write == 0) + return Error(-1); + if (cr.isEmpty() || src_idx == 0) { auto src_elements_read = pushFullCharacter(); if (!src_elements_read.has_value()) @@ -79,6 +82,9 @@ template class StringConverter { } ErrorOr popUTF8() { + if (num_to_write == 0) + return Error(-1); + if (cr.isEmpty() || src_idx == 0) { auto src_elements_read = pushFullCharacter(); if (!src_elements_read.has_value()) diff --git a/libc/test/src/__support/wchar/string_converter_test.cpp b/libc/test/src/__support/wchar/string_converter_test.cpp index 14d074156d033..d514df9317852 100644 --- a/libc/test/src/__support/wchar/string_converter_test.cpp +++ b/libc/test/src/__support/wchar/string_converter_test.cpp @@ -245,6 +245,63 @@ TEST(LlvmLibcStringConverterTest, UTF8To32ErrorHandling) { ASSERT_EQ(static_cast(sc.getSourceIndex()), 4); } +TEST(LlvmLibcStringConverterTest, InvalidCharacterOutsideBounds) { + // if an invalid character exists in the source string but we don't have space + // to write it, we should return a "stop converting" error rather than an + // invalid character error + + // first 4 bytes are clown emoji (🤡) + // next 3 form an invalid character + const char *src1 = "\xF0\x9F\xA4\xA1\x90\x88\x30"; + LIBC_NAMESPACE::internal::mbstate ps1; + LIBC_NAMESPACE::internal::StringConverter sc1( + reinterpret_cast(src1), &ps1, 1); + + auto res1 = sc1.popUTF32(); + ASSERT_TRUE(res1.has_value()); + ASSERT_EQ(static_cast(res1.value()), 0x1f921); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 4); + + res1 = 
sc1.popUTF32(); + ASSERT_FALSE(res1.has_value()); + // no space to write error NOT invalid character error (EILSEQ) + ASSERT_EQ(static_cast(res1.error()), -1); + ASSERT_EQ(static_cast(sc1.getSourceIndex()), 4); + + const wchar_t src2[] = { + static_cast(0x1f921), static_cast(0xffffff), + static_cast(0x0)}; // clown emoji, invalid utf32 + LIBC_NAMESPACE::internal::mbstate ps2; + LIBC_NAMESPACE::internal::StringConverter sc2( + reinterpret_cast(src2), &ps2, 4); + + auto res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0xF0); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0x9F); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0xA4); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_TRUE(res2.has_value()); + ASSERT_EQ(static_cast(res2.value()), 0xA1); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); + + res2 = sc2.popUTF8(); + ASSERT_FALSE(res2.has_value()); + // no space to write error NOT invalid character error (EILSEQ) + ASSERT_EQ(static_cast(res2.error()), -1); + ASSERT_EQ(static_cast(sc2.getSourceIndex()), 1); +} + TEST(LlvmLibcStringConverterTest, MultipleStringConverters32To8) { /* We do NOT test partially popping a character and expecting the next From 163da8796bed51f82d7c07d0ac6db6de7879bd21 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Thu, 17 Jul 2025 13:34:52 -0400 Subject: [PATCH 213/813] [Docs] Mention security of libclang (#149357) Libclang is a wrapper around the Clang frontend, and frontends are not security-sensitive components of the LLVM project. However, libclang is often embedded in people's downstream tools, so it's best to mention that explicitly. 
--- clang/docs/LibClang.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/clang/docs/LibClang.rst b/clang/docs/LibClang.rst index 6c2b11ac7fc23..e747022b9c173 100644 --- a/clang/docs/LibClang.rst +++ b/clang/docs/LibClang.rst @@ -404,3 +404,9 @@ following situations are explicitly unsupported: compatible across library versions. * For the same reason as above, serializing objects from one version of the library and deserializing with a different version is also not supported. + +Note: because libclang is a wrapper around the compiler frontend, it is not a +`security-sensitive component`_ of the LLVM Project. Consider using a sandbox +or some other mitigation approach if processing untrusted input. + +.. _security-sensitive component: https://llvm.org/docs/Security.html#what-is-considered-a-security-issue From aa3978573e15205b43c6a7e3b4a6f940ccded7a2 Mon Sep 17 00:00:00 2001 From: tyb0807 Date: Thu, 17 Jul 2025 19:38:21 +0200 Subject: [PATCH 214/813] [mlir][vector][memref] Add `alignment` attribute to memory access ops (#144344) Alignment information is important to allow LLVM backends such as AMDGPU to select wide memory accesses (e.g., dwordx4 or b128). Since this info is not always inferable, it's better to inform LLVM backends explicitly about it. Furthermore, alignment is not necessarily a property of the element type, but of each individual memory access op (we can have overaligned and underaligned accesses compared to the natural/preferred alignment of the element type). This patch introduces `alignment` attribute to memref/vector.load/store ops. Follow-up PRs will 1. Propagate the attribute to LLVM/SPIR-V. 2. Introduce `alignment` attribute to other vector memory access ops: vector.gather + vector.scatter vector.transfer_read + vector.transfer_write vector.compressstore + vector.expandload vector.maskedload + vector.maskedstore 3. 
Replace `--convert-vector-to-llvm='use-vector-alignment=1` with a simple pass to populate alignment attributes based on the vector types. --- mlir/docs/DefiningDialects/Operations.md | 2 + .../mlir/Dialect/MemRef/IR/MemRefOps.td | 60 ++++++++++++++++++- .../mlir/Dialect/Vector/IR/VectorOps.td | 55 ++++++++++++++++- mlir/include/mlir/IR/CommonAttrConstraints.td | 4 ++ mlir/test/Dialect/MemRef/invalid.mlir | 18 ++++++ mlir/test/Dialect/MemRef/ops.mlir | 11 ++++ mlir/test/Dialect/Vector/invalid.mlir | 18 ++++++ mlir/test/Dialect/Vector/ops.mlir | 10 ++++ 8 files changed, 172 insertions(+), 6 deletions(-) diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md index b3bde055f04f0..2225329ff830b 100644 --- a/mlir/docs/DefiningDialects/Operations.md +++ b/mlir/docs/DefiningDialects/Operations.md @@ -306,6 +306,8 @@ Right now, the following primitive constraints are supported: * `IntPositive`: Specifying an integer attribute whose value is positive * `IntNonNegative`: Specifying an integer attribute whose value is non-negative +* `IntPowerOf2`: Specifying an integer attribute whose value is a power of + two > 0 * `ArrayMinCount`: Specifying an array attribute to have at least `N` elements * `ArrayMaxCount`: Specifying an array attribute to have at most `N` diff --git a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td index 09bb3932ef293..9321089ab55fa 100644 --- a/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td +++ b/mlir/include/mlir/Dialect/MemRef/IR/MemRefOps.td @@ -1216,6 +1216,11 @@ def LoadOp : MemRef_Op<"load", be reused in the cache. For details, refer to the [https://llvm.org/docs/LangRef.html#load-instruction](LLVM load instruction). + An optional `alignment` attribute allows to specify the byte alignment of the + load operation. It must be a positive power of 2. The operation must access + memory at an address aligned to this boundary. 
Violations may lead to + architecture-specific faults or performance penalties. + A value of 0 indicates no specific alignment requirement. Example: ```mlir @@ -1226,7 +1231,39 @@ def LoadOp : MemRef_Op<"load", let arguments = (ins Arg:$memref, Variadic:$indices, - DefaultValuedOptionalAttr:$nontemporal); + DefaultValuedOptionalAttr:$nontemporal, + ConfinedAttr, + [AllAttrOf<[IntPositive, IntPowerOf2]>]>:$alignment); + + let builders = [ + OpBuilder<(ins "Value":$memref, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, memref, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]>, + OpBuilder<(ins "Type":$resultType, + "Value":$memref, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, resultType, memref, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]>, + OpBuilder<(ins "TypeRange":$resultTypes, + "Value":$memref, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, resultTypes, memref, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]> + ]; + let results = (outs AnyType:$result); let extraClassDeclaration = [{ @@ -1912,6 +1949,11 @@ def MemRef_StoreOp : MemRef_Op<"store", be reused in the cache. For details, refer to the [https://llvm.org/docs/LangRef.html#store-instruction](LLVM store instruction). + An optional `alignment` attribute allows to specify the byte alignment of the + store operation. It must be a positive power of 2. The operation must access + memory at an address aligned to this boundary. Violations may lead to + architecture-specific faults or performance penalties. + A value of 0 indicates no specific alignment requirement. 
Example: ```mlir @@ -1923,13 +1965,25 @@ def MemRef_StoreOp : MemRef_Op<"store", Arg:$memref, Variadic:$indices, - DefaultValuedOptionalAttr:$nontemporal); + DefaultValuedOptionalAttr:$nontemporal, + ConfinedAttr, + [AllAttrOf<[IntPositive, IntPowerOf2]>]>:$alignment); let builders = [ + OpBuilder<(ins "Value":$valueToStore, + "Value":$memref, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, valueToStore, memref, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]>, OpBuilder<(ins "Value":$valueToStore, "Value":$memref), [{ $_state.addOperands(valueToStore); $_state.addOperands(memref); - }]>]; + }]> + ]; let extraClassDeclaration = [{ Value getValueToStore() { return getOperand(0); } diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td index cbe490f6e4dd1..e07188a1a04bf 100644 --- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td @@ -1809,12 +1809,42 @@ def Vector_LoadOp : Vector_Op<"load", [ ```mlir %result = vector.load %memref[%c0] : memref<7xf32>, vector<8xf32> ``` + + An optional `alignment` attribute allows to specify the byte alignment of the + load operation. It must be a positive power of 2. The operation must access + memory at an address aligned to this boundary. Violations may lead to + architecture-specific faults or performance penalties. + A value of 0 indicates no specific alignment requirement. 
}]; let arguments = (ins Arg:$base, Variadic:$indices, - DefaultValuedOptionalAttr:$nontemporal); + DefaultValuedOptionalAttr:$nontemporal, + ConfinedAttr, + [AllAttrOf<[IntPositive, IntPowerOf2]>]>:$alignment); + + let builders = [ + OpBuilder<(ins "VectorType":$resultType, + "Value":$base, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, resultType, base, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]>, + OpBuilder<(ins "TypeRange":$resultTypes, + "Value":$base, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, resultTypes, base, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]> + ]; + let results = (outs AnyVectorOfAnyRank:$result); let extraClassDeclaration = [{ @@ -1895,6 +1925,12 @@ def Vector_StoreOp : Vector_Op<"store", [ ```mlir vector.store %valueToStore, %memref[%c0] : memref<7xf32>, vector<8xf32> ``` + + An optional `alignment` attribute allows to specify the byte alignment of the + store operation. It must be a positive power of 2. The operation must access + memory at an address aligned to this boundary. Violations may lead to + architecture-specific faults or performance penalties. + A value of 0 indicates no specific alignment requirement. 
}]; let arguments = (ins @@ -1902,8 +1938,21 @@ def Vector_StoreOp : Vector_Op<"store", [ Arg:$base, Variadic:$indices, - DefaultValuedOptionalAttr:$nontemporal - ); + DefaultValuedOptionalAttr:$nontemporal, + ConfinedAttr, + [AllAttrOf<[IntPositive, IntPowerOf2]>]>:$alignment); + + let builders = [ + OpBuilder<(ins "Value":$valueToStore, + "Value":$base, + "ValueRange":$indices, + CArg<"bool", "false">:$nontemporal, + CArg<"uint64_t", "0">:$alignment), [{ + return build($_builder, $_state, valueToStore, base, indices, nontemporal, + alignment != 0 ? $_builder.getI64IntegerAttr(alignment) : + nullptr); + }]> + ]; let extraClassDeclaration = [{ MemRefType getMemRefType() { diff --git a/mlir/include/mlir/IR/CommonAttrConstraints.td b/mlir/include/mlir/IR/CommonAttrConstraints.td index e91a13fea5c7f..18da85a580710 100644 --- a/mlir/include/mlir/IR/CommonAttrConstraints.td +++ b/mlir/include/mlir/IR/CommonAttrConstraints.td @@ -796,6 +796,10 @@ def IntPositive : AttrConstraint< CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getValue().isStrictlyPositive()">, "whose value is positive">; +def IntPowerOf2 : AttrConstraint< + CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getValue().isPowerOf2()">, + "whose value is a power of two > 0">; + class ArrayMaxCount : AttrConstraint< CPred<"::llvm::cast<::mlir::ArrayAttr>($_self).size() <= " # n>, "with at most " # n # " elements">; diff --git a/mlir/test/Dialect/MemRef/invalid.mlir b/mlir/test/Dialect/MemRef/invalid.mlir index 704cdaf838f45..fa803efa1d910 100644 --- a/mlir/test/Dialect/MemRef/invalid.mlir +++ b/mlir/test/Dialect/MemRef/invalid.mlir @@ -962,6 +962,24 @@ func.func @test_store_zero_results2(%x: i32, %p: memref) { // ----- +func.func @invalid_load_alignment(%memref: memref<4xi32>) { + %c0 = arith.constant 0 : index + // expected-error @below {{'memref.load' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + 
%val = memref.load %memref[%c0] { alignment = -1 } : memref<4xi32> + return +} + +// ----- + +func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: i32) { + %c0 = arith.constant 0 : index + // expected-error @below {{'memref.store' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + memref.store %val, %memref[%c0] { alignment = 3 } : memref<4xi32> + return +} + +// ----- + func.func @test_alloc_memref_map_rank_mismatch() { ^bb0: // expected-error@+1 {{memref layout mismatch between rank and affine map: 2 != 1}} diff --git a/mlir/test/Dialect/MemRef/ops.mlir b/mlir/test/Dialect/MemRef/ops.mlir index e11de7bec2d0a..6c2298a3f8acb 100644 --- a/mlir/test/Dialect/MemRef/ops.mlir +++ b/mlir/test/Dialect/MemRef/ops.mlir @@ -265,6 +265,17 @@ func.func @zero_dim_no_idx(%arg0 : memref, %arg1 : memref, %arg2 : mem // CHECK: memref.store %{{.*}}, %{{.*}}[] : memref } + +// CHECK-LABEL: func @load_store_alignment +func.func @load_store_alignment(%memref: memref<4xi32>) { + %c0 = arith.constant 0 : index + // CHECK: memref.load {{.*}} {alignment = 16 : i64} + %val = memref.load %memref[%c0] { alignment = 16 } : memref<4xi32> + // CHECK: memref.store {{.*}} {alignment = 16 : i64} + memref.store %val, %memref[%c0] { alignment = 16 } : memref<4xi32> + return +} + // CHECK-LABEL: func @memref_view(%arg0 func.func @memref_view(%arg0 : index, %arg1 : index, %arg2 : index) { %0 = memref.alloc() : memref<2048xi8> diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index 5038646e1f026..8017140a0bfab 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1995,6 +1995,15 @@ func.func @vector_load(%src : memref) { // ----- +func.func @invalid_load_alignment(%memref: memref<4xi32>) { + %c0 = arith.constant 0 : index + // expected-error @below {{'vector.load' op attribute 'alignment' failed to 
satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + %val = vector.load %memref[%c0] { alignment = -1 } : memref<4xi32>, vector<4xi32> + return +} + +// ----- + //===----------------------------------------------------------------------===// // vector.store //===----------------------------------------------------------------------===// @@ -2005,3 +2014,12 @@ func.func @vector_store(%dest : memref, %vec : vector<16x16xi8>) { vector.store %vec, %dest[%c0] : memref, vector<16x16xi8> return } + +// ----- + +func.func @invalid_store_alignment(%memref: memref<4xi32>, %val: vector<4xi32>) { + %c0 = arith.constant 0 : index + // expected-error @below {{'vector.store' op attribute 'alignment' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive and whose value is a power of two > 0}} + vector.store %val, %memref[%c0] { alignment = 3 } : memref<4xi32>, vector<4xi32> + return +} diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir index 10bf0f1620568..39578ac56e369 100644 --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -853,6 +853,16 @@ func.func @vector_load_and_store_2d_vector_memref(%memref : memref<200x100xvecto return } +// CHECK-LABEL: func @load_store_alignment +func.func @load_store_alignment(%memref: memref<4xi32>) { + %c0 = arith.constant 0 : index + // CHECK: vector.load {{.*}} {alignment = 16 : i64} + %val = vector.load %memref[%c0] { alignment = 16 } : memref<4xi32>, vector<4xi32> + // CHECK: vector.store {{.*}} {alignment = 16 : i64} + vector.store %val, %memref[%c0] { alignment = 16 } : memref<4xi32>, vector<4xi32> + return +} + // CHECK-LABEL: @masked_load_and_store func.func @masked_load_and_store(%base: memref, %mask: vector<16xi1>, %passthru: vector<16xf32>) { %c0 = arith.constant 0 : index From e73d1a5341fafb1eadb77b787eb6e65630b4db3a Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Thu, 
17 Jul 2025 10:40:04 -0700 Subject: [PATCH 215/813] [msan] Add tests for avx512-gfni-intrinsics (#149258) Gluten-free, nuts included or something --- .../X86/avx512-gfni-intrinsics.ll | 670 ++++++++++++++++++ 1 file changed, 670 insertions(+) create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll new file mode 100644 index 0000000000000..e5e4371c525b2 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-gfni-intrinsics.ll @@ -0,0 +1,670 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -S -mattr=+avx512vl,+gfni,+avx512bw -passes=msan 2>&1 | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.vgf2p8affineinvqb.128 +; - llvm.x86.vgf2p8affineinvqb.256 +; - llvm.x86.vgf2p8affineinvqb.512 +; - llvm.x86.vgf2p8affineqb.128 +; - llvm.x86.vgf2p8affineqb.256 +; - llvm.x86.vgf2p8affineqb.512 +; +; Heuristically handled: +; - llvm.x86.vgf2p8mulb.128 +; - llvm.x86.vgf2p8mulb.256 +; - llvm.x86.vgf2p8mulb.512 + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8>, <16 x i8>, i8) +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineinvqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineinvqb_128( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; 
CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label 
[[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP25]], <16 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP16]], <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP30]], <16 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP21]], <16 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } { <16 x i8> zeroinitializer, <16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1) }, <16 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP32]], <16 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP33]], <16 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP34]], <16 x i8> [[TMP31]], 2 +; CHECK-NEXT: 
store { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP36]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3) + %3 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 4) + %4 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 5) + %5 = select <16 x i1> %1, <16 x i8> %3, <16 x i8> zeroinitializer + %6 = select <16 x i1> %1, <16 x i8> %4, <16 x i8> %passthru + %7 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> %2, 0 + %8 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, <16 x i8> %5, 1 + %9 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } %8, <16 x i8> %6, 2 + ret { <16 x i8>, <16 x i8>, <16 x i8> } %9 +} + +declare <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8>, <32 x i8>, i8) +define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineinvqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineinvqb_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; 
CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <32 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i8> [[TMP23]], zeroinitializer 
+; CHECK-NEXT: [[TMP25:%.*]] = or <32 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP25]], <32 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP16]], <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <32 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <32 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <32 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP30]], <32 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP21]], <32 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } { <32 x i8> zeroinitializer, <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1) }, <32 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP32]], <32 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP33]], <32 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP34]], <32 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP36]] +; + %1 = bitcast i32 %mask to <32 x i1> + %2 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3) + %3 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 4) + %4 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 5) + %5 = select <32 x i1> %1, <32 x i8> %3, <32 x i8> 
zeroinitializer + %6 = select <32 x i1> %1, <32 x i8> %4, <32 x i8> %passthru + %7 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> %2, 0 + %8 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } %7, <32 x i8> %5, 1 + %9 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } %8, <32 x i8> %6, 2 + ret { <32 x i8>, <32 x i8>, <32 x i8> } %9 +} + +declare <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8>, <64 x i8>, i8) +define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineinvqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineinvqb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; 
CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <64 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <64 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <64 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP25]], <64 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP16]], <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <64 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <64 x i8> [[TMP28]], 
zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <64 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP30]], <64 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP21]], <64 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } { <64 x i8> zeroinitializer, <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1) }, <64 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP32]], <64 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP33]], <64 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP34]], <64 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP36]] +; + %1 = bitcast i64 %mask to <64 x i1> + %2 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3) + %3 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 4) + %4 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 5) + %5 = select <64 x i1> %1, <64 x i8> %3, <64 x i8> zeroinitializer + %6 = select <64 x i1> %1, <64 x i8> %4, <64 x i8> %passthru + %7 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8> %2, 0 + %8 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } %7, <64 x i8> %5, 1 + %9 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } %8, <64 x i8> %6, 2 + ret { <64 x i8>, <64 x i8>, <64 x i8> } %9 +} + +declare <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8>, <16 x i8>, i8) +define { <16 x i8>, <16 x i8>, <16 x i8> } @test_vgf2p8affineqb_128(<16 x i8> %src1, 
<16 x i8> %src2, <16 x i8> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineqb_128( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> 
@llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i8> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> [[SRC1]], <16 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP25]], <16 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP16]], <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> zeroinitializer, <16 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP30]], <16 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP21]], <16 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } { <16 x i8> zeroinitializer, 
<16 x i8> splat (i8 -1), <16 x i8> splat (i8 -1) }, <16 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP32]], <16 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP33]], <16 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP34]], <16 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i8>, <16 x i8>, <16 x i8> } [[TMP36]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3) + %3 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 4) + %4 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 5) + %5 = select <16 x i1> %1, <16 x i8> %3, <16 x i8> zeroinitializer + %6 = select <16 x i1> %1, <16 x i8> %4, <16 x i8> %passthru + %7 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } poison, <16 x i8> %2, 0 + %8 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, <16 x i8> %5, 1 + %9 = insertvalue { <16 x i8>, <16 x i8>, <16 x i8> } %8, <16 x i8> %6, 2 + ret { <16 x i8>, <16 x i8>, <16 x i8> } %9 +} + +declare <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8>, <32 x i8>, i8) +define { <32 x i8>, <32 x i8>, <32 x i8> } @test_vgf2p8affineqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineqb_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i8> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i256 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <32 x i8> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i256 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() 
#[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> [[SRC1]], <32 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <32 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <32 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <32 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP25]], <32 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP16]], <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> zeroinitializer, <32 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <32 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <32 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <32 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP30]], <32 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP21]], <32 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } { <32 x i8> zeroinitializer, <32 x i8> splat (i8 -1), <32 x i8> splat (i8 -1) }, <32 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP32]], <32 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP33]], <32 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP34]], <32 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: 
ret { <32 x i8>, <32 x i8>, <32 x i8> } [[TMP36]] +; + %1 = bitcast i32 %mask to <32 x i1> + %2 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3) + %3 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 4) + %4 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 5) + %5 = select <32 x i1> %1, <32 x i8> %3, <32 x i8> zeroinitializer + %6 = select <32 x i1> %1, <32 x i8> %4, <32 x i8> %passthru + %7 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } poison, <32 x i8> %2, 0 + %8 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } %7, <32 x i8> %5, 1 + %9 = insertvalue { <32 x i8>, <32 x i8>, <32 x i8> } %8, <32 x i8> %6, 2 + ret { <32 x i8>, <32 x i8>, <32 x i8> } %9 +} + +declare <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8>, <64 x i8>, i8) +define { <64 x i8>, <64 x i8>, <64 x i8> } @test_vgf2p8affineqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8affineqb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] 
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]], i8 3) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <64 x i8> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP17]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <64 x i8> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP18]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: br i1 [[_MSOR8]], label [[TMP19:%.*]], label [[TMP20:%.*]], !prof [[PROF1]] +; CHECK: 19: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: 20: +; CHECK-NEXT: [[TMP21:%.*]] = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> [[SRC1]], <64 x i8> [[SRC2]], i8 5) +; CHECK-NEXT: [[TMP22:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <64 x i8> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <64 x i8> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <64 x i8> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x 
i1> [[TMP5]], <64 x i8> [[TMP25]], <64 x i8> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP16]], <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> zeroinitializer, <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = xor <64 x i8> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP29:%.*]] = or <64 x i8> [[TMP28]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <64 x i8> [[TMP29]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP30]], <64 x i8> [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP21]], <64 x i8> [[PASSTHRU]] +; CHECK-NEXT: [[TMP32:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } poison, <64 x i8> [[TMP11]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } { <64 x i8> zeroinitializer, <64 x i8> splat (i8 -1), <64 x i8> splat (i8 -1) }, <64 x i8> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[TMP34:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP32]], <64 x i8> [[TMP26]], 1 +; CHECK-NEXT: [[TMP35:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP33]], <64 x i8> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP34]], <64 x i8> [[TMP31]], 2 +; CHECK-NEXT: store { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP35]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <64 x i8>, <64 x i8>, <64 x i8> } [[TMP36]] +; + %1 = bitcast i64 %mask to <64 x i1> + %2 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3) + %3 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 4) + %4 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 5) + %5 = select <64 x i1> %1, <64 x i8> %3, <64 x i8> zeroinitializer + %6 = select <64 x i1> %1, <64 x i8> %4, <64 x i8> %passthru + %7 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } 
poison, <64 x i8> %2, 0 + %8 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } %7, <64 x i8> %5, 1 + %9 = insertvalue { <64 x i8>, <64 x i8>, <64 x i8> } %8, <64 x i8> %6, 2 + ret { <64 x i8>, <64 x i8>, <64 x i8> } %9 +} + +declare <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8>, <16 x i8>) +define <16 x i8> @test_vgf2p8mulb_128(<16 x i8> %src1, <16 x i8> %src2) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_128( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: store <16 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[TMP3]] +; + %1 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2) + ret <16 x i8> %1 +} + +define <16 x i8> @test_vgf2p8mulb_128_mask(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_128_mask( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x 
i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[_MSPROP]], <16 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i8> [[TMP7]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i8> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP11]], <16 x i8> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP6]], <16 x i8> [[TMP7]], <16 x i8> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[TMP12]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2) + %3 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> %passthru + ret <16 x i8> %3 +} + +define <16 x i8> @test_vgf2p8mulb_128_maskz(<16 x i8> %src1, <16 x i8> %src2, i16 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_128_maskz( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> [[SRC1:%.*]], <16 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[_MSPROP]], <16 x i8> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i8> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i8> [[TMP8]], 
[[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i8> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i8> [[TMP10]], <16 x i8> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x i8> [[TMP6]], <16 x i8> zeroinitializer +; CHECK-NEXT: store <16 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[TMP11]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2) + %3 = select <16 x i1> %1, <16 x i8> %2, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8>, <32 x i8>) +define <32 x i8> @test_vgf2p8mulb_256(<32 x i8> %src1, <32 x i8> %src2) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_256( +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <32 x i8> [[TMP3]] +; + %1 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) + ret <32 x i8> %1 +} + +define <32 x i8> @test_vgf2p8mulb_256_mask(<32 x i8> %src1, <32 x i8> %src2, <32 x i8> %passthru, i32 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_256_mask( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: 
[[TMP4:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[_MSPROP]], <32 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <32 x i8> [[TMP7]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <32 x i8> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP11]], <32 x i8> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <32 x i1> [[TMP6]], <32 x i8> [[TMP7]], <32 x i8> [[PASSTHRU]] +; CHECK-NEXT: store <32 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <32 x i8> [[TMP12]] +; + %1 = bitcast i32 %mask to <32 x i1> + %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) + %3 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> %passthru + ret <32 x i8> %3 +} + +define <32 x i8> @test_vgf2p8mulb_256_maskz(<32 x i8> %src1, <32 x i8> %src2, i32 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_256_maskz( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP1]] to <32 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[MASK:%.*]] to <32 x i1> +; CHECK-NEXT: 
[[_MSPROP:%.*]] = or <32 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> [[SRC1:%.*]], <32 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[_MSPROP]], <32 x i8> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <32 x i8> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <32 x i8> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <32 x i8> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP4]], <32 x i8> [[TMP10]], <32 x i8> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP6]], <32 x i8> zeroinitializer +; CHECK-NEXT: store <32 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <32 x i8> [[TMP11]] +; + %1 = bitcast i32 %mask to <32 x i1> + %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) + %3 = select <32 x i1> %1, <32 x i8> %2, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +declare <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8>, <64 x i8>) +define <64 x i8> @test_vgf2p8mulb_512(<64 x i8> %src1, <64 x i8> %src2) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: store <64 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <64 x i8> [[TMP3]] +; + %1 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) + ret <64 x i8> %1 +} + +define <64 x i8> @test_vgf2p8mulb_512_mask(<64 x i8> %src1, <64 x i8> %src2, <64 x i8> %passthru, i64 %mask) #0 { +; 
CHECK-LABEL: @test_vgf2p8mulb_512_mask( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[_MSPROP]], <64 x i8> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <64 x i8> [[TMP7]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <64 x i8> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <64 x i8> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP11]], <64 x i8> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <64 x i1> [[TMP6]], <64 x i8> [[TMP7]], <64 x i8> [[PASSTHRU]] +; CHECK-NEXT: store <64 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <64 x i8> [[TMP12]] +; + %1 = bitcast i64 %mask to <64 x i1> + %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) + %3 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> %passthru + ret <64 x i8> %3 +} + +define <64 x i8> @test_vgf2p8mulb_512_maskz(<64 x i8> %src1, <64 x i8> %src2, i64 %mask) #0 { +; CHECK-LABEL: @test_vgf2p8mulb_512_maskz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 
+; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP1]] to <64 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> [[SRC1:%.*]], <64 x i8> [[SRC2:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[_MSPROP]], <64 x i8> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <64 x i8> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <64 x i8> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <64 x i8> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <64 x i1> [[TMP4]], <64 x i8> [[TMP10]], <64 x i8> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <64 x i1> [[TMP5]], <64 x i8> [[TMP6]], <64 x i8> zeroinitializer +; CHECK-NEXT: store <64 x i8> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <64 x i8> [[TMP11]] +; + %1 = bitcast i64 %mask to <64 x i1> + %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) + %3 = select <64 x i1> %1, <64 x i8> %2, <64 x i8> zeroinitializer + ret <64 x i8> %3 +} + +attributes #0 = { sanitize_memory } From 3b11aaaf94fe6c7b4ccfd031f952265f706c1b68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Thu, 17 Jul 2025 19:02:08 +0100 Subject: [PATCH 216/813] [mlir][linalg] Add support for scalable vectorization of linalg.mmt4d (#146531) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds support for scalable vectorization of linalg.mmt4d. 
The key design change is the introduction of a new vectorizer state variable: * `assumeDynamicDimsMatchVecSizes` ...along with the corresponding Transform dialect attribute: * `assume_dynamic_dims_match_vec_sizes`. This flag instructs the vectorizer to assume that dynamic memref/tensor dimensions match the corresponding vector sizes (fixed or scalable). With this assumption, masking becomes unnecessary, which simplifies the lowering pipeline significantly. While this assumption is not universally valid, it typically holds for `linalg.mmt4d`. Inputs and outputs are explicitly packed using `linalg.pack`, and this packing includes padding, ensuring that dimension sizes align with vector sizes (*). * Related discussion: https://github.com/llvm/llvm-project/issues/143920 An upcoming patch will include an end-to-end test that leverages scalable vectorization of linalg.mmt4d to demonstrate the newly enabled functionality. This would not be feasible without the changes introduced here, as it would otherwise require additional logic to handle complex - but ultimately redundant - masks. (*) This holds provided that the tile sizes used for packing match the vector sizes used during vectorization. It is the user’s responsibility to enforce this. 
--- .../Linalg/TransformOps/LinalgTransformOps.td | 11 +- .../Dialect/Linalg/Transforms/Transforms.h | 3 +- .../TransformOps/LinalgTransformOps.cpp | 3 +- .../Linalg/Transforms/Vectorization.cpp | 53 ++++++-- .../Linalg/vectorization/linalg-ops.mlir | 117 ++++++++++++++---- 5 files changed, 143 insertions(+), 44 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td index b4dde776822a1..bafeca924e4c5 100644 --- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td +++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td @@ -2431,12 +2431,11 @@ def VectorizeOp : Op:$vector_sizes, - DefaultValuedOptionalAttr: - $static_vector_sizes, - OptionalAttr:$vectorize_nd_extract, - DefaultValuedOptionalAttr: - $scalable_sizes); + Variadic:$vector_sizes, + DefaultValuedOptionalAttr:$static_vector_sizes, + OptionalAttr:$vectorize_nd_extract, + OptionalAttr:$assume_dynamic_dims_match_vec_sizes, + DefaultValuedOptionalAttr:$scalable_sizes); let results = (outs); diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index 74280fdd82f4e..9e62d0dcc7890 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -880,7 +880,8 @@ FailureOr vectorize(RewriterBase &rewriter, Operation *op, ArrayRef inputVectorSizes = {}, ArrayRef inputScalableVecDims = {}, - bool vectorizeNDExtract = false, bool flatten1DDepthwiseConv = false); + bool vectorizeNDExtract = false, bool flatten1DDepthwiseConv = false, + bool assumeDynamicDimsMatchVecSizes = false); /// Emit a suitable vector form for a Copy op with fully static shape. 
LogicalResult vectorizeCopy(RewriterBase &builder, memref::CopyOp copyOp); diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 5d5f9de465561..c959310136319 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -3920,7 +3920,8 @@ DiagnosedSilenceableFailure transform::VectorizeOp::apply( } FailureOr vectorResults = linalg::vectorize(rewriter, target, vectorSizes, getScalableSizes(), - getVectorizeNdExtract().value_or(false)); + getVectorizeNdExtract().value_or(false), false, + getAssumeDynamicDimsMatchVecSizes().value_or(false)); if (failed(vectorResults)) { return mlir::emitSilenceableFailure(target->getLoc()) << "Attempted to vectorize, but failed"; diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 458ed543b8216..4add50f4b36e5 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -219,7 +219,8 @@ struct VectorizationState { /// canonical vector shape for vectorization. LogicalResult initState(RewriterBase &rewriter, LinalgOp linalgOp, ArrayRef inputVectorSizes, - ArrayRef inputScalableVecDims); + ArrayRef inputScalableVecDims, + bool assumeDynamicDimsMatchVecSizes = false); /// Returns the canonical vector shape used to vectorize the iteration space. ArrayRef getCanonicalVecShape() const { return canonicalVecShape; } @@ -328,6 +329,14 @@ struct VectorizationState { /// Global vectorization guard for the incoming rewriter. It's initialized /// when the vectorization state is initialized. OpBuilder::InsertionGuard rewriterGuard; + + /// Do all dynamic dims match the corresponding vector sizes? 
+ /// + /// When a dynamic tensor/memref dimension matches the corresponding vector + /// dimension, masking can be safely skipped, despite the presence of dynamic + /// shapes. Use this flag with care and only for cases where you are + /// confident the assumption holds. + bool assumeDynamicDimsMatchVecSizes = false; }; LogicalResult @@ -364,10 +373,12 @@ VectorizationState::precomputeIterSpaceValueSizes(RewriterBase &rewriter, /// Initializes the vectorization state, including the computation of the /// canonical vector shape for vectorization. // TODO: Move this to the constructor when we can remove the failure cases. -LogicalResult -VectorizationState::initState(RewriterBase &rewriter, LinalgOp linalgOp, - ArrayRef inputVectorSizes, - ArrayRef inputScalableVecDims) { +LogicalResult VectorizationState::initState(RewriterBase &rewriter, + LinalgOp linalgOp, + ArrayRef inputVectorSizes, + ArrayRef inputScalableVecDims, + bool assumeDimsMatchVec) { + assumeDynamicDimsMatchVecSizes = assumeDimsMatchVec; // Initialize the insertion point. rewriter.setInsertionPoint(linalgOp); @@ -467,6 +478,23 @@ Value VectorizationState::getOrCreateMaskFor( return Value(); } + if (assumeDynamicDimsMatchVecSizes) { + // While for _dynamic_ dim sizes we can _assume_ that the corresponding + // vector sizes match, we still need to check the _static_ dim sizes. Only + // then we can be 100% sure that masking is not required. + if (llvm::all_of(llvm::zip(permutedStaticSizes, maskType.getShape()), + [](auto it) { + return std::get<0>(it) == ShapedType::kDynamic + ? true + : std::get<0>(it) == std::get<1>(it); + })) { + LDBG("Dynamic + static dimensions match vector sizes, masking is not " + "required.\n"); + activeMaskCache[maskingMap] = Value(); + return Value(); + } + } + // Permute the iteration space value sizes to compute the mask upper bounds. 
SmallVector upperBounds = applyPermutationMap(maskingMap, ArrayRef(iterSpaceValueSizes)); @@ -2469,7 +2497,8 @@ vectorizeScalableVectorPrecondition(Operation *op, return success(isElementwise(linalgOp) || isa(op) || isa(op) || isa(op) || - isa(op) || hasReductionIterator(linalgOp)); + isa(op) || isa(op) || + hasReductionIterator(linalgOp)); } LogicalResult mlir::linalg::vectorizeOpPrecondition( @@ -2525,11 +2554,10 @@ bool mlir::linalg::hasVectorizationImpl(Operation *op) { tensor::InsertSliceOp>(op); } -FailureOr -mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op, - ArrayRef inputVectorSizes, - ArrayRef inputScalableVecDims, - bool vectorizeNDExtract, bool flatten1DDepthwiseConv) { +FailureOr mlir::linalg::vectorize( + RewriterBase &rewriter, Operation *op, ArrayRef inputVectorSizes, + ArrayRef inputScalableVecDims, bool vectorizeNDExtract, + bool flatten1DDepthwiseConv, bool assumeDynamicDimsMatchVecSizes) { LDBG("Attempting to vectorize:\n" << *op << "\n"); LDBG("Input vector sizes: "); LLVM_DEBUG(llvm::interleaveComma(inputVectorSizes, llvm::dbgs())); @@ -2549,7 +2577,8 @@ mlir::linalg::vectorize(RewriterBase &rewriter, Operation *op, VectorizationState state(rewriter); if (auto linalgOp = dyn_cast(op)) { if (failed(state.initState(rewriter, linalgOp, inputVectorSizes, - inputScalableVecDims))) { + inputScalableVecDims, + assumeDynamicDimsMatchVecSizes))) { LDBG("Vectorization state couldn't be initialized\n"); return failure(); } diff --git a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir index 679adf0a52175..4fc39e220f86d 100644 --- a/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir +++ b/mlir/test/Dialect/Linalg/vectorization/linalg-ops.mlir @@ -840,6 +840,99 @@ module attributes {transform.with_named_sequence} { } } +// ----- + +///---------------------------------------------------------------------------------------- +/// Tests for linalg.mmt4d 
+///---------------------------------------------------------------------------------------- + +func.func @mmt4d(%A: memref<16x16x8x1xf32>, %B: memref<16x16x8x1xf32>, %C_in: memref<16x16x8x8xf32>) { + linalg.mmt4d ins(%A, %B: memref<16x16x8x1xf32>, memref<16x16x8x1xf32>) + outs(%C_in: memref<16x16x8x8xf32>) + return +} + +// CHECK-LABEL: func.func @mmt4d( +// CHECK-SAME: %[[A:.*]]: memref<16x16x8x1xf32>, %[[B:.*]]: memref<16x16x8x1xf32>, %[[C:.*]]: memref<16x16x8x8xf32>) { +// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32> +// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32> +// CHECK: %[[VEC_C:.*]] = vector.transfer_read %[[C]]{{.*}} : memref<16x16x8x8xf32>, vector<16x16x8x8xf32> +// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x8x1xf32> +// CHECK: %[[RED:.*]] = vector.multi_reduction , %[[MUL]], %[[VEC_C]] [2, 5] : vector<16x16x16x8x8x1xf32> to vector<16x16x8x8xf32> +// CHECK: vector.transfer_write %[[RED]], %[[C]]{{.*}} : vector<16x16x8x8xf32>, memref<16x16x8x8xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %mmt4d : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @mmt4d_scalable(%A: memref<16x16x8x1xf32>, %B: memref<16x16x?x1xf32>, %C_in: memref<16x16x8x?xf32>) { + linalg.mmt4d ins(%A, %B: memref<16x16x8x1xf32>, memref<16x16x?x1xf32>) + outs(%C_in: memref<16x16x8x?xf32>) + return +} +// CHECK-LABEL: func.func @mmt4d_scalable( +// CHECK-SAME: %[[A:.*]]: memref<16x16x8x1xf32>, +// CHECK-SAME: %[[B:.*]]: memref<16x16x?x1xf32>, +// CHECK-SAME: %[[C_IN:.*]]: memref<16x16x8x?xf32>) { +// CHECK: %[[VAL_0:.*]] = arith.constant 16 : index +// CHECK: 
%[[VAL_1:.*]] = arith.constant 16 : index +// CHECK: %[[VAL_2:.*]] = arith.constant 16 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[DIM_2:.*]] = memref.dim %[[B]], %[[C2]] : memref<16x16x?x1xf32> +// CHECK: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[MASK_1:.*]] = vector.create_mask %[[VAL_1]], %[[VAL_2]], %[[DIM_2]], %[[VAL_6]] : vector<16x16x[4]x1xi1> +// CHECK: %[[VEC_B:.*]] = vector.mask %[[MASK_1]] { vector.transfer_read %[[B]]{{.*}} : memref<16x16x?x1xf32>, vector<16x16x16x8x[4]x1xf32> } : vector<16x16x[4]x1xi1> -> vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[MASK_2:.*]] = vector.create_mask %[[VAL_0]], %[[VAL_1]], %[[C8]], %[[DIM_2]] : vector<16x16x8x[4]xi1> +// CHECK: %[[VAL_15:.*]] = vector.mask %[[MASK_2]] { vector.transfer_read %[[C_IN]]{{.*}} : memref<16x16x8x?xf32>, vector<16x16x8x[4]xf32> } : vector<16x16x8x[4]xi1> -> vector<16x16x8x[4]xf32> +// CHECK: %[[VAL_16:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[MASK_3:.*]] = vector.create_mask %[[VAL_0]], %[[VAL_1]], %[[VAL_2]], %[[C8]], %[[DIM_2]], %[[VAL_6]] : vector<16x16x16x8x[4]x1xi1> +// CHECK: %[[VAL_18:.*]] = vector.mask %[[MASK_3]] { vector.multi_reduction , %[[VAL_16]], %[[VAL_15]] [2, 5] : vector<16x16x16x8x[4]x1xf32> to vector<16x16x8x[4]xf32> } : vector<16x16x16x8x[4]x1xi1> -> vector<16x16x8x[4]xf32> +// CHECK: vector.mask %[[MASK_2]] { vector.transfer_write %[[VAL_18]], %[[C_IN]]{{.*}} : vector<16x16x8x[4]xf32>, memref<16x16x8x?xf32> } : vector<16x16x8x[4]xi1> + + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %mmt4d 
vector_sizes [16, 16, 16, 8, [4], 1] : !transform.any_op + transform.yield + } +} + +// ----- + +func.func @mmt4d_scalable_with_assume(%A: memref<16x16x8x1xf32>, %B: memref<16x16x?x1xf32>, %C_in: memref<16x16x8x?xf32>) { + linalg.mmt4d ins(%A, %B: memref<16x16x8x1xf32>, memref<16x16x?x1xf32>) + outs(%C_in: memref<16x16x8x?xf32>) + return +} +// CHECK-LABEL: func.func @mmt4d_scalable_with_assume( +// CHECK-SAME: %[[A:.*]]: memref<16x16x8x1xf32>, +// CHECK-SAME: %[[B:.*]]: memref<16x16x?x1xf32>, +// CHECK-SAME: %[[C_IN:.*]]: memref<16x16x8x?xf32>) { +// CHECK-NOT: mask +// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<16x16x?x1xf32>, vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[VAL_13:.*]] = vector.transfer_read %[[C_IN]]{{.*}} : memref<16x16x8x?xf32>, vector<16x16x8x[4]xf32> +// CHECK: %[[VAL_14:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x[4]x1xf32> +// CHECK: %[[VAL_15:.*]] = vector.multi_reduction , %[[VAL_14]], %[[VAL_13]] [2, 5] : vector<16x16x16x8x[4]x1xf32> to vector<16x16x8x[4]xf32> +// CHECK: vector.transfer_write %[[VAL_15]], %[[C_IN]]{{.*}} : vector<16x16x8x[4]xf32>, memref<16x16x8x?xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %mmt4d vector_sizes [16, 16, 16, 8, [4], 1] {assume_dynamic_dims_match_vec_sizes} : !transform.any_op + transform.yield + } +} + ///---------------------------------------------------------------------------------------- /// Tests for other Ops ///---------------------------------------------------------------------------------------- @@ -1094,30 +1187,6 @@ module attributes {transform.with_named_sequence} { } } -// ----- - -func.func 
@mmt4d(%A: memref<16x16x8x1xf32>, %B: memref<16x16x8x1xf32>, %C_in: memref<16x16x8x8xf32>) { - linalg.mmt4d ins(%A, %B: memref<16x16x8x1xf32>, memref<16x16x8x1xf32>) - outs(%C_in: memref<16x16x8x8xf32>) - return -} - -// CHECK-LABEL: func.func @mmt4d( -// CHECK-SAME: %[[A:.*]]: memref<16x16x8x1xf32>, %[[B:.*]]: memref<16x16x8x1xf32>, %[[C:.*]]: memref<16x16x8x8xf32>) { -// CHECK: %[[VEC_A:.*]] = vector.transfer_read %[[A]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32> -// CHECK: %[[VEC_B:.*]] = vector.transfer_read %[[B]]{{.*}} : memref<16x16x8x1xf32>, vector<16x16x16x8x8x1xf32> -// CHECK: %[[VEC_C:.*]] = vector.transfer_read %[[C]]{{.*}} : memref<16x16x8x8xf32>, vector<16x16x8x8xf32> -// CHECK: %[[MUL:.*]] = arith.mulf %[[VEC_A]], %[[VEC_B]] : vector<16x16x16x8x8x1xf32> -// CHECK: %[[RED:.*]] = vector.multi_reduction , %[[MUL]], %[[VEC_C]] [2, 5] : vector<16x16x16x8x8x1xf32> to vector<16x16x8x8xf32> -// CHECK: vector.transfer_write %[[RED]], %[[C]]{{.*}} : vector<16x16x8x8xf32>, memref<16x16x8x8xf32> - -module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { - %mmt4d = transform.structured.match ops{["linalg.mmt4d"]} in %arg1 : (!transform.any_op) -> !transform.any_op - transform.structured.vectorize %mmt4d : !transform.any_op - transform.yield - } -} // ----- From f480e1b8258eac3565b3ffaf3f8ed0f77eb87fee Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Thu, 17 Jul 2025 11:10:23 -0700 Subject: [PATCH 217/813] [NVPTX] Add PRMT constant folding and cleanup usage of PRMT node (#148906) --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 244 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 23 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 18 - llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 2630 +++++++++++------ llvm/test/CodeGen/NVPTX/prmt-const-folding.ll | 171 ++ 5 files changed, 2132 insertions(+), 954 deletions(-) create mode 100644 
llvm/test/CodeGen/NVPTX/prmt-const-folding.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d017c658c53a3..7aa06f9079b09 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1048,9 +1048,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, MVT::v32i32, MVT::v64i32, MVT::v128i32}, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - // Enable custom lowering for the i128 bit operand with clusterlaunchcontrol - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i128, Custom); + // Enable custom lowering for the following: + // * MVT::i128 - clusterlaunchcontrol + // * MVT::i32 - prmt + // * MVT::Other - internal.addrspace.wrap + setOperationAction(ISD::INTRINSIC_WO_CHAIN, {MVT::i32, MVT::i128, MVT::Other}, + Custom); } const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { @@ -2060,6 +2063,19 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { return DAG.getBuildVector(Node->getValueType(0), dl, Ops); } +static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, + SelectionDAG &DAG, + unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32, + {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)}); +} + +static SDValue getPRMT(SDValue A, SDValue B, uint64_t Selector, SDLoc DL, + SelectionDAG &DAG, + unsigned Mode = NVPTX::PTXPrmtMode::NONE) { + return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode); +} + SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { // Handle bitcasting from v2i8 without hitting the default promotion // strategy which goes through stack memory. 
@@ -2111,15 +2127,12 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32); R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32); } - return DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {L, R, DAG.getConstant(SelectionValue, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); + return getPRMT(L, R, SelectionValue, DL, DAG); }; auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340); auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340); auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); + return DAG.getBitcast(VT, PRMT3210); } // Get value or the Nth operand as an APInt(32). Undef values treated as 0. @@ -2176,11 +2189,14 @@ SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32, DAG.getZExtOrTrunc(Index, DL, MVT::i32), DAG.getConstant(0x7770, DL, MVT::i32)); - SDValue PRMT = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::i32, - {DAG.getBitcast(MVT::i32, Vector), DAG.getConstant(0, DL, MVT::i32), - Selector, DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0)); + SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector), + DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG); + SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0)); + SDNodeFlags Flags; + Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8); + Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8); + Ext->setFlags(Flags); + return Ext; } // Constant index will be matched by tablegen. 
@@ -2242,9 +2258,9 @@ SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, } SDLoc DL(Op); - return DAG.getNode(NVPTXISD::PRMT, DL, MVT::v4i8, V1, V2, - DAG.getConstant(Selector, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)); + SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1), + DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG); + return DAG.getBitcast(Op.getValueType(), PRMT); } /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift @@ -2729,10 +2745,46 @@ static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, {TryCancelResponse0, TryCancelResponse1}); } +static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG) { + const unsigned Mode = [&]() { + switch (Op->getConstantOperandVal(0)) { + case Intrinsic::nvvm_prmt: + return NVPTX::PTXPrmtMode::NONE; + case Intrinsic::nvvm_prmt_b4e: + return NVPTX::PTXPrmtMode::B4E; + case Intrinsic::nvvm_prmt_ecl: + return NVPTX::PTXPrmtMode::ECL; + case Intrinsic::nvvm_prmt_ecr: + return NVPTX::PTXPrmtMode::ECR; + case Intrinsic::nvvm_prmt_f4e: + return NVPTX::PTXPrmtMode::F4E; + case Intrinsic::nvvm_prmt_rc16: + return NVPTX::PTXPrmtMode::RC16; + case Intrinsic::nvvm_prmt_rc8: + return NVPTX::PTXPrmtMode::RC8; + default: + llvm_unreachable("unsupported/unhandled intrinsic"); + } + }(); + SDLoc DL(Op); + SDValue A = Op->getOperand(1); + SDValue B = Op.getNumOperands() == 4 ? 
Op.getOperand(2) + : DAG.getConstant(0, DL, MVT::i32); + SDValue Selector = (Op->op_end() - 1)->get(); + return getPRMT(A, B, Selector, DL, DAG, Mode); +} static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG) { switch (Op->getConstantOperandVal(0)) { default: return Op; + case Intrinsic::nvvm_prmt: + case Intrinsic::nvvm_prmt_b4e: + case Intrinsic::nvvm_prmt_ecl: + case Intrinsic::nvvm_prmt_ecr: + case Intrinsic::nvvm_prmt_f4e: + case Intrinsic::nvvm_prmt_rc16: + case Intrinsic::nvvm_prmt_rc8: + return lowerPrmtIntrinsic(Op, DAG); case Intrinsic::nvvm_internal_addrspace_wrap: return Op.getOperand(1); case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled: @@ -5775,11 +5827,10 @@ PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDLoc DL(N); auto &DAG = DCI.DAG; - auto PRMT = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT); + auto PRMT = + getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1), + (Op1Bytes << 8) | Op0Bytes, DL, DAG); + return DAG.getBitcast(VT, PRMT); } static SDValue combineADDRSPACECAST(SDNode *N, @@ -5797,47 +5848,120 @@ static SDValue combineADDRSPACECAST(SDNode *N, return SDValue(); } +// Given a constant selector value and a prmt mode, return the selector value +// normalized to the generic prmt mode. 
See the PTX ISA documentation for more +// details: +// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt +static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) { + if (Mode == NVPTX::PTXPrmtMode::NONE) + return Selector; + + const unsigned V = Selector.trunc(2).getZExtValue(); + + const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2, + unsigned S3) { + return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12)); + }; + + switch (Mode) { + case NVPTX::PTXPrmtMode::F4E: + return GetSelector(V, V + 1, V + 2, V + 3); + case NVPTX::PTXPrmtMode::B4E: + return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7); + case NVPTX::PTXPrmtMode::RC8: + return GetSelector(V, V, V, V); + case NVPTX::PTXPrmtMode::ECL: + return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U); + case NVPTX::PTXPrmtMode::ECR: + return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V); + case NVPTX::PTXPrmtMode::RC16: { + unsigned V1 = (V & 1) << 1; + return GetSelector(V1, V1 + 1, V1, V1 + 1); + } + default: + llvm_unreachable("Invalid PRMT mode"); + } +} + +static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) { + // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} + APInt BitField = B.concat(A); + APInt SelectorVal = getPRMTSelector(Selector, Mode); + APInt Result(32, 0); + for (unsigned I : llvm::seq(4U)) { + APInt Sel = SelectorVal.extractBits(4, I * 4); + unsigned Idx = Sel.getLoBits(3).getZExtValue(); + unsigned Sign = Sel.getHiBits(1).getZExtValue(); + APInt Byte = BitField.extractBits(8, Idx * 8); + if (Sign) + Byte = Byte.ashr(8); + Result.insertBits(Byte, I * 8); + } + return Result; +} + +static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + CodeGenOptLevel OptLevel) { + if (OptLevel == CodeGenOptLevel::None) + return SDValue(); + + // Constant fold PRMT + if (isa(N->getOperand(0)) && + isa(N->getOperand(1)) && + isa(N->getOperand(2))) + return 
DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0), + N->getConstantOperandAPInt(1), + N->getConstantOperandAPInt(2), + N->getConstantOperandVal(3)), + SDLoc(N), N->getValueType(0)); + + return SDValue(); +} + SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel(); switch (N->getOpcode()) { - default: break; - case ISD::ADD: - return PerformADDCombine(N, DCI, OptLevel); - case ISD::FADD: - return PerformFADDCombine(N, DCI, OptLevel); - case ISD::MUL: - return PerformMULCombine(N, DCI, OptLevel); - case ISD::SHL: - return PerformSHLCombine(N, DCI, OptLevel); - case ISD::AND: - return PerformANDCombine(N, DCI); - case ISD::UREM: - case ISD::SREM: - return PerformREMCombine(N, DCI, OptLevel); - case ISD::SETCC: - return PerformSETCCCombine(N, DCI, STI.getSmVersion()); - case ISD::LOAD: - case NVPTXISD::LoadParamV2: - case NVPTXISD::LoadV2: - case NVPTXISD::LoadV4: - return combineUnpackingMovIntoLoad(N, DCI); - case NVPTXISD::StoreParam: - case NVPTXISD::StoreParamV2: - case NVPTXISD::StoreParamV4: - return PerformStoreParamCombine(N, DCI); - case ISD::STORE: - case NVPTXISD::StoreV2: - case NVPTXISD::StoreV4: - return PerformStoreCombine(N, DCI); - case ISD::EXTRACT_VECTOR_ELT: - return PerformEXTRACTCombine(N, DCI); - case ISD::VSELECT: - return PerformVSELECTCombine(N, DCI); - case ISD::BUILD_VECTOR: - return PerformBUILD_VECTORCombine(N, DCI); - case ISD::ADDRSPACECAST: - return combineADDRSPACECAST(N, DCI); + default: + break; + case ISD::ADD: + return PerformADDCombine(N, DCI, OptLevel); + case ISD::ADDRSPACECAST: + return combineADDRSPACECAST(N, DCI); + case ISD::AND: + return PerformANDCombine(N, DCI); + case ISD::BUILD_VECTOR: + return PerformBUILD_VECTORCombine(N, DCI); + case ISD::EXTRACT_VECTOR_ELT: + return PerformEXTRACTCombine(N, DCI); + case ISD::FADD: + return PerformFADDCombine(N, DCI, OptLevel); + case ISD::LOAD: + case NVPTXISD::LoadParamV2: + 
case NVPTXISD::LoadV2: + case NVPTXISD::LoadV4: + return combineUnpackingMovIntoLoad(N, DCI); + case ISD::MUL: + return PerformMULCombine(N, DCI, OptLevel); + case NVPTXISD::PRMT: + return combinePRMT(N, DCI, OptLevel); + case ISD::SETCC: + return PerformSETCCCombine(N, DCI, STI.getSmVersion()); + case ISD::SHL: + return PerformSHLCombine(N, DCI, OptLevel); + case ISD::SREM: + case ISD::UREM: + return PerformREMCombine(N, DCI, OptLevel); + case NVPTXISD::StoreParam: + case NVPTXISD::StoreParamV2: + case NVPTXISD::StoreParamV4: + return PerformStoreParamCombine(N, DCI); + case ISD::STORE: + case NVPTXISD::StoreV2: + case NVPTXISD::StoreV4: + return PerformStoreCombine(N, DCI); + case ISD::VSELECT: + return PerformVSELECTCombine(N, DCI); } return SDValue(); } @@ -6387,7 +6511,7 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, ConstantSDNode *Selector = dyn_cast(Op.getOperand(2)); unsigned Mode = Op.getConstantOperandVal(3); - if (Mode != NVPTX::PTXPrmtMode::NONE || !Selector) + if (!Selector) return; KnownBits AKnown = DAG.computeKnownBits(A, Depth); @@ -6396,7 +6520,7 @@ static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}} KnownBits BitField = BKnown.concat(AKnown); - APInt SelectorVal = Selector->getAPIntValue(); + APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode); for (unsigned I : llvm::seq(std::min(4U, Known.getBitWidth() / 8))) { APInt Sel = SelectorVal.extractBits(4, I * 4); unsigned Idx = Sel.getLoBits(3).getZExtValue(); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 4eef6c939720c..a5bb83dfadb84 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1453,18 +1453,33 @@ let hasSideEffects = false in { (ins PrmtMode:$mode), "prmt.b32$mode", [(set i32:$d, (prmt i32:$a, i32:$b, imm:$c, imm:$mode))]>; + def PRMT_B32rir + : BasicFlagsNVPTXInst<(outs 
B32:$d), + (ins B32:$a, i32imm:$b, B32:$c), + (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>; def PRMT_B32rii : BasicFlagsNVPTXInst<(outs B32:$d), (ins B32:$a, i32imm:$b, Hexu32imm:$c), (ins PrmtMode:$mode), "prmt.b32$mode", [(set i32:$d, (prmt i32:$a, imm:$b, imm:$c, imm:$mode))]>; - def PRMT_B32rir + def PRMT_B32irr : BasicFlagsNVPTXInst<(outs B32:$d), - (ins B32:$a, i32imm:$b, B32:$c), - (ins PrmtMode:$mode), + (ins i32imm:$a, B32:$b, B32:$c), (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt imm:$a, i32:$b, i32:$c, imm:$mode))]>; + def PRMT_B32iri + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins i32imm:$a, B32:$b, Hexu32imm:$c), (ins PrmtMode:$mode), + "prmt.b32$mode", + [(set i32:$d, (prmt imm:$a, i32:$b, imm:$c, imm:$mode))]>; + def PRMT_B32iir + : BasicFlagsNVPTXInst<(outs B32:$d), + (ins i32imm:$a, i32imm:$b, B32:$c), (ins PrmtMode:$mode), "prmt.b32$mode", - [(set i32:$d, (prmt i32:$a, imm:$b, i32:$c, imm:$mode))]>; + [(set i32:$d, (prmt imm:$a, imm:$b, i32:$c, imm:$mode))]>; } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index bad4c3c4c5f3a..70150bdfc8d16 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1047,24 +1047,6 @@ class F_MATH_3 - : Pat<(prmt_intrinsic i32:$a, i32:$b, i32:$c), - (PRMT_B32rrr $a, $b, $c, prmt_mode)>; - -class PRMT2Pat - : Pat<(prmt_intrinsic i32:$a, i32:$c), - (PRMT_B32rir $a, (i32 0), $c, prmt_mode)>; - -def : PRMT3Pat; -def : PRMT3Pat; -def : PRMT3Pat; - -def : PRMT2Pat; -def : PRMT2Pat; -def : PRMT2Pat; -def : PRMT2Pat; - - def INT_NVVM_NANOSLEEP_I : BasicNVPTXInst<(outs), (ins i32imm:$i), "nanosleep.u32", [(int_nvvm_nanosleep imm:$i)]>, Requires<[hasPTX<63>, hasSM<70>]>; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 410c0019c7222..cbc9f700b1f01 100644 --- 
a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -1,14 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; ## Support i16x2 instructions -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 -mattr=+ptx80 \ -; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ -; RUN: | FileCheck -allow-deprecated-dag-overlap %s -; RUN: %if ptxas %{ \ -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_90 \ -; RUN: -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ -; RUN: | %ptxas-verify -arch=sm_90 \ +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefixes=CHECK,O0 +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefixes=CHECK,O3 +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -O0 -disable-post-ra -frame-pointer=all -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_90 \ +; RUN: %} +; RUN: %if ptxas %{ \ +; RUN: llc < %s -mcpu=sm_90 -mattr=+ptx80 -verify-machineinstrs \ +; RUN: | %ptxas-verify -arch=sm_90 \ ; RUN: %} +target triple = "nvptx64-nvidia-cuda" target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" define <4 x i8> @test_ret_const() #0 { @@ -79,61 +84,111 @@ define i8 @test_extract_3(<4 x i8> %a) #0 { } define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 { -; CHECK-LABEL: test_extract_i( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; -; CHECK-NEXT: or.b32 %r3, %r2, 30576; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, %r3; -; CHECK-NEXT: st.param.b32 [func_retval0], %r4; -; CHECK-NEXT: ret; +; O0-LABEL: test_extract_i( +; O0: { +; O0-NEXT: .reg 
.b32 %r<5>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; +; O0-NEXT: cvt.u32.u64 %r2, %rd1; +; O0-NEXT: or.b32 %r3, %r2, 30576; +; O0-NEXT: prmt.b32 %r4, %r1, 0, %r3; +; O0-NEXT: st.param.b32 [func_retval0], %r4; +; O0-NEXT: ret; +; +; O3-LABEL: test_extract_i( +; O3: { +; O3-NEXT: .reg .b32 %r<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_extract_i_param_1]; +; O3-NEXT: or.b32 %r3, %r2, 30576; +; O3-NEXT: prmt.b32 %r4, %r1, 0, %r3; +; O3-NEXT: st.param.b32 [func_retval0], %r4; +; O3-NEXT: ret; %e = extractelement <4 x i8> %a, i64 %idx ret i8 %e } define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_add( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_add_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_add_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; -; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; -; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 
0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; -; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_add( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_add_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_add_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs1, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs2, %r4; +; O0-NEXT: add.s16 %rs3, %rs2, %rs1; +; O0-NEXT: cvt.u32.u16 %r5, %rs3; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs4, %r6; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs5, %r7; +; O0-NEXT: add.s16 %rs6, %rs5, %rs4; +; O0-NEXT: cvt.u32.u16 %r8, %rs6; +; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs7, %r10; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs8, %r11; +; O0-NEXT: add.s16 %rs9, %rs8, %rs7; +; O0-NEXT: cvt.u32.u16 %r12, %rs9; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs10, %r13; +; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs11, %r14; +; O0-NEXT: add.s16 %rs12, %rs11, %rs10; +; O0-NEXT: cvt.u32.u16 %r15, %rs12; +; O0-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_add( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_add_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_add_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs1, %r3; +; O3-NEXT: prmt.b32 
%r4, %r1, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs2, %r4; +; O3-NEXT: add.s16 %rs3, %rs2, %rs1; +; O3-NEXT: cvt.u32.u16 %r5, %rs3; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs4, %r6; +; O3-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs5, %r7; +; O3-NEXT: add.s16 %rs6, %rs5, %rs4; +; O3-NEXT: cvt.u32.u16 %r8, %rs6; +; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs7, %r10; +; O3-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs8, %r11; +; O3-NEXT: add.s16 %rs9, %rs8, %rs7; +; O3-NEXT: cvt.u32.u16 %r12, %rs9; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs10, %r13; +; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs11, %r14; +; O3-NEXT: add.s16 %rs12, %rs11, %rs10; +; O3-NEXT: cvt.u32.u16 %r15, %rs12; +; O3-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %r = add <4 x i8> %a, %b ret <4 x i8> %r } @@ -205,341 +260,631 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { } define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_sub( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_sub_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_sub_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; -; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 
0x3340U; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; -; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; -; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_sub( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_sub_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_sub_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs1, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs2, %r4; +; O0-NEXT: sub.s16 %rs3, %rs2, %rs1; +; O0-NEXT: cvt.u32.u16 %r5, %rs3; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs4, %r6; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs5, %r7; +; O0-NEXT: sub.s16 %rs6, %rs5, %rs4; +; O0-NEXT: cvt.u32.u16 %r8, %rs6; +; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs7, %r10; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs8, %r11; +; O0-NEXT: sub.s16 %rs9, %rs8, %rs7; +; O0-NEXT: cvt.u32.u16 %r12, %rs9; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs10, %r13; +; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs11, %r14; +; O0-NEXT: sub.s16 %rs12, %rs11, %rs10; +; O0-NEXT: cvt.u32.u16 %r15, %rs12; +; O0-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; 
O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_sub( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_sub_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_sub_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs1, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs2, %r4; +; O3-NEXT: sub.s16 %rs3, %rs2, %rs1; +; O3-NEXT: cvt.u32.u16 %r5, %rs3; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs4, %r6; +; O3-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs5, %r7; +; O3-NEXT: sub.s16 %rs6, %rs5, %rs4; +; O3-NEXT: cvt.u32.u16 %r8, %rs6; +; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs7, %r10; +; O3-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs8, %r11; +; O3-NEXT: sub.s16 %rs9, %rs8, %rs7; +; O3-NEXT: cvt.u32.u16 %r12, %rs9; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs10, %r13; +; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs11, %r14; +; O3-NEXT: sub.s16 %rs12, %rs11, %rs10; +; O3-NEXT: cvt.u32.u16 %r15, %rs12; +; O3-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %r = sub <4 x i8> %a, %b ret <4 x i8> %r } define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_smax( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_smax_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_smax_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; -; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 
0x7771U; -; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; -; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_smax( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_smax_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_smax_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: setp.gt.s32 %p1, %r4, %r3; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: setp.gt.s32 %p2, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: setp.gt.s32 %p3, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: setp.gt.s32 %p4, %r10, %r9; +; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_smax( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, 
[test_smax_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_smax_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.gt.s32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.gt.s32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.gt.s32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.gt.s32 %p4, %r10, %r9; +; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_umax( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_umax_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_umax_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.gt.u32 %p1, %r4, %r3; -; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.gt.u32 %p2, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.gt.u32 %p3, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.gt.u32 %p4, %r10, %r9; -; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; 
CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_umax( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_umax_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_umax_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: setp.gt.u32 %p1, %r4, %r3; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: setp.gt.u32 %p2, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: setp.gt.u32 %p3, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: setp.gt.u32 %p4, %r10, %r9; +; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_umax( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_umax_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_umax_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.gt.u32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.gt.u32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: 
setp.gt.u32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.gt.u32 %p4, %r10, %r9; +; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %cmp = icmp ugt <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_smin( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_smin_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_smin_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; -; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; -; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_smin( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; 
O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_smin_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_smin_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: setp.le.s32 %p1, %r4, %r3; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: setp.le.s32 %p2, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: setp.le.s32 %p3, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: setp.le.s32 %p4, %r10, %r9; +; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_smin( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_smin_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_smin_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.le.s32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.le.s32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.le.s32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.le.s32 %p4, %r10, %r9; +; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; 
+; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_umin( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_umin_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_umin_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.le.u32 %p1, %r4, %r3; -; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.le.u32 %p2, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.le.u32 %p3, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.le.u32 %p4, %r10, %r9; -; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; -; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_umin( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_umin_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_umin_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: setp.le.u32 %p1, %r4, %r3; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: setp.le.u32 %p2, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; 
O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: setp.le.u32 %p3, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: setp.le.u32 %p4, %r10, %r9; +; O0-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O0-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O0-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O0-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_umin( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_umin_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_umin_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.le.u32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.le.u32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.le.u32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.le.u32 %p4, %r10, %r9; +; O3-NEXT: selp.b32 %r11, %r10, %r9, %p4; +; O3-NEXT: selp.b32 %r12, %r8, %r7, %p3; +; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O3-NEXT: selp.b32 %r14, %r6, %r5, %p2; +; O3-NEXT: selp.b32 %r15, %r4, %r3, %p1; +; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %cmp = icmp ule <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { -; CHECK-LABEL: test_eq( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; -; CHECK-EMPTY: -; CHECK-NEXT: 
// %bb.0: -; CHECK-NEXT: ld.param.b32 %r3, [test_eq_param_2]; -; CHECK-NEXT: ld.param.b32 %r2, [test_eq_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_eq_param_0]; -; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.eq.b32 %p1, %r5, %r4; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.eq.b32 %p2, %r7, %r6; -; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.eq.b32 %p3, %r9, %r8; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.eq.b32 %p4, %r11, %r10; -; CHECK-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; -; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: prmt.b32 %r14, %r3, 0, 0x7772U; -; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r3, 0, 0x7771U; -; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: prmt.b32 %r19, %r3, 0, 0x7770U; -; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; -; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; -; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r22; -; CHECK-NEXT: ret; +; O0-LABEL: test_eq( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<23>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r3, [test_eq_param_2]; +; O0-NEXT: ld.param.b32 %r2, [test_eq_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_eq_param_0]; +; O0-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; +; O0-NEXT: setp.eq.b32 %p1, %r5, %r4; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U; +; O0-NEXT: setp.eq.b32 %p2, %r7, %r6; +; O0-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r9, %r1, 0, 0x7772U; +; O0-NEXT: setp.eq.b32 %p3, %r9, %r8; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U; +; 
O0-NEXT: prmt.b32 %r11, %r1, 0, 0x7773U; +; O0-NEXT: setp.eq.b32 %p4, %r11, %r10; +; O0-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; +; O0-NEXT: selp.b32 %r13, %r11, %r12, %p4; +; O0-NEXT: prmt.b32 %r14, %r3, 0, 0x7772U; +; O0-NEXT: selp.b32 %r15, %r9, %r14, %p3; +; O0-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r3, 0, 0x7771U; +; O0-NEXT: selp.b32 %r18, %r7, %r17, %p2; +; O0-NEXT: prmt.b32 %r19, %r3, 0, 0x7770U; +; O0-NEXT: selp.b32 %r20, %r5, %r19, %p1; +; O0-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; +; O0-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r22; +; O0-NEXT: ret; +; +; O3-LABEL: test_eq( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<23>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_eq_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_eq_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.eq.b32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.eq.b32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.eq.b32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.eq.b32 %p4, %r10, %r9; +; O3-NEXT: ld.param.b32 %r11, [test_eq_param_2]; +; O3-NEXT: prmt.b32 %r12, %r11, 0, 0x7773U; +; O3-NEXT: selp.b32 %r13, %r10, %r12, %p4; +; O3-NEXT: prmt.b32 %r14, %r11, 0, 0x7772U; +; O3-NEXT: selp.b32 %r15, %r8, %r14, %p3; +; O3-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r11, 0, 0x7771U; +; O3-NEXT: selp.b32 %r18, %r6, %r17, %p2; +; O3-NEXT: prmt.b32 %r19, %r11, 0, 0x7770U; +; O3-NEXT: selp.b32 %r20, %r4, %r19, %p1; +; O3-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; +; O3-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r22; +; O3-NEXT: ret; %cmp = icmp eq <4 x 
i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c ret <4 x i8> %r } define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { -; CHECK-LABEL: test_ne( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r3, [test_ne_param_2]; -; CHECK-NEXT: ld.param.b32 %r2, [test_ne_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_ne_param_0]; -; CHECK-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.ne.b32 %p1, %r5, %r4; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.ne.b32 %p2, %r7, %r6; -; CHECK-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r9, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.ne.b32 %p3, %r9, %r8; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.ne.b32 %p4, %r11, %r10; -; CHECK-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; -; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: prmt.b32 %r14, %r3, 0, 0x7772U; -; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r3, 0, 0x7771U; -; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: prmt.b32 %r19, %r3, 0, 0x7770U; -; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; -; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; -; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r22; -; CHECK-NEXT: ret; +; O0-LABEL: test_ne( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<23>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r3, [test_ne_param_2]; +; O0-NEXT: ld.param.b32 %r2, [test_ne_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_ne_param_0]; +; O0-NEXT: prmt.b32 %r4, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; +; O0-NEXT: setp.ne.b32 %p1, %r5, %r4; +; O0-NEXT: 
prmt.b32 %r6, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x7771U; +; O0-NEXT: setp.ne.b32 %p2, %r7, %r6; +; O0-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r9, %r1, 0, 0x7772U; +; O0-NEXT: setp.ne.b32 %p3, %r9, %r8; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x7773U; +; O0-NEXT: setp.ne.b32 %p4, %r11, %r10; +; O0-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; +; O0-NEXT: selp.b32 %r13, %r11, %r12, %p4; +; O0-NEXT: prmt.b32 %r14, %r3, 0, 0x7772U; +; O0-NEXT: selp.b32 %r15, %r9, %r14, %p3; +; O0-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r3, 0, 0x7771U; +; O0-NEXT: selp.b32 %r18, %r7, %r17, %p2; +; O0-NEXT: prmt.b32 %r19, %r3, 0, 0x7770U; +; O0-NEXT: selp.b32 %r20, %r5, %r19, %p1; +; O0-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; +; O0-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r22; +; O0-NEXT: ret; +; +; O3-LABEL: test_ne( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<23>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_ne_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_ne_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.ne.b32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.ne.b32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.ne.b32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.ne.b32 %p4, %r10, %r9; +; O3-NEXT: ld.param.b32 %r11, [test_ne_param_2]; +; O3-NEXT: prmt.b32 %r12, %r11, 0, 0x7773U; +; O3-NEXT: selp.b32 %r13, %r10, %r12, %p4; +; O3-NEXT: prmt.b32 %r14, %r11, 0, 0x7772U; +; O3-NEXT: selp.b32 %r15, %r8, %r14, %p3; +; O3-NEXT: prmt.b32 %r16, %r15, %r13, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r11, 0, 0x7771U; +; O3-NEXT: selp.b32 %r18, 
%r6, %r17, %p2; +; O3-NEXT: prmt.b32 %r19, %r11, 0, 0x7770U; +; O3-NEXT: selp.b32 %r20, %r4, %r19, %p1; +; O3-NEXT: prmt.b32 %r21, %r20, %r18, 0x3340U; +; O3-NEXT: prmt.b32 %r22, %r21, %r16, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r22; +; O3-NEXT: ret; %cmp = icmp ne <4 x i8> %a, %b %r = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %c ret <4 x i8> %r } define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_mul( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_mul_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_mul_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; -; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; -; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; -; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; -; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_mul( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 
%r<18>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_mul_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_mul_param_0]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs1, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O0-NEXT: cvt.u16.u32 %rs2, %r4; +; O0-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; +; O0-NEXT: cvt.u32.u16 %r5, %rs3; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs4, %r6; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O0-NEXT: cvt.u16.u32 %rs5, %r7; +; O0-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; +; O0-NEXT: cvt.u32.u16 %r8, %rs6; +; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs7, %r10; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O0-NEXT: cvt.u16.u32 %rs8, %r11; +; O0-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; +; O0-NEXT: cvt.u32.u16 %r12, %rs9; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs10, %r13; +; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O0-NEXT: cvt.u16.u32 %rs11, %r14; +; O0-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; +; O0-NEXT: cvt.u32.u16 %r15, %rs12; +; O0-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_mul( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_mul_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_mul_param_1]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs1, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O3-NEXT: cvt.u16.u32 %rs2, %r4; +; O3-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; +; O3-NEXT: cvt.u32.u16 %r5, %rs3; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs4, %r6; +; O3-NEXT: prmt.b32 %r7, %r1, 0, 0x7772U; +; O3-NEXT: cvt.u16.u32 %rs5, %r7; +; O3-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; +; O3-NEXT: cvt.u32.u16 %r8, %rs6; +; O3-NEXT: 
prmt.b32 %r9, %r8, %r5, 0x3340U; +; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs7, %r10; +; O3-NEXT: prmt.b32 %r11, %r1, 0, 0x7771U; +; O3-NEXT: cvt.u16.u32 %rs8, %r11; +; O3-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; +; O3-NEXT: cvt.u32.u16 %r12, %rs9; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs10, %r13; +; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x7770U; +; O3-NEXT: cvt.u16.u32 %rs11, %r14; +; O3-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; +; O3-NEXT: cvt.u32.u16 %r15, %rs12; +; O3-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r17; +; O3-NEXT: ret; %r = mul <4 x i8> %a, %b ret <4 x i8> %r } define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_or( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_or_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_or_param_0]; -; CHECK-NEXT: or.b32 %r3, %r1, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_or( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_or_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_or_param_0]; +; O0-NEXT: or.b32 %r3, %r1, %r2; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_or( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_or_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_or_param_1]; +; O3-NEXT: or.b32 %r3, %r1, %r2; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = or <4 x i8> %a, %b ret <4 x i8> %r } define <4 x i8> @test_or_computed(i8 %a) { -; CHECK-LABEL: test_or_computed( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_or_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 
0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: or.b32 %r7, %r6, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r7; -; CHECK-NEXT: ret; +; O0-LABEL: test_or_computed( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-NEXT: .reg .b32 %r<8>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_or_computed_param_0]; +; O0-NEXT: mov.b32 %r1, 0; +; O0-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O0-NEXT: cvt.u32.u16 %r3, %rs1; +; O0-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; +; O0-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; +; O0-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; O0-NEXT: or.b32 %r7, %r6, %r5; +; O0-NEXT: st.param.b32 [func_retval0], %r7; +; O0-NEXT: ret; +; +; O3-LABEL: test_or_computed( +; O3: { +; O3-NEXT: .reg .b32 %r<6>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b8 %r1, [test_or_computed_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x5410U; +; O3-NEXT: bfi.b32 %r4, 5, %r3, 8, 8; +; O3-NEXT: or.b32 %r5, %r4, %r3; +; O3-NEXT: st.param.b32 [func_retval0], %r5; +; O3-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 %r = or <4 x i8> %ins.1, %ins.0 @@ -575,37 +920,61 @@ define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 { } define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_xor( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_xor_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_xor_param_0]; -; CHECK-NEXT: xor.b32 %r3, %r1, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_xor( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_xor_param_1]; +; O0-NEXT: 
ld.param.b32 %r1, [test_xor_param_0]; +; O0-NEXT: xor.b32 %r3, %r1, %r2; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_xor( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_xor_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_xor_param_1]; +; O3-NEXT: xor.b32 %r3, %r1, %r2; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = xor <4 x i8> %a, %b ret <4 x i8> %r } define <4 x i8> @test_xor_computed(i8 %a) { -; CHECK-LABEL: test_xor_computed( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_xor_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: xor.b32 %r7, %r6, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r7; -; CHECK-NEXT: ret; +; O0-LABEL: test_xor_computed( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-NEXT: .reg .b32 %r<8>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_xor_computed_param_0]; +; O0-NEXT: mov.b32 %r1, 0; +; O0-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O0-NEXT: cvt.u32.u16 %r3, %rs1; +; O0-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; +; O0-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; +; O0-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; O0-NEXT: xor.b32 %r7, %r6, %r5; +; O0-NEXT: st.param.b32 [func_retval0], %r7; +; O0-NEXT: ret; +; +; O3-LABEL: test_xor_computed( +; O3: { +; O3-NEXT: .reg .b32 %r<6>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b8 %r1, [test_xor_computed_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x5410U; +; O3-NEXT: bfi.b32 %r4, 5, %r3, 8, 8; +; O3-NEXT: xor.b32 %r5, %r4, %r3; +; O3-NEXT: st.param.b32 [func_retval0], %r5; +; 
O3-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 %r = xor <4 x i8> %ins.1, %ins.0 @@ -641,37 +1010,61 @@ define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 { } define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_and( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_and_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_and_param_0]; -; CHECK-NEXT: and.b32 %r3, %r1, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_and( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_and_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_and_param_0]; +; O0-NEXT: and.b32 %r3, %r1, %r2; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_and( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_and_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_and_param_1]; +; O3-NEXT: and.b32 %r3, %r1, %r2; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = and <4 x i8> %a, %b ret <4 x i8> %r } define <4 x i8> @test_and_computed(i8 %a) { -; CHECK-LABEL: test_and_computed( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_and_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: and.b32 %r7, %r6, %r5; -; CHECK-NEXT: st.param.b32 [func_retval0], %r7; -; CHECK-NEXT: ret; +; O0-LABEL: test_and_computed( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-NEXT: .reg .b32 %r<8>; +; O0-EMPTY: +; O0-NEXT: // 
%bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_and_computed_param_0]; +; O0-NEXT: mov.b32 %r1, 0; +; O0-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O0-NEXT: cvt.u32.u16 %r3, %rs1; +; O0-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; +; O0-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; +; O0-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; +; O0-NEXT: and.b32 %r7, %r6, %r5; +; O0-NEXT: st.param.b32 [func_retval0], %r7; +; O0-NEXT: ret; +; +; O3-LABEL: test_and_computed( +; O3: { +; O3-NEXT: .reg .b32 %r<6>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b8 %r1, [test_and_computed_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x5410U; +; O3-NEXT: bfi.b32 %r4, 5, %r3, 8, 8; +; O3-NEXT: and.b32 %r5, %r4, %r3; +; O3-NEXT: st.param.b32 [func_retval0], %r5; +; O3-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 %r = and <4 x i8> %ins.1, %ins.0 @@ -707,76 +1100,132 @@ define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 { } define void @test_ldst_v2i8(ptr %a, ptr %b) { -; CHECK-LABEL: test_ldst_v2i8( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2i8_param_0]; -; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: st.b32 [%rd2], %r1; -; CHECK-NEXT: ret; +; O0-LABEL: test_ldst_v2i8( +; O0: { +; O0-NEXT: .reg .b32 %r<2>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd2, [test_ldst_v2i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_ldst_v2i8_param_0]; +; O0-NEXT: ld.b32 %r1, [%rd1]; +; O0-NEXT: st.b32 [%rd2], %r1; +; O0-NEXT: ret; +; +; O3-LABEL: test_ldst_v2i8( +; O3: { +; O3-NEXT: .reg .b32 %r<2>; +; O3-NEXT: .reg .b64 %rd<3>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_ldst_v2i8_param_0]; +; O3-NEXT: ld.b32 %r1, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, 
[test_ldst_v2i8_param_1]; +; O3-NEXT: st.b32 [%rd2], %r1; +; O3-NEXT: ret; %t1 = load <4 x i8>, ptr %a store <4 x i8> %t1, ptr %b, align 16 ret void } define void @test_ldst_v3i8(ptr %a, ptr %b) { -; CHECK-LABEL: test_ldst_v3i8( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3i8_param_0]; -; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: st.b16 [%rd2], %r1; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U; -; CHECK-NEXT: st.b8 [%rd2+2], %r2; -; CHECK-NEXT: ret; +; O0-LABEL: test_ldst_v3i8( +; O0: { +; O0-NEXT: .reg .b32 %r<3>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd2, [test_ldst_v3i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_ldst_v3i8_param_0]; +; O0-NEXT: ld.b32 %r1, [%rd1]; +; O0-NEXT: st.b16 [%rd2], %r1; +; O0-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U; +; O0-NEXT: st.b8 [%rd2+2], %r2; +; O0-NEXT: ret; +; +; O3-LABEL: test_ldst_v3i8( +; O3: { +; O3-NEXT: .reg .b32 %r<3>; +; O3-NEXT: .reg .b64 %rd<3>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_ldst_v3i8_param_0]; +; O3-NEXT: ld.b32 %r1, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, [test_ldst_v3i8_param_1]; +; O3-NEXT: st.b16 [%rd2], %r1; +; O3-NEXT: prmt.b32 %r2, %r1, 0, 0x7772U; +; O3-NEXT: st.b8 [%rd2+2], %r2; +; O3-NEXT: ret; %t1 = load <3 x i8>, ptr %a store <3 x i8> %t1, ptr %b, align 16 ret void } define void @test_ldst_v4i8(ptr %a, ptr %b) { -; CHECK-LABEL: test_ldst_v4i8( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<2>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_param_0]; -; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: st.b32 [%rd2], %r1; -; CHECK-NEXT: ret; +; O0-LABEL: test_ldst_v4i8( +; O0: { +; O0-NEXT: .reg .b32 %r<2>; +; 
O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_param_0]; +; O0-NEXT: ld.b32 %r1, [%rd1]; +; O0-NEXT: st.b32 [%rd2], %r1; +; O0-NEXT: ret; +; +; O3-LABEL: test_ldst_v4i8( +; O3: { +; O3-NEXT: .reg .b32 %r<2>; +; O3-NEXT: .reg .b64 %rd<3>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_param_0]; +; O3-NEXT: ld.b32 %r1, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_param_1]; +; O3-NEXT: st.b32 [%rd2], %r1; +; O3-NEXT: ret; %t1 = load <4 x i8>, ptr %a store <4 x i8> %t1, ptr %b, align 16 ret void } define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) { -; CHECK-LABEL: test_ldst_v4i8_unaligned( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_unaligned_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_unaligned_param_0]; -; CHECK-NEXT: ld.b8 %r1, [%rd1]; -; CHECK-NEXT: ld.b8 %r2, [%rd1+1]; -; CHECK-NEXT: ld.b8 %r3, [%rd1+2]; -; CHECK-NEXT: ld.b8 %r4, [%rd1+3]; -; CHECK-NEXT: st.b8 [%rd2+3], %r4; -; CHECK-NEXT: st.b8 [%rd2+2], %r3; -; CHECK-NEXT: st.b8 [%rd2+1], %r2; -; CHECK-NEXT: st.b8 [%rd2], %r1; -; CHECK-NEXT: ret; +; O0-LABEL: test_ldst_v4i8_unaligned( +; O0: { +; O0-NEXT: .reg .b32 %r<5>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_unaligned_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_unaligned_param_0]; +; O0-NEXT: ld.b8 %r1, [%rd1]; +; O0-NEXT: ld.b8 %r2, [%rd1+1]; +; O0-NEXT: ld.b8 %r3, [%rd1+2]; +; O0-NEXT: ld.b8 %r4, [%rd1+3]; +; O0-NEXT: st.b8 [%rd2+3], %r4; +; O0-NEXT: st.b8 [%rd2+2], %r3; +; O0-NEXT: st.b8 [%rd2+1], %r2; +; O0-NEXT: st.b8 [%rd2], %r1; +; O0-NEXT: ret; +; +; O3-LABEL: test_ldst_v4i8_unaligned( +; O3: { +; O3-NEXT: .reg .b32 %r<5>; +; O3-NEXT: .reg .b64 %rd<3>; +; O3-EMPTY: 
+; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_unaligned_param_0]; +; O3-NEXT: ld.b8 %r1, [%rd1+1]; +; O3-NEXT: ld.b8 %r2, [%rd1]; +; O3-NEXT: ld.b8 %r3, [%rd1+3]; +; O3-NEXT: ld.b8 %r4, [%rd1+2]; +; O3-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_unaligned_param_1]; +; O3-NEXT: st.b8 [%rd2+2], %r4; +; O3-NEXT: st.b8 [%rd2+3], %r3; +; O3-NEXT: st.b8 [%rd2], %r2; +; O3-NEXT: st.b8 [%rd2+1], %r1; +; O3-NEXT: ret; %t1 = load <4 x i8>, ptr %a, align 1 store <4 x i8> %t1, ptr %b, align 1 ret void @@ -784,17 +1233,29 @@ define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) { define void @test_ldst_v8i8(ptr %a, ptr %b) { -; CHECK-LABEL: test_ldst_v8i8( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<3>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8i8_param_0]; -; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; -; CHECK-NEXT: st.v2.b32 [%rd2], {%r1, %r2}; -; CHECK-NEXT: ret; +; O0-LABEL: test_ldst_v8i8( +; O0: { +; O0-NEXT: .reg .b32 %r<3>; +; O0-NEXT: .reg .b64 %rd<3>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd2, [test_ldst_v8i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_ldst_v8i8_param_0]; +; O0-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; +; O0-NEXT: st.v2.b32 [%rd2], {%r1, %r2}; +; O0-NEXT: ret; +; +; O3-LABEL: test_ldst_v8i8( +; O3: { +; O3-NEXT: .reg .b32 %r<3>; +; O3-NEXT: .reg .b64 %rd<3>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_ldst_v8i8_param_0]; +; O3-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, [test_ldst_v8i8_param_1]; +; O3-NEXT: st.v2.b32 [%rd2], {%r1, %r2}; +; O3-NEXT: ret; %t1 = load <8 x i8>, ptr %a store <8 x i8> %t1, ptr %b, align 16 ret void @@ -803,168 +1264,310 @@ define void @test_ldst_v8i8(ptr %a, ptr %b) { declare <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) #0 define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: 
test_call( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0]; -; CHECK-NEXT: { // callseq 0, 0 -; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0], %r1; -; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1], %r2; -; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-NEXT: ld.param.b32 %r3, [retval0]; -; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_call( +; O0: { +; O0-NEXT: .reg .b32 %r<5>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_call_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_call_param_0]; +; O0-NEXT: { // callseq 0, 0 +; O0-NEXT: .param .align 4 .b8 param0[4]; +; O0-NEXT: st.param.b32 [param0], %r1; +; O0-NEXT: .param .align 4 .b8 param1[4]; +; O0-NEXT: st.param.b32 [param1], %r2; +; O0-NEXT: .param .align 4 .b8 retval0[4]; +; O0-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O0-NEXT: ld.param.b32 %r3, [retval0]; +; O0-NEXT: } // callseq 0 +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_call( +; O3: { +; O3-NEXT: .reg .b32 %r<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_call_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_call_param_1]; +; O3-NEXT: { // callseq 0, 0 +; O3-NEXT: .param .align 4 .b8 param0[4]; +; O3-NEXT: st.param.b32 [param0], %r1; +; O3-NEXT: .param .align 4 .b8 param1[4]; +; O3-NEXT: st.param.b32 [param1], %r2; +; O3-NEXT: .param .align 4 .b8 retval0[4]; +; O3-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O3-NEXT: ld.param.b32 %r3, [retval0]; +; O3-NEXT: } // callseq 0 +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = call <4 x i8> @test_callee(<4 x i8> %a, <4 x i8> %b) 
ret <4 x i8> %r } define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_call_flipped( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; -; CHECK-NEXT: { // callseq 1, 0 -; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0], %r2; -; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1], %r1; -; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-NEXT: ld.param.b32 %r3, [retval0]; -; CHECK-NEXT: } // callseq 1 -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_call_flipped( +; O0: { +; O0-NEXT: .reg .b32 %r<5>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; +; O0-NEXT: { // callseq 1, 0 +; O0-NEXT: .param .align 4 .b8 param0[4]; +; O0-NEXT: st.param.b32 [param0], %r2; +; O0-NEXT: .param .align 4 .b8 param1[4]; +; O0-NEXT: st.param.b32 [param1], %r1; +; O0-NEXT: .param .align 4 .b8 retval0[4]; +; O0-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O0-NEXT: ld.param.b32 %r3, [retval0]; +; O0-NEXT: } // callseq 1 +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_call_flipped( +; O3: { +; O3-NEXT: .reg .b32 %r<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; +; O3-NEXT: { // callseq 1, 0 +; O3-NEXT: .param .align 4 .b8 param0[4]; +; O3-NEXT: st.param.b32 [param0], %r2; +; O3-NEXT: .param .align 4 .b8 param1[4]; +; O3-NEXT: st.param.b32 [param1], %r1; +; O3-NEXT: .param .align 4 .b8 retval0[4]; +; O3-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O3-NEXT: 
ld.param.b32 %r3, [retval0]; +; O3-NEXT: } // callseq 1 +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a) ret <4 x i8> %r } define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_tailcall_flipped( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; -; CHECK-NEXT: { // callseq 2, 0 -; CHECK-NEXT: .param .align 4 .b8 param0[4]; -; CHECK-NEXT: st.param.b32 [param0], %r2; -; CHECK-NEXT: .param .align 4 .b8 param1[4]; -; CHECK-NEXT: st.param.b32 [param1], %r1; -; CHECK-NEXT: .param .align 4 .b8 retval0[4]; -; CHECK-NEXT: call.uni (retval0), test_callee, (param0, param1); -; CHECK-NEXT: ld.param.b32 %r3, [retval0]; -; CHECK-NEXT: } // callseq 2 -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_tailcall_flipped( +; O0: { +; O0-NEXT: .reg .b32 %r<5>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; +; O0-NEXT: { // callseq 2, 0 +; O0-NEXT: .param .align 4 .b8 param0[4]; +; O0-NEXT: st.param.b32 [param0], %r2; +; O0-NEXT: .param .align 4 .b8 param1[4]; +; O0-NEXT: st.param.b32 [param1], %r1; +; O0-NEXT: .param .align 4 .b8 retval0[4]; +; O0-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O0-NEXT: ld.param.b32 %r3, [retval0]; +; O0-NEXT: } // callseq 2 +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_tailcall_flipped( +; O3: { +; O3-NEXT: .reg .b32 %r<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; +; O3-NEXT: { // callseq 2, 0 +; O3-NEXT: .param .align 4 .b8 param0[4]; +; O3-NEXT: st.param.b32 
[param0], %r2; +; O3-NEXT: .param .align 4 .b8 param1[4]; +; O3-NEXT: st.param.b32 [param1], %r1; +; O3-NEXT: .param .align 4 .b8 retval0[4]; +; O3-NEXT: call.uni (retval0), test_callee, (param0, param1); +; O3-NEXT: ld.param.b32 %r3, [retval0]; +; O3-NEXT: } // callseq 2 +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %r = tail call <4 x i8> @test_callee(<4 x i8> %b, <4 x i8> %a) ret <4 x i8> %r } define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 { -; CHECK-LABEL: test_select( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<2>; -; CHECK-NEXT: .reg .b16 %rs<3>; -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; -; CHECK-NEXT: and.b16 %rs2, %rs1, 1; -; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_select_param_0]; -; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_select( +; O0: { +; O0-NEXT: .reg .pred %p<2>; +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; O0-NEXT: and.b16 %rs2, %rs1, 1; +; O0-NEXT: setp.ne.b16 %p1, %rs2, 0; +; O0-NEXT: ld.param.b32 %r2, [test_select_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_select_param_0]; +; O0-NEXT: selp.b32 %r3, %r1, %r2, %p1; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_select( +; O3: { +; O3-NEXT: .reg .pred %p<2>; +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b8 %rs1, [test_select_param_2]; +; O3-NEXT: and.b16 %rs2, %rs1, 1; +; O3-NEXT: setp.ne.b16 %p1, %rs2, 0; +; O3-NEXT: ld.param.b32 %r1, [test_select_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_select_param_1]; +; O3-NEXT: selp.b32 %r3, %r1, %r2, %p1; +; O3-NEXT: st.param.b32 
[func_retval0], %r3; +; O3-NEXT: ret; %r = select i1 %c, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) #0 { -; CHECK-LABEL: test_select_cc( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<28>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; -; CHECK-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; -; CHECK-NEXT: prmt.b32 %r5, %r4, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r6, %r3, 0, 0x7770U; -; CHECK-NEXT: setp.ne.b32 %p1, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r4, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r8, %r3, 0, 0x7771U; -; CHECK-NEXT: setp.ne.b32 %p2, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r4, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r10, %r3, 0, 0x7772U; -; CHECK-NEXT: setp.ne.b32 %p3, %r10, %r9; -; CHECK-NEXT: prmt.b32 %r11, %r4, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; -; CHECK-NEXT: setp.ne.b32 %p4, %r12, %r11; -; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U; -; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4; -; CHECK-NEXT: prmt.b32 %r16, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; -; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r15, 0x3340U; -; CHECK-NEXT: prmt.b32 %r20, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r21, %r1, 0, 0x7771U; -; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2; -; CHECK-NEXT: prmt.b32 %r23, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r24, %r1, 0, 0x7770U; -; CHECK-NEXT: selp.b32 %r25, %r24, %r23, %p1; -; CHECK-NEXT: prmt.b32 %r26, %r25, %r22, 0x3340U; -; CHECK-NEXT: prmt.b32 %r27, %r26, %r19, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r27; -; CHECK-NEXT: ret; +; O0-LABEL: test_select_cc( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 
%r<28>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; O0-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; O0-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; O0-NEXT: prmt.b32 %r5, %r4, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r6, %r3, 0, 0x7770U; +; O0-NEXT: setp.ne.b32 %p1, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r4, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r8, %r3, 0, 0x7771U; +; O0-NEXT: setp.ne.b32 %p2, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r4, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r10, %r3, 0, 0x7772U; +; O0-NEXT: setp.ne.b32 %p3, %r10, %r9; +; O0-NEXT: prmt.b32 %r11, %r4, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r12, %r3, 0, 0x7773U; +; O0-NEXT: setp.ne.b32 %p4, %r12, %r11; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U; +; O0-NEXT: selp.b32 %r15, %r14, %r13, %p4; +; O0-NEXT: prmt.b32 %r16, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O0-NEXT: selp.b32 %r18, %r17, %r16, %p3; +; O0-NEXT: prmt.b32 %r19, %r18, %r15, 0x3340U; +; O0-NEXT: prmt.b32 %r20, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r21, %r1, 0, 0x7771U; +; O0-NEXT: selp.b32 %r22, %r21, %r20, %p2; +; O0-NEXT: prmt.b32 %r23, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r24, %r1, 0, 0x7770U; +; O0-NEXT: selp.b32 %r25, %r24, %r23, %p1; +; O0-NEXT: prmt.b32 %r26, %r25, %r22, 0x3340U; +; O0-NEXT: prmt.b32 %r27, %r26, %r19, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r27; +; O0-NEXT: ret; +; +; O3-LABEL: test_select_cc( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<28>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_select_cc_param_3]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: ld.param.b32 %r4, [test_select_cc_param_2]; +; O3-NEXT: prmt.b32 %r5, %r4, 0, 0x7770U; +; O3-NEXT: setp.ne.b32 %p1, %r5, %r3; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x7771U; +; O3-NEXT: 
prmt.b32 %r7, %r4, 0, 0x7771U; +; O3-NEXT: setp.ne.b32 %p2, %r7, %r6; +; O3-NEXT: prmt.b32 %r8, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r9, %r4, 0, 0x7772U; +; O3-NEXT: setp.ne.b32 %p3, %r9, %r8; +; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r11, %r4, 0, 0x7773U; +; O3-NEXT: setp.ne.b32 %p4, %r11, %r10; +; O3-NEXT: ld.param.b32 %r12, [test_select_cc_param_1]; +; O3-NEXT: prmt.b32 %r13, %r12, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x7773U; +; O3-NEXT: selp.b32 %r15, %r14, %r13, %p4; +; O3-NEXT: prmt.b32 %r16, %r12, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r17, %r1, 0, 0x7772U; +; O3-NEXT: selp.b32 %r18, %r17, %r16, %p3; +; O3-NEXT: prmt.b32 %r19, %r18, %r15, 0x3340U; +; O3-NEXT: prmt.b32 %r20, %r12, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r21, %r1, 0, 0x7771U; +; O3-NEXT: selp.b32 %r22, %r21, %r20, %p2; +; O3-NEXT: prmt.b32 %r23, %r12, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r24, %r1, 0, 0x7770U; +; O3-NEXT: selp.b32 %r25, %r24, %r23, %p1; +; O3-NEXT: prmt.b32 %r26, %r25, %r22, 0x3340U; +; O3-NEXT: prmt.b32 %r27, %r26, %r19, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r27; +; O3-NEXT: ret; %cc = icmp ne <4 x i8> %c, %d %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b ret <4 x i8> %r } define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b, -; CHECK-LABEL: test_select_cc_i32_i8( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1]; -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0]; -; CHECK-NEXT: ld.param.b32 %r10, [test_select_cc_i32_i8_param_3]; -; CHECK-NEXT: ld.param.b32 %r9, [test_select_cc_i32_i8_param_2]; -; CHECK-NEXT: prmt.b32 %r11, %r10, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r12, %r9, 0, 0x7770U; -; CHECK-NEXT: setp.ne.b32 %p1, %r12, %r11; -; CHECK-NEXT: prmt.b32 %r13, %r10, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r14, %r9, 0, 0x7771U; -; CHECK-NEXT: 
setp.ne.b32 %p2, %r14, %r13; -; CHECK-NEXT: prmt.b32 %r15, %r10, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r16, %r9, 0, 0x7772U; -; CHECK-NEXT: setp.ne.b32 %p3, %r16, %r15; -; CHECK-NEXT: prmt.b32 %r17, %r10, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r18, %r9, 0, 0x7773U; -; CHECK-NEXT: setp.ne.b32 %p4, %r18, %r17; -; CHECK-NEXT: selp.b32 %r19, %r4, %r8, %p4; -; CHECK-NEXT: selp.b32 %r20, %r3, %r7, %p3; -; CHECK-NEXT: selp.b32 %r21, %r2, %r6, %p2; -; CHECK-NEXT: selp.b32 %r22, %r1, %r5, %p1; -; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r22, %r21, %r20, %r19}; -; CHECK-NEXT: ret; +; O0-LABEL: test_select_cc_i32_i8( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<23>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1]; +; O0-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0]; +; O0-NEXT: ld.param.b32 %r10, [test_select_cc_i32_i8_param_3]; +; O0-NEXT: ld.param.b32 %r9, [test_select_cc_i32_i8_param_2]; +; O0-NEXT: prmt.b32 %r11, %r10, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r12, %r9, 0, 0x7770U; +; O0-NEXT: setp.ne.b32 %p1, %r12, %r11; +; O0-NEXT: prmt.b32 %r13, %r10, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r14, %r9, 0, 0x7771U; +; O0-NEXT: setp.ne.b32 %p2, %r14, %r13; +; O0-NEXT: prmt.b32 %r15, %r10, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r16, %r9, 0, 0x7772U; +; O0-NEXT: setp.ne.b32 %p3, %r16, %r15; +; O0-NEXT: prmt.b32 %r17, %r10, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r18, %r9, 0, 0x7773U; +; O0-NEXT: setp.ne.b32 %p4, %r18, %r17; +; O0-NEXT: selp.b32 %r19, %r4, %r8, %p4; +; O0-NEXT: selp.b32 %r20, %r3, %r7, %p3; +; O0-NEXT: selp.b32 %r21, %r2, %r6, %p2; +; O0-NEXT: selp.b32 %r22, %r1, %r5, %p1; +; O0-NEXT: st.param.v4.b32 [func_retval0], {%r22, %r21, %r20, %r19}; +; O0-NEXT: ret; +; +; O3-LABEL: test_select_cc_i32_i8( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<23>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, 
[test_select_cc_i32_i8_param_0]; +; O3-NEXT: ld.param.b32 %r5, [test_select_cc_i32_i8_param_3]; +; O3-NEXT: prmt.b32 %r6, %r5, 0, 0x7770U; +; O3-NEXT: ld.param.b32 %r7, [test_select_cc_i32_i8_param_2]; +; O3-NEXT: prmt.b32 %r8, %r7, 0, 0x7770U; +; O3-NEXT: setp.ne.b32 %p1, %r8, %r6; +; O3-NEXT: prmt.b32 %r9, %r5, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r10, %r7, 0, 0x7771U; +; O3-NEXT: setp.ne.b32 %p2, %r10, %r9; +; O3-NEXT: prmt.b32 %r11, %r5, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r12, %r7, 0, 0x7772U; +; O3-NEXT: setp.ne.b32 %p3, %r12, %r11; +; O3-NEXT: prmt.b32 %r13, %r5, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r14, %r7, 0, 0x7773U; +; O3-NEXT: setp.ne.b32 %p4, %r14, %r13; +; O3-NEXT: ld.param.v4.b32 {%r15, %r16, %r17, %r18}, [test_select_cc_i32_i8_param_1]; +; O3-NEXT: selp.b32 %r19, %r4, %r18, %p4; +; O3-NEXT: selp.b32 %r20, %r3, %r17, %p3; +; O3-NEXT: selp.b32 %r21, %r2, %r16, %p2; +; O3-NEXT: selp.b32 %r22, %r1, %r15, %p1; +; O3-NEXT: st.param.v4.b32 [func_retval0], {%r22, %r21, %r20, %r19}; +; O3-NEXT: ret; <4 x i8> %c, <4 x i8> %d) #0 { %cc = icmp ne <4 x i8> %c, %d %r = select <4 x i1> %cc, <4 x i32> %a, <4 x i32> %b @@ -972,37 +1575,69 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b, } define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, -; CHECK-LABEL: test_select_cc_i8_i32( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; -; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_i8_i32_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_i8_i32_param_0]; -; CHECK-NEXT: setp.ne.b32 %p1, %r3, %r7; -; CHECK-NEXT: setp.ne.b32 %p2, %r4, %r8; -; CHECK-NEXT: setp.ne.b32 %p3, %r5, %r9; -; CHECK-NEXT: setp.ne.b32 %p4, %r6, %r10; -; CHECK-NEXT: prmt.b32 %r11, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r12, 
%r1, 0, 0x7773U; -; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4; -; CHECK-NEXT: prmt.b32 %r14, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r15, %r1, 0, 0x7772U; -; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x3340U; -; CHECK-NEXT: prmt.b32 %r18, %r2, 0, 0x7771U; -; CHECK-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U; -; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2; -; CHECK-NEXT: prmt.b32 %r21, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; -; CHECK-NEXT: selp.b32 %r23, %r22, %r21, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r20, 0x3340U; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r17, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r25; -; CHECK-NEXT: ret; +; O0-LABEL: test_select_cc_i8_i32( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<26>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; +; O0-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; +; O0-NEXT: ld.param.b32 %r2, [test_select_cc_i8_i32_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_select_cc_i8_i32_param_0]; +; O0-NEXT: setp.ne.b32 %p1, %r3, %r7; +; O0-NEXT: setp.ne.b32 %p2, %r4, %r8; +; O0-NEXT: setp.ne.b32 %p3, %r5, %r9; +; O0-NEXT: setp.ne.b32 %p4, %r6, %r10; +; O0-NEXT: prmt.b32 %r11, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r12, %r1, 0, 0x7773U; +; O0-NEXT: selp.b32 %r13, %r12, %r11, %p4; +; O0-NEXT: prmt.b32 %r14, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r15, %r1, 0, 0x7772U; +; O0-NEXT: selp.b32 %r16, %r15, %r14, %p3; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x3340U; +; O0-NEXT: prmt.b32 %r18, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U; +; O0-NEXT: selp.b32 %r20, %r19, %r18, %p2; +; O0-NEXT: prmt.b32 %r21, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O0-NEXT: selp.b32 %r23, %r22, %r21, %p1; +; O0-NEXT: prmt.b32 %r24, %r23, %r20, 0x3340U; +; O0-NEXT: prmt.b32 %r25, %r24, %r17, 0x5410U; +; O0-NEXT: st.param.b32 
[func_retval0], %r25; +; O0-NEXT: ret; +; +; O3-LABEL: test_select_cc_i8_i32( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<26>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_select_cc_i8_i32_param_0]; +; O3-NEXT: ld.param.v4.b32 {%r2, %r3, %r4, %r5}, [test_select_cc_i8_i32_param_2]; +; O3-NEXT: ld.param.v4.b32 {%r6, %r7, %r8, %r9}, [test_select_cc_i8_i32_param_3]; +; O3-NEXT: setp.ne.b32 %p1, %r2, %r6; +; O3-NEXT: setp.ne.b32 %p2, %r3, %r7; +; O3-NEXT: setp.ne.b32 %p3, %r4, %r8; +; O3-NEXT: setp.ne.b32 %p4, %r5, %r9; +; O3-NEXT: ld.param.b32 %r10, [test_select_cc_i8_i32_param_1]; +; O3-NEXT: prmt.b32 %r11, %r10, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r12, %r1, 0, 0x7773U; +; O3-NEXT: selp.b32 %r13, %r12, %r11, %p4; +; O3-NEXT: prmt.b32 %r14, %r10, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r15, %r1, 0, 0x7772U; +; O3-NEXT: selp.b32 %r16, %r15, %r14, %p3; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x3340U; +; O3-NEXT: prmt.b32 %r18, %r10, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r19, %r1, 0, 0x7771U; +; O3-NEXT: selp.b32 %r20, %r19, %r18, %p2; +; O3-NEXT: prmt.b32 %r21, %r10, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r22, %r1, 0, 0x7770U; +; O3-NEXT: selp.b32 %r23, %r22, %r21, %p1; +; O3-NEXT: prmt.b32 %r24, %r23, %r20, 0x3340U; +; O3-NEXT: prmt.b32 %r25, %r24, %r17, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r25; +; O3-NEXT: ret; <4 x i32> %c, <4 x i32> %d) #0 { %cc = icmp ne <4 x i32> %c, %d %r = select <4 x i1> %cc, <4 x i8> %a, <4 x i8> %b @@ -1027,23 +1662,41 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { } define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { -; CHECK-LABEL: test_trunc_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; -; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r1, %rd4; -; CHECK-NEXT: cvt.u32.u64 %r2, %rd3; -; 
CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; -; CHECK-NEXT: cvt.u32.u64 %r4, %rd2; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd1; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r7; -; CHECK-NEXT: ret; +; O0-LABEL: test_trunc_2xi64( +; O0: { +; O0-NEXT: .reg .b32 %r<8>; +; O0-NEXT: .reg .b64 %rd<5>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; +; O0-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; +; O0-NEXT: cvt.u32.u64 %r1, %rd4; +; O0-NEXT: cvt.u32.u64 %r2, %rd3; +; O0-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; +; O0-NEXT: cvt.u32.u64 %r4, %rd2; +; O0-NEXT: cvt.u32.u64 %r5, %rd1; +; O0-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; +; O0-NEXT: prmt.b32 %r7, %r6, %r3, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r7; +; O0-NEXT: ret; +; +; O3-LABEL: test_trunc_2xi64( +; O3: { +; O3-NEXT: .reg .b32 %r<8>; +; O3-NEXT: .reg .b64 %rd<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; +; O3-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; +; O3-NEXT: cvt.u32.u64 %r1, %rd4; +; O3-NEXT: cvt.u32.u64 %r2, %rd3; +; O3-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; +; O3-NEXT: cvt.u32.u64 %r4, %rd2; +; O3-NEXT: cvt.u32.u64 %r5, %rd1; +; O3-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; +; O3-NEXT: prmt.b32 %r7, %r6, %r3, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r7; +; O3-NEXT: ret; %r = trunc <4 x i64> %a to <4 x i8> ret <4 x i8> %r } @@ -1066,24 +1719,43 @@ define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 { } define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 { -; CHECK-LABEL: test_zext_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<6>; -; CHECK-NEXT: .reg .b64 %rd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U; -; CHECK-NEXT: cvt.u64.u32 %rd1, %r2; 
-; CHECK-NEXT: prmt.b32 %r3, %r1, 0, 0x7772U; -; CHECK-NEXT: cvt.u64.u32 %rd2, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7771U; -; CHECK-NEXT: cvt.u64.u32 %rd3, %r4; -; CHECK-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; -; CHECK-NEXT: cvt.u64.u32 %rd4, %r5; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; -; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd2, %rd1}; -; CHECK-NEXT: ret; +; O0-LABEL: test_zext_2xi64( +; O0: { +; O0-NEXT: .reg .b32 %r<6>; +; O0-NEXT: .reg .b64 %rd<5>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; +; O0-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U; +; O0-NEXT: cvt.u64.u32 %rd1, %r2; +; O0-NEXT: prmt.b32 %r3, %r1, 0, 0x7772U; +; O0-NEXT: cvt.u64.u32 %rd2, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7771U; +; O0-NEXT: cvt.u64.u32 %rd3, %r4; +; O0-NEXT: prmt.b32 %r5, %r1, 0, 0x7770U; +; O0-NEXT: cvt.u64.u32 %rd4, %r5; +; O0-NEXT: st.param.v2.b64 [func_retval0], {%rd4, %rd3}; +; O0-NEXT: st.param.v2.b64 [func_retval0+16], {%rd2, %rd1}; +; O0-NEXT: ret; +; +; O3-LABEL: test_zext_2xi64( +; O3: { +; O3-NEXT: .reg .b32 %r<6>; +; O3-NEXT: .reg .b64 %rd<5>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, 0, 0x7771U; +; O3-NEXT: cvt.u64.u32 %rd1, %r2; +; O3-NEXT: prmt.b32 %r3, %r1, 0, 0x7770U; +; O3-NEXT: cvt.u64.u32 %rd2, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7773U; +; O3-NEXT: cvt.u64.u32 %rd3, %r4; +; O3-NEXT: prmt.b32 %r5, %r1, 0, 0x7772U; +; O3-NEXT: cvt.u64.u32 %rd4, %r5; +; O3-NEXT: st.param.v2.b64 [func_retval0+16], {%rd4, %rd3}; +; O3-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; +; O3-NEXT: ret; %r = zext <4 x i8> %a to <4 x i64> ret <4 x i64> %r } @@ -1142,20 +1814,31 @@ define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 { define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { -; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<6>; -; 
CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; -; CHECK-NEXT: mov.b32 %r1, 6; -; CHECK-NEXT: prmt.b32 %r2, %r1, 7, 0x3340U; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 5, 0x3340U; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r5; -; CHECK-NEXT: ret; +; O0-LABEL: test_bitcast_4xi8_to_2xhalf( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-NEXT: .reg .b32 %r<6>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; +; O0-NEXT: mov.b32 %r1, 6; +; O0-NEXT: prmt.b32 %r2, %r1, 7, 0x3340U; +; O0-NEXT: cvt.u32.u16 %r3, %rs1; +; O0-NEXT: prmt.b32 %r4, %r3, 5, 0x3340U; +; O0-NEXT: prmt.b32 %r5, %r4, %r2, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r5; +; O0-NEXT: ret; +; +; O3-LABEL: test_bitcast_4xi8_to_2xhalf( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b8 %r1, [test_bitcast_4xi8_to_2xhalf_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, 5, 0x3340U; +; O3-NEXT: prmt.b32 %r3, %r2, 1798, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 %ins.2 = insertelement <4 x i8> %ins.1, i8 6, i32 2 @@ -1166,153 +1849,277 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 { -; CHECK-LABEL: test_shufflevector( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; -; CHECK-NEXT: // implicit-def: %r3 -; CHECK-NEXT: prmt.b32 %r2, %r1, %r3, 0x123U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; -; CHECK-NEXT: ret; +; O0-LABEL: test_shufflevector( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; +; 
O0-NEXT: // implicit-def: %r3 +; O0-NEXT: prmt.b32 %r2, %r1, %r3, 0x123U; +; O0-NEXT: st.param.b32 [func_retval0], %r2; +; O0-NEXT: ret; +; +; O3-LABEL: test_shufflevector( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; +; O3-NEXT: prmt.b32 %r2, %r1, %r3, 0x123U; +; O3-NEXT: st.param.b32 [func_retval0], %r2; +; O3-NEXT: ret; %s = shufflevector <4 x i8> %a, <4 x i8> undef, <4 x i32> ret <4 x i8> %s } define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 { -; CHECK-LABEL: test_shufflevector_2( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r2, [test_shufflevector_2_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_2_param_0]; -; CHECK-NEXT: prmt.b32 %r3, %r1, %r2, 0x2537U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_shufflevector_2( +; O0: { +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b32 %r2, [test_shufflevector_2_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_shufflevector_2_param_0]; +; O0-NEXT: prmt.b32 %r3, %r1, %r2, 0x2537U; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_shufflevector_2( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_shufflevector_2_param_0]; +; O3-NEXT: ld.param.b32 %r2, [test_shufflevector_2_param_1]; +; O3-NEXT: prmt.b32 %r3, %r1, %r2, 0x2537U; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %s = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> ret <4 x i8> %s } define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 { -; CHECK-LABEL: test_insertelement( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b8 %rs1, [test_insertelement_param_1]; -; CHECK-NEXT: ld.param.b32 %r1, 
[test_insertelement_param_0]; -; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; -; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; -; CHECK-NEXT: st.param.b32 [func_retval0], %r3; -; CHECK-NEXT: ret; +; O0-LABEL: test_insertelement( +; O0: { +; O0-NEXT: .reg .b16 %rs<2>; +; O0-NEXT: .reg .b32 %r<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b8 %rs1, [test_insertelement_param_1]; +; O0-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; O0-NEXT: cvt.u32.u16 %r2, %rs1; +; O0-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; +; O0-NEXT: st.param.b32 [func_retval0], %r3; +; O0-NEXT: ret; +; +; O3-LABEL: test_insertelement( +; O3: { +; O3-NEXT: .reg .b32 %r<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; O3-NEXT: ld.param.b8 %r2, [test_insertelement_param_1]; +; O3-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; +; O3-NEXT: st.param.b32 [func_retval0], %r3; +; O3-NEXT: ret; %i = insertelement <4 x i8> %a, i8 %x, i64 1 ret <4 x i8> %i } define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { -; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs5, %rs4; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs6, %rs3; -; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; CHECK-NEXT: cvt.u32.u16 %r4, %rs8; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs2; -; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs1; -; CHECK-NEXT: mov.b32 %r7, {%rs10, %rs9}; -; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r7; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r9, %rs11; -; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x3340U; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r6, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r11; -; 
CHECK-NEXT: ret; +; O0-LABEL: test_fptosi_4xhalf_to_4xi8( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<12>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0]; +; O0-NEXT: cvt.rzi.s16.f16 %rs5, %rs4; +; O0-NEXT: cvt.rzi.s16.f16 %rs6, %rs3; +; O0-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; O0-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; O0-NEXT: cvt.u32.u16 %r4, %rs8; +; O0-NEXT: cvt.u32.u16 %r5, %rs7; +; O0-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; +; O0-NEXT: cvt.rzi.s16.f16 %rs9, %rs2; +; O0-NEXT: cvt.rzi.s16.f16 %rs10, %rs1; +; O0-NEXT: mov.b32 %r7, {%rs10, %rs9}; +; O0-NEXT: mov.b32 {%rs11, %rs12}, %r7; +; O0-NEXT: cvt.u32.u16 %r8, %rs12; +; O0-NEXT: cvt.u32.u16 %r9, %rs11; +; O0-NEXT: prmt.b32 %r10, %r9, %r8, 0x3340U; +; O0-NEXT: prmt.b32 %r11, %r10, %r6, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r11; +; O0-NEXT: ret; +; +; O3-LABEL: test_fptosi_4xhalf_to_4xi8( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<10>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0]; +; O3-NEXT: cvt.rzi.s16.f16 %rs5, %rs4; +; O3-NEXT: cvt.rzi.s16.f16 %rs6, %rs3; +; O3-NEXT: mov.b32 %r1, {%rs6, %rs5}; +; O3-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; O3-NEXT: cvt.u32.u16 %r2, %rs8; +; O3-NEXT: cvt.u32.u16 %r3, %rs7; +; O3-NEXT: prmt.b32 %r4, %r3, %r2, 0x3340U; +; O3-NEXT: cvt.rzi.s16.f16 %rs9, %rs2; +; O3-NEXT: cvt.rzi.s16.f16 %rs10, %rs1; +; O3-NEXT: mov.b32 %r5, {%rs10, %rs9}; +; O3-NEXT: mov.b32 {%rs11, %rs12}, %r5; +; O3-NEXT: cvt.u32.u16 %r6, %rs12; +; O3-NEXT: cvt.u32.u16 %r7, %rs11; +; O3-NEXT: prmt.b32 %r8, %r7, %r6, 0x3340U; +; O3-NEXT: prmt.b32 %r9, %r8, %r4, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r9; +; O3-NEXT: ret; %r = fptosi <4 x half> %a to <4 x i8> ret <4 x i8> %r } define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { -; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8( -; CHECK: { -; 
CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs5, %rs4; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs6, %rs3; -; CHECK-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; CHECK-NEXT: cvt.u32.u16 %r4, %rs8; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs7; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs2; -; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs1; -; CHECK-NEXT: mov.b32 %r7, {%rs10, %rs9}; -; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r7; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r9, %rs11; -; CHECK-NEXT: prmt.b32 %r10, %r9, %r8, 0x3340U; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r6, 0x5410U; -; CHECK-NEXT: st.param.b32 [func_retval0], %r11; -; CHECK-NEXT: ret; +; O0-LABEL: test_fptoui_4xhalf_to_4xi8( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<12>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0]; +; O0-NEXT: cvt.rzi.u16.f16 %rs5, %rs4; +; O0-NEXT: cvt.rzi.u16.f16 %rs6, %rs3; +; O0-NEXT: mov.b32 %r3, {%rs6, %rs5}; +; O0-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; O0-NEXT: cvt.u32.u16 %r4, %rs8; +; O0-NEXT: cvt.u32.u16 %r5, %rs7; +; O0-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; +; O0-NEXT: cvt.rzi.u16.f16 %rs9, %rs2; +; O0-NEXT: cvt.rzi.u16.f16 %rs10, %rs1; +; O0-NEXT: mov.b32 %r7, {%rs10, %rs9}; +; O0-NEXT: mov.b32 {%rs11, %rs12}, %r7; +; O0-NEXT: cvt.u32.u16 %r8, %rs12; +; O0-NEXT: cvt.u32.u16 %r9, %rs11; +; O0-NEXT: prmt.b32 %r10, %r9, %r8, 0x3340U; +; O0-NEXT: prmt.b32 %r11, %r10, %r6, 0x5410U; +; O0-NEXT: st.param.b32 [func_retval0], %r11; +; O0-NEXT: ret; +; +; O3-LABEL: test_fptoui_4xhalf_to_4xi8( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<10>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.v4.b16 
{%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0]; +; O3-NEXT: cvt.rzi.u16.f16 %rs5, %rs4; +; O3-NEXT: cvt.rzi.u16.f16 %rs6, %rs3; +; O3-NEXT: mov.b32 %r1, {%rs6, %rs5}; +; O3-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; O3-NEXT: cvt.u32.u16 %r2, %rs8; +; O3-NEXT: cvt.u32.u16 %r3, %rs7; +; O3-NEXT: prmt.b32 %r4, %r3, %r2, 0x3340U; +; O3-NEXT: cvt.rzi.u16.f16 %rs9, %rs2; +; O3-NEXT: cvt.rzi.u16.f16 %rs10, %rs1; +; O3-NEXT: mov.b32 %r5, {%rs10, %rs9}; +; O3-NEXT: mov.b32 {%rs11, %rs12}, %r5; +; O3-NEXT: cvt.u32.u16 %r6, %rs12; +; O3-NEXT: cvt.u32.u16 %r7, %rs11; +; O3-NEXT: prmt.b32 %r8, %r7, %r6, 0x3340U; +; O3-NEXT: prmt.b32 %r9, %r8, %r4, 0x5410U; +; O3-NEXT: st.param.b32 [func_retval0], %r9; +; O3-NEXT: ret; %r = fptoui <4 x half> %a to <4 x i8> ret <4 x i8> %r } define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: test_srem_v4i8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.b64 %rd3, [test_srem_v4i8_param_2]; -; CHECK-NEXT: ld.param.b64 %rd2, [test_srem_v4i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_srem_v4i8_param_0]; -; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: ld.b32 %r2, [%rd2]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0xbbb3U; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U; -; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; -; CHECK-NEXT: rem.s16 %rs3, %rs2, %rs1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0xaaa2U; -; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0xaaa2U; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: rem.s16 %rs6, %rs5, %rs4; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; CHECK-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; -; 
CHECK-NEXT: rem.s16 %rs9, %rs8, %rs7; -; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; -; CHECK-NEXT: rem.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; -; CHECK-NEXT: st.b32 [%rd3], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_srem_v4i8( +; O0: { +; O0-NEXT: .reg .b16 %rs<13>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-NEXT: .reg .b64 %rd<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: // %entry +; O0-NEXT: ld.param.b64 %rd3, [test_srem_v4i8_param_2]; +; O0-NEXT: ld.param.b64 %rd2, [test_srem_v4i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_srem_v4i8_param_0]; +; O0-NEXT: ld.b32 %r1, [%rd1]; +; O0-NEXT: ld.b32 %r2, [%rd2]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0xbbb3U; +; O0-NEXT: cvt.u16.u32 %rs1, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U; +; O0-NEXT: cvt.u16.u32 %rs2, %r4; +; O0-NEXT: rem.s16 %rs3, %rs2, %rs1; +; O0-NEXT: cvt.u32.u16 %r5, %rs3; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0xaaa2U; +; O0-NEXT: cvt.u16.u32 %rs4, %r6; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0xaaa2U; +; O0-NEXT: cvt.u16.u32 %rs5, %r7; +; O0-NEXT: rem.s16 %rs6, %rs5, %rs4; +; O0-NEXT: cvt.u32.u16 %r8, %rs6; +; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O0-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs7, %r10; +; O0-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs8, %r11; +; O0-NEXT: rem.s16 %rs9, %rs8, %rs7; +; O0-NEXT: cvt.u32.u16 %r12, %rs9; +; O0-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U; +; O0-NEXT: cvt.u16.u32 %rs10, %r13; +; O0-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U; +; O0-NEXT: cvt.u16.u32 %rs11, %r14; +; O0-NEXT: rem.s16 %rs12, %rs11, %rs10; +; O0-NEXT: cvt.u32.u16 %r15, %rs12; +; O0-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O0-NEXT: st.b32 [%rd3], 
%r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_srem_v4i8( +; O3: { +; O3-NEXT: .reg .b16 %rs<13>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-NEXT: .reg .b64 %rd<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: // %entry +; O3-NEXT: ld.param.b64 %rd1, [test_srem_v4i8_param_0]; +; O3-NEXT: ld.b32 %r1, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, [test_srem_v4i8_param_1]; +; O3-NEXT: ld.b32 %r2, [%rd2]; +; O3-NEXT: ld.param.b64 %rd3, [test_srem_v4i8_param_2]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0xbbb3U; +; O3-NEXT: cvt.u16.u32 %rs1, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0xbbb3U; +; O3-NEXT: cvt.u16.u32 %rs2, %r4; +; O3-NEXT: rem.s16 %rs3, %rs2, %rs1; +; O3-NEXT: cvt.u32.u16 %r5, %rs3; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0xaaa2U; +; O3-NEXT: cvt.u16.u32 %rs4, %r6; +; O3-NEXT: prmt.b32 %r7, %r1, 0, 0xaaa2U; +; O3-NEXT: cvt.u16.u32 %rs5, %r7; +; O3-NEXT: rem.s16 %rs6, %rs5, %rs4; +; O3-NEXT: cvt.u32.u16 %r8, %rs6; +; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O3-NEXT: prmt.b32 %r10, %r2, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs7, %r10; +; O3-NEXT: prmt.b32 %r11, %r1, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs8, %r11; +; O3-NEXT: rem.s16 %rs9, %rs8, %rs7; +; O3-NEXT: cvt.u32.u16 %r12, %rs9; +; O3-NEXT: prmt.b32 %r13, %r2, 0, 0x8880U; +; O3-NEXT: cvt.u16.u32 %rs10, %r13; +; O3-NEXT: prmt.b32 %r14, %r1, 0, 0x8880U; +; O3-NEXT: cvt.u16.u32 %rs11, %r14; +; O3-NEXT: rem.s16 %rs12, %rs11, %rs10; +; O3-NEXT: cvt.u32.u16 %r15, %rs12; +; O3-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; +; O3-NEXT: st.b32 [%rd3], %r17; +; O3-NEXT: ret; entry: %t57 = load <4 x i8>, ptr %a, align 4 %t59 = load <4 x i8>, ptr %b, align 4 @@ -1328,52 +2135,97 @@ entry: ;; Ideally we want to split it into element-wise ops, but legalizer can't handle ;; odd-sized vectors. TL;DR; don't use odd-sized vectors of v8. 
define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: test_srem_v3i8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<20>; -; CHECK-NEXT: .reg .b32 %r<14>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.b64 %rd3, [test_srem_v3i8_param_2]; -; CHECK-NEXT: ld.param.b64 %rd2, [test_srem_v3i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_srem_v3i8_param_0]; -; CHECK-NEXT: ld.b8 %rs1, [%rd1]; -; CHECK-NEXT: ld.b8 %rs2, [%rd1+1]; -; CHECK-NEXT: shl.b16 %rs3, %rs2, 8; -; CHECK-NEXT: or.b16 %rs4, %rs3, %rs1; -; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; -; CHECK-NEXT: ld.s8 %rs5, [%rd1+2]; -; CHECK-NEXT: ld.b8 %rs6, [%rd2]; -; CHECK-NEXT: ld.b8 %rs7, [%rd2+1]; -; CHECK-NEXT: shl.b16 %rs8, %rs7, 8; -; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r2, %rs9; -; CHECK-NEXT: ld.s8 %rs10, [%rd2+2]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r3; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U; -; CHECK-NEXT: cvt.u16.u32 %rs12, %r4; -; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs13; -; CHECK-NEXT: prmt.b32 %r6, %r2, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs14, %r6; -; CHECK-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U; -; CHECK-NEXT: cvt.u16.u32 %rs15, %r7; -; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs16; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; -; CHECK-NEXT: // implicit-def: %r11 -; CHECK-NEXT: // implicit-def: %r12 -; CHECK-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U; -; CHECK-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U; -; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10; -; CHECK-NEXT: mov.b32 {%rs18, _}, %r13; -; CHECK-NEXT: st.b8 [%rd3], %rs18; -; CHECK-NEXT: shr.u16 %rs19, %rs18, 8; -; CHECK-NEXT: st.b8 [%rd3+1], %rs19; -; CHECK-NEXT: st.b8 [%rd3+2], %rs17; -; CHECK-NEXT: ret; +; O0-LABEL: test_srem_v3i8( +; O0: { +; O0-NEXT: .reg .b16 %rs<20>; +; O0-NEXT: .reg .b32 %r<14>; +; O0-NEXT: .reg .b64 %rd<4>; +; 
O0-EMPTY: +; O0-NEXT: // %bb.0: // %entry +; O0-NEXT: ld.param.b64 %rd3, [test_srem_v3i8_param_2]; +; O0-NEXT: ld.param.b64 %rd2, [test_srem_v3i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_srem_v3i8_param_0]; +; O0-NEXT: ld.b8 %rs1, [%rd1]; +; O0-NEXT: ld.b8 %rs2, [%rd1+1]; +; O0-NEXT: shl.b16 %rs3, %rs2, 8; +; O0-NEXT: or.b16 %rs4, %rs3, %rs1; +; O0-NEXT: cvt.u32.u16 %r1, %rs4; +; O0-NEXT: ld.s8 %rs5, [%rd1+2]; +; O0-NEXT: ld.b8 %rs6, [%rd2]; +; O0-NEXT: ld.b8 %rs7, [%rd2+1]; +; O0-NEXT: shl.b16 %rs8, %rs7, 8; +; O0-NEXT: or.b16 %rs9, %rs8, %rs6; +; O0-NEXT: cvt.u32.u16 %r2, %rs9; +; O0-NEXT: ld.s8 %rs10, [%rd2+2]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs11, %r3; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U; +; O0-NEXT: cvt.u16.u32 %rs12, %r4; +; O0-NEXT: rem.s16 %rs13, %rs12, %rs11; +; O0-NEXT: cvt.u32.u16 %r5, %rs13; +; O0-NEXT: prmt.b32 %r6, %r2, 0, 0x8880U; +; O0-NEXT: cvt.u16.u32 %rs14, %r6; +; O0-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U; +; O0-NEXT: cvt.u16.u32 %rs15, %r7; +; O0-NEXT: rem.s16 %rs16, %rs15, %rs14; +; O0-NEXT: cvt.u32.u16 %r8, %rs16; +; O0-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O0-NEXT: // implicit-def: %r11 +; O0-NEXT: // implicit-def: %r12 +; O0-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U; +; O0-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U; +; O0-NEXT: rem.s16 %rs17, %rs5, %rs10; +; O0-NEXT: cvt.u16.u32 %rs18, %r13; +; O0-NEXT: st.b8 [%rd3], %rs18; +; O0-NEXT: shr.u16 %rs19, %rs18, 8; +; O0-NEXT: st.b8 [%rd3+1], %rs19; +; O0-NEXT: st.b8 [%rd3+2], %rs17; +; O0-NEXT: ret; +; +; O3-LABEL: test_srem_v3i8( +; O3: { +; O3-NEXT: .reg .b16 %rs<20>; +; O3-NEXT: .reg .b32 %r<14>; +; O3-NEXT: .reg .b64 %rd<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: // %entry +; O3-NEXT: ld.param.b64 %rd1, [test_srem_v3i8_param_0]; +; O3-NEXT: ld.b8 %rs1, [%rd1]; +; O3-NEXT: ld.b8 %rs2, [%rd1+1]; +; O3-NEXT: shl.b16 %rs3, %rs2, 8; +; O3-NEXT: or.b16 %rs4, %rs3, %rs1; +; O3-NEXT: cvt.u32.u16 %r1, %rs4; +; O3-NEXT: ld.s8 %rs5, [%rd1+2]; +; O3-NEXT: 
ld.param.b64 %rd2, [test_srem_v3i8_param_1]; +; O3-NEXT: ld.b8 %rs6, [%rd2]; +; O3-NEXT: ld.b8 %rs7, [%rd2+1]; +; O3-NEXT: shl.b16 %rs8, %rs7, 8; +; O3-NEXT: or.b16 %rs9, %rs8, %rs6; +; O3-NEXT: cvt.u32.u16 %r2, %rs9; +; O3-NEXT: ld.s8 %rs10, [%rd2+2]; +; O3-NEXT: ld.param.b64 %rd3, [test_srem_v3i8_param_2]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs11, %r3; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x9991U; +; O3-NEXT: cvt.u16.u32 %rs12, %r4; +; O3-NEXT: rem.s16 %rs13, %rs12, %rs11; +; O3-NEXT: cvt.u32.u16 %r5, %rs13; +; O3-NEXT: prmt.b32 %r6, %r2, 0, 0x8880U; +; O3-NEXT: cvt.u16.u32 %rs14, %r6; +; O3-NEXT: prmt.b32 %r7, %r1, 0, 0x8880U; +; O3-NEXT: cvt.u16.u32 %rs15, %r7; +; O3-NEXT: rem.s16 %rs16, %rs15, %rs14; +; O3-NEXT: cvt.u32.u16 %r8, %rs16; +; O3-NEXT: prmt.b32 %r9, %r8, %r5, 0x3340U; +; O3-NEXT: prmt.b32 %r10, %r11, %r12, 0x3340U; +; O3-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U; +; O3-NEXT: rem.s16 %rs17, %rs5, %rs10; +; O3-NEXT: st.b8 [%rd3+2], %rs17; +; O3-NEXT: cvt.u16.u32 %rs18, %r13; +; O3-NEXT: st.b8 [%rd3], %rs18; +; O3-NEXT: shr.u16 %rs19, %rs18, 8; +; O3-NEXT: st.b8 [%rd3+1], %rs19; +; O3-NEXT: ret; entry: %t57 = load <3 x i8>, ptr %a, align 1 %t59 = load <3 x i8>, ptr %b, align 1 @@ -1383,39 +2235,73 @@ entry: } define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { -; CHECK-LABEL: test_sext_v4i1_to_v4i8( -; CHECK: { -; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; -; CHECK-NEXT: .reg .b64 %rd<4>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.b64 %rd3, [test_sext_v4i1_to_v4i8_param_2]; -; CHECK-NEXT: ld.param.b64 %rd2, [test_sext_v4i1_to_v4i8_param_1]; -; CHECK-NEXT: ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; -; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: ld.b32 %r2, [%rd2]; -; CHECK-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; -; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; -; CHECK-NEXT: setp.gt.u32 %p1, %r4, %r3; -; CHECK-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; -; 
CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; -; CHECK-NEXT: setp.gt.u32 %p2, %r6, %r5; -; CHECK-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; -; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; -; CHECK-NEXT: setp.gt.u32 %p3, %r8, %r7; -; CHECK-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; -; CHECK-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; -; CHECK-NEXT: setp.gt.u32 %p4, %r10, %r9; -; CHECK-NEXT: selp.b32 %r11, -1, 0, %p4; -; CHECK-NEXT: selp.b32 %r12, -1, 0, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; -; CHECK-NEXT: selp.b32 %r14, -1, 0, %p2; -; CHECK-NEXT: selp.b32 %r15, -1, 0, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.b32 [%rd3], %r17; -; CHECK-NEXT: ret; +; O0-LABEL: test_sext_v4i1_to_v4i8( +; O0: { +; O0-NEXT: .reg .pred %p<5>; +; O0-NEXT: .reg .b32 %r<18>; +; O0-NEXT: .reg .b64 %rd<4>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: // %entry +; O0-NEXT: ld.param.b64 %rd3, [test_sext_v4i1_to_v4i8_param_2]; +; O0-NEXT: ld.param.b64 %rd2, [test_sext_v4i1_to_v4i8_param_1]; +; O0-NEXT: ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; +; O0-NEXT: ld.b32 %r1, [%rd1]; +; O0-NEXT: ld.b32 %r2, [%rd2]; +; O0-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O0-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O0-NEXT: setp.gt.u32 %p1, %r4, %r3; +; O0-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O0-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O0-NEXT: setp.gt.u32 %p2, %r6, %r5; +; O0-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O0-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O0-NEXT: setp.gt.u32 %p3, %r8, %r7; +; O0-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O0-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O0-NEXT: setp.gt.u32 %p4, %r10, %r9; +; O0-NEXT: selp.b32 %r11, -1, 0, %p4; +; O0-NEXT: selp.b32 %r12, -1, 0, %p3; +; O0-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O0-NEXT: selp.b32 %r14, -1, 0, %p2; +; O0-NEXT: selp.b32 %r15, -1, 0, %p1; +; O0-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O0-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O0-NEXT: st.b32 
[%rd3], %r17; +; O0-NEXT: ret; +; +; O3-LABEL: test_sext_v4i1_to_v4i8( +; O3: { +; O3-NEXT: .reg .pred %p<5>; +; O3-NEXT: .reg .b32 %r<18>; +; O3-NEXT: .reg .b64 %rd<4>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: // %entry +; O3-NEXT: ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; +; O3-NEXT: ld.b32 %r1, [%rd1]; +; O3-NEXT: ld.param.b64 %rd2, [test_sext_v4i1_to_v4i8_param_1]; +; O3-NEXT: ld.b32 %r2, [%rd2]; +; O3-NEXT: ld.param.b64 %rd3, [test_sext_v4i1_to_v4i8_param_2]; +; O3-NEXT: prmt.b32 %r3, %r2, 0, 0x7770U; +; O3-NEXT: prmt.b32 %r4, %r1, 0, 0x7770U; +; O3-NEXT: setp.gt.u32 %p1, %r4, %r3; +; O3-NEXT: prmt.b32 %r5, %r2, 0, 0x7771U; +; O3-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; O3-NEXT: setp.gt.u32 %p2, %r6, %r5; +; O3-NEXT: prmt.b32 %r7, %r2, 0, 0x7772U; +; O3-NEXT: prmt.b32 %r8, %r1, 0, 0x7772U; +; O3-NEXT: setp.gt.u32 %p3, %r8, %r7; +; O3-NEXT: prmt.b32 %r9, %r2, 0, 0x7773U; +; O3-NEXT: prmt.b32 %r10, %r1, 0, 0x7773U; +; O3-NEXT: setp.gt.u32 %p4, %r10, %r9; +; O3-NEXT: selp.b32 %r11, -1, 0, %p4; +; O3-NEXT: selp.b32 %r12, -1, 0, %p3; +; O3-NEXT: prmt.b32 %r13, %r12, %r11, 0x3340U; +; O3-NEXT: selp.b32 %r14, -1, 0, %p2; +; O3-NEXT: selp.b32 %r15, -1, 0, %p1; +; O3-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; +; O3-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; +; O3-NEXT: st.b32 [%rd3], %r17; +; O3-NEXT: ret; entry: %t1 = load <4 x i8>, ptr %a, align 4 %t2 = load <4 x i8>, ptr %b, align 4 diff --git a/llvm/test/CodeGen/NVPTX/prmt-const-folding.ll b/llvm/test/CodeGen/NVPTX/prmt-const-folding.ll new file mode 100644 index 0000000000000..7afead127c84f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/prmt-const-folding.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -verify-machineinstrs -mcpu=sm_50 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -verify-machineinstrs -mcpu=sm_50 | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +@g = global i32 0 + +define void @test_prmt_f4e() { 
+; CHECK-LABEL: test_prmt_f4e( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 50462976; +; CHECK-NEXT: st.volatile.global.b32 [g], 67305985; +; CHECK-NEXT: st.volatile.global.b32 [g], 84148994; +; CHECK-NEXT: st.volatile.global.b32 [g], 100992003; +; CHECK-NEXT: st.volatile.global.b32 [g], 50462976; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.f4e(i32 u0x03020100, i32 u0x07060504, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.f4e(i32 u0x03020100, i32 u0x07060504, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.f4e(i32 u0x03020100, i32 u0x07060504, i32 u0x2) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt.f4e(i32 u0x03020100, i32 u0x07060504, i32 u0x3) + store volatile i32 %v4, ptr @g + %v5 = call i32 @llvm.nvvm.prmt.f4e(i32 u0x03020100, i32 u0x07060504, i32 u0x4) + store volatile i32 %v5, ptr @g + ret void +} + +define void @test_prmt_b4e() { +; CHECK-LABEL: test_prmt_b4e( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 84281088; +; CHECK-NEXT: st.volatile.global.b32 [g], 101122049; +; CHECK-NEXT: st.volatile.global.b32 [g], 117440770; +; CHECK-NEXT: st.volatile.global.b32 [g], 66051; +; CHECK-NEXT: st.volatile.global.b32 [g], 84281088; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.b4e(i32 u0x03020100, i32 u0x07060504, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.b4e(i32 u0x03020100, i32 u0x07060504, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.b4e(i32 u0x03020100, i32 u0x07060504, i32 u0x2) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt.b4e(i32 u0x03020100, i32 u0x07060504, i32 u0x3) + store volatile i32 %v4, ptr @g + %v5 = call i32 @llvm.nvvm.prmt.b4e(i32 u0x03020100, i32 u0x07060504, i32 u0x4) + store volatile i32 %v5, ptr @g + ret void +} + +define void 
@test_prmt_ecl() { +; CHECK-LABEL: test_prmt_ecl( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 50462976; +; CHECK-NEXT: st.volatile.global.b32 [g], 50462977; +; CHECK-NEXT: st.volatile.global.b32 [g], 50463234; +; CHECK-NEXT: st.volatile.global.b32 [g], 50529027; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.ecl(i32 u0x03020100, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.ecl(i32 u0x03020100, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.ecl(i32 u0x03020100, i32 u0x2) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt.ecl(i32 u0x03020100, i32 u0x3) + store volatile i32 %v4, ptr @g + ret void +} + +define void @test_prmt_ecr() { +; CHECK-LABEL: test_prmt_ecr( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 0; +; CHECK-NEXT: st.volatile.global.b32 [g], 16843008; +; CHECK-NEXT: st.volatile.global.b32 [g], 33685760; +; CHECK-NEXT: st.volatile.global.b32 [g], 50462976; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.ecr(i32 u0x03020100, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.ecr(i32 u0x03020100, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.ecr(i32 u0x03020100, i32 u0x2) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt.ecr(i32 u0x03020100, i32 u0x3) + store volatile i32 %v4, ptr @g + ret void +} + +define void @test_prmt_rc8() { +; CHECK-LABEL: test_prmt_rc8( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 0; +; CHECK-NEXT: st.volatile.global.b32 [g], 16843009; +; CHECK-NEXT: st.volatile.global.b32 [g], 33686018; +; CHECK-NEXT: st.volatile.global.b32 [g], 50529027; +; CHECK-NEXT: st.volatile.global.b32 [g], 0; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.rc8(i32 u0x03020100, i32 u0x0) + store volatile 
i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.rc8(i32 u0x03020100, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.rc8(i32 u0x03020100, i32 u0x2) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt.rc8(i32 u0x03020100, i32 u0x3) + store volatile i32 %v4, ptr @g + %v5 = call i32 @llvm.nvvm.prmt.rc8(i32 u0x03020100, i32 u0x4) + store volatile i32 %v5, ptr @g + ret void +} + +define void @test_prmt_rc16() { +; CHECK-LABEL: test_prmt_rc16( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 16777472; +; CHECK-NEXT: st.volatile.global.b32 [g], 50463490; +; CHECK-NEXT: st.volatile.global.b32 [g], 16777472; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt.rc16(i32 u0x03020100, i32 u0x0) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt.rc16(i32 u0x03020100, i32 u0x1) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt.rc16(i32 u0x03020100, i32 u0x2) + store volatile i32 %v3, ptr @g + ret void +} + + +define void @test_prmt_basic() { +; CHECK-LABEL: test_prmt_basic( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: st.volatile.global.b32 [g], 66051; +; CHECK-NEXT: st.volatile.global.b32 [g], 117507841; +; CHECK-NEXT: st.volatile.global.b32 [g], 1146447479; +; CHECK-NEXT: st.volatile.global.b32 [g], 0; +; CHECK-NEXT: st.volatile.global.b32 [g], -16711936; +; CHECK-NEXT: ret; + %v1 = call i32 @llvm.nvvm.prmt(i32 u0x03020100, i32 u0x07060504, i32 u0x0123) + store volatile i32 %v1, ptr @g + %v2 = call i32 @llvm.nvvm.prmt(i32 u0x03020100, i32 u0x07060504, i32 u0x7171) + store volatile i32 %v2, ptr @g + %v3 = call i32 @llvm.nvvm.prmt(i32 u0x33221100, i32 u0x77665544, i32 u0x4567) + store volatile i32 %v3, ptr @g + %v4 = call i32 @llvm.nvvm.prmt(i32 u0x33221100, i32 u0x77665544, i32 u0xBA98) + store volatile i32 %v4, ptr @g + %v5 = call i32 @llvm.nvvm.prmt(i32 u0xF322F100, i32 u0x77665544, i32 u0xBA98) + store 
volatile i32 %v5, ptr @g + ret void +} From 3fa07ed5b38774656a2cff1bebc1785ce8e7feb8 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 17 Jul 2025 11:12:29 -0700 Subject: [PATCH 218/813] Rename config.host_os to config.target_os. config.host_os is derived from CMAKE_SYSTEM_NAME which specifies the target. See: https://cmake.org/cmake/help/latest/variable/CMAKE_SYSTEM_NAME.html To reduce confusion, rename it to config.target_os. The variable name config.target_os was already being used by the Orc tests. Rename it to config.orc_test_target_os with a FIXME to remove. Reviewers: JDevlieghere, MaskRay Reviewed By: MaskRay Pull Request: https://github.com/llvm/llvm-project/pull/149015 --- compiler-rt/lib/tysan/lit.cfg | 2 +- compiler-rt/test/asan/lit.cfg.py | 26 ++++++------- compiler-rt/test/asan_abi/lit.cfg.py | 2 +- compiler-rt/test/builtins/Unit/lit.cfg.py | 6 +-- compiler-rt/test/builtins/lit.cfg.py | 2 +- compiler-rt/test/ctx_profile/lit.cfg.py | 2 +- compiler-rt/test/dfsan/lit.cfg.py | 2 +- compiler-rt/test/fuzzer/lit.cfg.py | 2 +- compiler-rt/test/gwp_asan/lit.cfg.py | 2 +- compiler-rt/test/hwasan/lit.cfg.py | 2 +- compiler-rt/test/lit.common.cfg.py | 38 +++++++++---------- compiler-rt/test/lsan/lit.common.cfg.py | 12 +++--- compiler-rt/test/memprof/lit.cfg.py | 2 +- compiler-rt/test/metadata/lit.cfg.py | 2 +- compiler-rt/test/msan/lit.cfg.py | 6 +-- compiler-rt/test/nsan/lit.cfg.py | 2 +- compiler-rt/test/orc/lit.cfg.py | 12 +++--- compiler-rt/test/orc/lit.site.cfg.py.in | 3 +- compiler-rt/test/profile/lit.cfg.py | 10 ++--- .../test/rtsan/Unit/lit.site.cfg.py.in | 2 +- compiler-rt/test/rtsan/lit.cfg.py | 8 ++-- compiler-rt/test/safestack/lit.cfg.py | 2 +- .../test/sanitizer_common/lit.common.cfg.py | 10 ++--- compiler-rt/test/scudo/lit.cfg.py | 2 +- compiler-rt/test/shadowcallstack/lit.cfg.py | 2 +- compiler-rt/test/tsan/Unit/lit.site.cfg.py.in | 2 +- .../test/tsan/libdispatch/lit.local.cfg.py | 2 +- compiler-rt/test/tsan/lit.cfg.py | 10 ++--- 
compiler-rt/test/tysan/lit.cfg.py | 8 ++-- .../TypeCheck/Function/lit.local.cfg.py | 2 +- compiler-rt/test/ubsan/lit.common.cfg.py | 4 +- .../test/ubsan_minimal/lit.common.cfg.py | 2 +- compiler-rt/test/xray/lit.cfg.py | 8 ++-- compiler-rt/unittests/lit.common.unit.cfg.py | 2 +- .../unittests/lit.common.unit.configured.in | 2 +- lldb/test/API/lit.cfg.py | 4 +- lldb/test/API/lit.site.cfg.py.in | 2 +- llvm/test/lit.cfg.py | 2 +- llvm/test/lit.site.cfg.py.in | 2 +- 39 files changed, 107 insertions(+), 106 deletions(-) diff --git a/compiler-rt/lib/tysan/lit.cfg b/compiler-rt/lib/tysan/lit.cfg index e3ef6c9c97147..c906c03cc3fb2 100644 --- a/compiler-rt/lib/tysan/lit.cfg +++ b/compiler-rt/lib/tysan/lit.cfg @@ -27,7 +27,7 @@ config.substitutions.append( ("%clangxx_tysan ", build_invocation(clang_tysan_cx config.suffixes = ['.c', '.cc', '.cpp'] # TypeSanitizer tests are currently supported on Linux only. -if config.host_os not in ['Linux']: +if config.target_os not in ['Linux']: config.unsupported = True if config.target_arch != 'aarch64': diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py index 3da073332c458..96201e679b0a3 100644 --- a/compiler-rt/test/asan/lit.cfg.py +++ b/compiler-rt/test/asan/lit.cfg.py @@ -28,7 +28,7 @@ def get_required_attr(config, attr_name): # tests to prevent regressions. # Currently, detect_leaks for asan tests only work on Intel MacOS. if ( - config.host_os == "Darwin" + config.target_os == "Darwin" and config.apple_platform == "osx" and config.target_arch == "x86_64" ): @@ -45,7 +45,7 @@ def get_required_attr(config, attr_name): # Setup source root. 
config.test_source_root = os.path.dirname(__file__) -if config.host_os not in ["FreeBSD", "NetBSD"]: +if config.target_os not in ["FreeBSD", "NetBSD"]: libdl_flag = "-ldl" else: libdl_flag = "" @@ -125,17 +125,17 @@ def build_invocation(compile_flags, with_lto=False): ("%clangxx_asan_lto ", build_invocation(clang_asan_cxxflags, True)) ) if config.asan_dynamic: - if config.host_os in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: + if config.target_os in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: shared_libasan_path = os.path.join( config.compiler_rt_libdir, "libclang_rt.asan{}.so".format(config.target_suffix), ) - elif config.host_os == "Darwin": + elif config.target_os == "Darwin": shared_libasan_path = os.path.join( config.compiler_rt_libdir, "libclang_rt.asan_{}_dynamic.dylib".format(config.apple_platform), ) - elif config.host_os == "Windows": + elif config.target_os == "Windows": shared_libasan_path = os.path.join( config.compiler_rt_libdir, "clang_rt.asan_dynamic-{}.lib".format(config.target_suffix), @@ -274,16 +274,16 @@ def build_invocation(compile_flags, with_lto=False): and (config.target_arch in ["x86_64", "i386", "i686", "aarch64"]) ) leak_detection_linux = ( - (config.host_os == "Linux") + (config.target_os == "Linux") and (not config.android) and (config.target_arch in ["x86_64", "i386", "riscv64", "loongarch64"]) ) leak_detection_mac = ( - (config.host_os == "Darwin") + (config.target_os == "Darwin") and (config.apple_platform == "osx") and (config.target_arch == "x86_64") ) -leak_detection_netbsd = (config.host_os == "NetBSD") and ( +leak_detection_netbsd = (config.target_os == "NetBSD") and ( config.target_arch in ["x86_64", "i386"] ) if ( @@ -296,7 +296,7 @@ def build_invocation(compile_flags, with_lto=False): # Add the RT libdir to PATH directly so that we can successfully run the gtest # binary to list its tests. 
-if config.host_os == "Windows": +if config.target_os == "Windows": os.environ["PATH"] = os.path.pathsep.join( [config.compiler_rt_libdir, os.environ.get("PATH", "")] ) @@ -310,10 +310,10 @@ def build_invocation(compile_flags, with_lto=False): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.suffixes.append(".mm") -if config.host_os == "Windows": +if config.target_os == "Windows": config.substitutions.append(("%fPIC", "")) config.substitutions.append(("%fPIE", "")) config.substitutions.append(("%pie", "")) @@ -323,11 +323,11 @@ def build_invocation(compile_flags, with_lto=False): config.substitutions.append(("%pie", "-pie")) # Only run the tests on supported OSs. -if config.host_os not in ["Linux", "Darwin", "FreeBSD", "SunOS", "Windows", "NetBSD"]: +if config.target_os not in ["Linux", "Darwin", "FreeBSD", "SunOS", "Windows", "NetBSD"]: config.unsupported = True if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/asan_abi/lit.cfg.py b/compiler-rt/test/asan_abi/lit.cfg.py index 5bc1881ed9c32..dd99a5373e7b6 100644 --- a/compiler-rt/test/asan_abi/lit.cfg.py +++ b/compiler-rt/test/asan_abi/lit.cfg.py @@ -68,7 +68,7 @@ def build_invocation(compile_flags): config.suffixes = ['.c', '.cpp'] -if config.host_os == 'Darwin': +if config.target_os == 'Darwin': config.suffixes.append('.mm') else: config.unsupported = True diff --git a/compiler-rt/test/builtins/Unit/lit.cfg.py b/compiler-rt/test/builtins/Unit/lit.cfg.py index c030f89c66e42..59da054848f3c 100644 --- a/compiler-rt/test/builtins/Unit/lit.cfg.py +++ b/compiler-rt/test/builtins/Unit/lit.cfg.py @@ -80,10 +80,10 @@ def get_libgcc_file_name(): config.compiler_rt_libdir, "clang_rt.builtins%s.lib " % config.target_suffix ) 
config.substitutions.append(("%librt ", base_lib)) -elif config.host_os == "Darwin": +elif config.target_os == "Darwin": base_lib = os.path.join(config.compiler_rt_libdir, "libclang_rt.osx.a ") config.substitutions.append(("%librt ", base_lib + " -lSystem ")) -elif config.host_os == "Windows": +elif config.target_os == "Windows": base_lib = os.path.join( config.compiler_rt_libdir, "libclang_rt.builtins%s.a" % config.target_suffix ) @@ -104,7 +104,7 @@ def get_libgcc_file_name(): if sys.platform in ["win32"] and execute_external: # Don't pass dosish path separator to msys bash.exe. base_lib = base_lib.replace("\\", "/") - if config.host_os == "Haiku": + if config.target_os == "Haiku": config.substitutions.append(("%librt ", base_lib + " -lroot ")) else: config.substitutions.append(("%librt ", base_lib + " -lc -lm ")) diff --git a/compiler-rt/test/builtins/lit.cfg.py b/compiler-rt/test/builtins/lit.cfg.py index 9300488c8428d..6491f4735b9e6 100644 --- a/compiler-rt/test/builtins/lit.cfg.py +++ b/compiler-rt/test/builtins/lit.cfg.py @@ -21,7 +21,7 @@ ("%clang ", " " + config.clang + " " + " ".join(extra_flags) + " ") ) -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.substitutions.append( ("%macos_version_major", str(config.darwin_osx_version[0])) ) diff --git a/compiler-rt/test/ctx_profile/lit.cfg.py b/compiler-rt/test/ctx_profile/lit.cfg.py index 74d9bfd11ae28..75367d95a47bd 100644 --- a/compiler-rt/test/ctx_profile/lit.cfg.py +++ b/compiler-rt/test/ctx_profile/lit.cfg.py @@ -7,7 +7,7 @@ import lit.formats # Only run the tests on supported OSs. 
-if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/dfsan/lit.cfg.py b/compiler-rt/test/dfsan/lit.cfg.py index e947c51f99a5b..b26ff3e367942 100644 --- a/compiler-rt/test/dfsan/lit.cfg.py +++ b/compiler-rt/test/dfsan/lit.cfg.py @@ -25,5 +25,5 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp"] # DataFlowSanitizer tests are currently supported on Linux only. -if not (config.host_os in ["Linux"] and config.target_arch in ["aarch64", "x86_64", "loongarch64"]): +if not (config.target_os in ["Linux"] and config.target_arch in ["aarch64", "x86_64", "loongarch64"]): config.unsupported = True diff --git a/compiler-rt/test/fuzzer/lit.cfg.py b/compiler-rt/test/fuzzer/lit.cfg.py index 75d4cf2e4c529..1689f53d0b021 100644 --- a/compiler-rt/test/fuzzer/lit.cfg.py +++ b/compiler-rt/test/fuzzer/lit.cfg.py @@ -149,5 +149,5 @@ def generate_compiler_cmd(is_cpp=True, fuzzer_enabled=True, msan_enabled=False): if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/gwp_asan/lit.cfg.py b/compiler-rt/test/gwp_asan/lit.cfg.py index 7f68682162e3f..1592cf400023e 100644 --- a/compiler-rt/test/gwp_asan/lit.cfg.py +++ b/compiler-rt/test/gwp_asan/lit.cfg.py @@ -67,5 +67,5 @@ def build_invocation(compile_flags): ) # GWP-ASan tests are currently supported on Linux only. -if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/hwasan/lit.cfg.py b/compiler-rt/test/hwasan/lit.cfg.py index bbf23e683240a..3a1c8e1466aea 100644 --- a/compiler-rt/test/hwasan/lit.cfg.py +++ b/compiler-rt/test/hwasan/lit.cfg.py @@ -86,5 +86,5 @@ def build_invocation(compile_flags): # Default test suffixes. 
config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Linux", "Android"] or not config.has_lld: +if config.target_os not in ["Linux", "Android"] or not config.has_lld: config.unsupported = True diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 3c7323ecf4473..8328b407dcc36 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -66,7 +66,7 @@ def find_compiler_libdir(): # Fall back for older AppleClang that doesn't support `-print-runtime-dir` # Note `-print-file-name=` was broken for Apple # platforms so we can't use that approach here (see https://reviews.llvm.org/D101682). - if config.host_os == "Darwin": + if config.target_os == "Darwin": lib_dir, _ = get_path_from_clang(["-print-file-name=lib"], allow_failure=False) runtime_dir = os.path.join(lib_dir, "darwin") if not os.path.exists(runtime_dir): @@ -312,7 +312,7 @@ def push_dynamic_library_lookup_path(config, new_path): if platform.system() == "Windows" and target_is_msvc: config.environment["LIB"] = os.environ["LIB"] -config.available_features.add(config.host_os.lower()) +config.available_features.add(config.target_os.lower()) if config.target_triple.startswith("ppc") or config.target_triple.startswith("powerpc"): config.available_features.add("ppc") @@ -344,7 +344,7 @@ def push_dynamic_library_lookup_path(config, new_path): ) ) -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": nb_commands_dir = os.path.join( config.compiler_rt_src_root, "test", "sanitizer_common", "netbsd_commands" ) @@ -395,7 +395,7 @@ def get_ios_commands_dir(): if sanitizer not in config.environment: config.environment[sanitizer] = symbolizer_path -env_utility = "/opt/freeware/bin/env" if config.host_os == "AIX" else "env" +env_utility = "/opt/freeware/bin/env" if config.target_os == "AIX" else "env" env_unset_command = " ".join(f"-u {var}" for var in tool_symbolizer_path_list) config.substitutions.append( 
("%env_unset_tool_symbolizer_path", f"{env_utility} {env_unset_command}") @@ -410,7 +410,7 @@ def get_ios_commands_dir(): lit_config.warning("%device_rm is not implemented") config.substitutions.append(("%device_rm", "echo ")) config.compile_wrapper = "" -elif config.host_os == "Darwin" and config.apple_platform != "osx": +elif config.target_os == "Darwin" and config.apple_platform != "osx": # Darwin tests can be targetting macOS, a device or a simulator. All devices # are declared as "ios", even for iOS derivatives (tvOS, watchOS). Similarly, # all simulators are "iossim". See the table below. @@ -498,7 +498,7 @@ def get_ios_commands_dir(): config.compile_wrapper = "" # Define CHECK-%os to check for OS-dependent output. -config.substitutions.append(("CHECK-%os", ("CHECK-" + config.host_os))) +config.substitutions.append(("CHECK-%os", ("CHECK-" + config.target_os))) # Define %arch to check for architecture-dependent output. config.substitutions.append(("%arch", (config.host_arch))) @@ -519,7 +519,7 @@ def get_ios_commands_dir(): config.available_features.add(target_arch + "-target-arch") if target_arch in ["x86_64", "i386"]: config.available_features.add("x86-target-arch") - config.available_features.add(target_arch + "-" + config.host_os.lower()) + config.available_features.add(target_arch + "-" + config.target_os.lower()) compiler_rt_debug = getattr(config, "compiler_rt_debug", False) if not compiler_rt_debug: @@ -565,7 +565,7 @@ def get_ios_commands_dir(): ("%darwin_min_target_with_tls_support", "%min_macos_deployment_target=10.12") ) -if config.host_os == "Darwin": +if config.target_os == "Darwin": osx_version = (10, 0, 0) try: osx_version = subprocess.check_output( @@ -708,7 +708,7 @@ def get_macos_aligned_version(macos_vers): config.substitutions.append(("%push_to_device", "echo ")) config.substitutions.append(("%adb_shell", "echo ")) -if config.host_os == "Linux": +if config.target_os == "Linux": def add_glibc_versions(ver_string): if config.android: return 
@@ -806,10 +806,10 @@ def is_windows_lto_supported(): return os.path.exists(os.path.join(config.llvm_tools_dir, "lld-link.exe")) -if config.host_os == "Darwin" and is_darwin_lto_supported(): +if config.target_os == "Darwin" and is_darwin_lto_supported(): config.lto_supported = True config.lto_flags = ["-Wl,-lto_library," + liblto_path()] -elif config.host_os in ["Linux", "FreeBSD", "NetBSD"]: +elif config.target_os in ["Linux", "FreeBSD", "NetBSD"]: config.lto_supported = False if config.use_lld and is_lld_lto_supported(): config.lto_supported = True @@ -822,7 +822,7 @@ def is_windows_lto_supported(): config.lto_flags = ["-fuse-ld=lld"] else: config.lto_flags = ["-fuse-ld=gold"] -elif config.host_os == "Windows" and is_windows_lto_supported(): +elif config.target_os == "Windows" and is_windows_lto_supported(): config.lto_supported = True config.lto_flags = ["-fuse-ld=lld"] else: @@ -871,7 +871,7 @@ def is_windows_lto_supported(): # Note that substitutions with numbers have to be defined first to avoid # being subsumed by substitutions with smaller postfix. 
for postfix in ["2", "1", ""]: - if config.host_os == "Darwin": + if config.target_os == "Darwin": config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -884,7 +884,7 @@ def is_windows_lto_supported(): "-install_name @rpath/`basename %dynamiclib{}`".format(postfix), ) ) - elif config.host_os in ("FreeBSD", "NetBSD", "OpenBSD"): + elif config.target_os in ("FreeBSD", "NetBSD", "OpenBSD"): config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -893,7 +893,7 @@ def is_windows_lto_supported(): ) ) config.substitutions.append(("%ld_flags_rpath_so" + postfix, "")) - elif config.host_os == "Linux": + elif config.target_os == "Linux": config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -901,7 +901,7 @@ def is_windows_lto_supported(): ) ) config.substitutions.append(("%ld_flags_rpath_so" + postfix, "")) - elif config.host_os == "SunOS": + elif config.target_os == "SunOS": config.substitutions.append( ( "%ld_flags_rpath_exe" + postfix, @@ -923,7 +923,7 @@ def is_windows_lto_supported(): config.substitutions.append(("%xdynamiclib_namespec", "%basename_t.dynamic")) config.default_sanitizer_opts = [] -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. 
config.default_sanitizer_opts += ["abort_on_error=0"] @@ -983,7 +983,7 @@ def is_windows_lto_supported(): elif config.use_lld and (not config.has_lld): config.unsupported = True -if config.host_os == "Darwin": +if config.target_os == "Darwin": if getattr(config, "darwin_linker_version", None): extra_cflags += ["-mlinker-version=" + config.darwin_linker_version] @@ -998,7 +998,7 @@ def is_windows_lto_supported(): ) config.target_cflags = " " + " ".join(target_cflags + extra_cflags) + " " -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.substitutions.append( ( "%get_pid_from_output", diff --git a/compiler-rt/test/lsan/lit.common.cfg.py b/compiler-rt/test/lsan/lit.common.cfg.py index 9426b7d108bbf..1e2679438b114 100644 --- a/compiler-rt/test/lsan/lit.common.cfg.py +++ b/compiler-rt/test/lsan/lit.common.cfg.py @@ -34,7 +34,7 @@ def get_required_attr(config, attr_name): config.name = "LeakSanitizer-AddressSanitizer" lsan_cflags = ["-fsanitize=address"] config.available_features.add("asan") - if config.host_os == "NetBSD": + if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) elif lsan_lit_test_mode == "HWAddressSanitizer": config.name = "LeakSanitizer-HWAddressSanitizer" @@ -42,7 +42,7 @@ def get_required_attr(config, attr_name): if target_arch == "x86_64": lsan_cflags = lsan_cflags + ["-fsanitize-hwaddress-experimental-aliasing"] config.available_features.add("hwasan") - if config.host_os == "NetBSD": + if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) else: lit_config.fatal("Unknown LSan test mode: %r" % lsan_lit_test_mode) @@ -51,7 +51,7 @@ def get_required_attr(config, attr_name): # Platform-specific default LSAN_OPTIONS for lit tests. 
default_common_opts_str = ":".join(list(config.default_sanitizer_opts)) default_lsan_opts = default_common_opts_str + ":detect_leaks=1" -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. # Also, make sure we do not overwhelm the syslog while testing. @@ -101,7 +101,7 @@ def build_invocation(compile_flags): ) supported_linux = ( (not config.android) - and config.host_os == "Linux" + and config.target_os == "Linux" and config.host_arch in [ "aarch64", @@ -117,8 +117,8 @@ def build_invocation(compile_flags): "loongarch64", ] ) -supported_darwin = config.host_os == "Darwin" and config.target_arch in ["x86_64"] -supported_netbsd = config.host_os == "NetBSD" and config.target_arch in [ +supported_darwin = config.target_os == "Darwin" and config.target_arch in ["x86_64"] +supported_netbsd = config.target_os == "NetBSD" and config.target_arch in [ "x86_64", "i386", ] diff --git a/compiler-rt/test/memprof/lit.cfg.py b/compiler-rt/test/memprof/lit.cfg.py index 4057da0c65b51..e28507be4dc9e 100644 --- a/compiler-rt/test/memprof/lit.cfg.py +++ b/compiler-rt/test/memprof/lit.cfg.py @@ -106,7 +106,7 @@ def build_invocation(compile_flags): config.substitutions.append(("%pie", "-pie")) # Only run the tests on supported OSs. -if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True if not config.parallelism_group: diff --git a/compiler-rt/test/metadata/lit.cfg.py b/compiler-rt/test/metadata/lit.cfg.py index 73ba27ad3a4e2..9980e93b3a6ec 100644 --- a/compiler-rt/test/metadata/lit.cfg.py +++ b/compiler-rt/test/metadata/lit.cfg.py @@ -5,5 +5,5 @@ config.suffixes = [".cpp"] # Binary metadata is currently emitted only for ELF binaries # and sizes of stack arguments depend on the arch. 
-if config.host_os not in ["Linux"] or config.target_arch not in ["x86_64"]: +if config.target_os not in ["Linux"] or config.target_arch not in ["x86_64"]: config.unsupported = True diff --git a/compiler-rt/test/msan/lit.cfg.py b/compiler-rt/test/msan/lit.cfg.py index 361be79e2557e..d9e83c67b84c8 100644 --- a/compiler-rt/test/msan/lit.cfg.py +++ b/compiler-rt/test/msan/lit.cfg.py @@ -20,7 +20,7 @@ + config.debug_info_flags ) # Some Msan tests leverage backtrace() which requires libexecinfo on FreeBSD. -if config.host_os == "FreeBSD": +if config.target_os == "FreeBSD": clang_msan_cflags += ["-lexecinfo", "-fPIC"] # On SystemZ we need -mbackchain to make the fast unwinder work. if config.target_arch == "s390x": @@ -44,7 +44,7 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Linux", "NetBSD", "FreeBSD"]: +if config.target_os not in ["Linux", "NetBSD", "FreeBSD"]: config.unsupported = True # For mips64, mips64el we have forced store_context_size to 1 because these @@ -55,5 +55,5 @@ def build_invocation(compile_flags): else: config.substitutions.append(("CHECK-%short-stack", "CHECK-FULL-STACK")) -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/nsan/lit.cfg.py b/compiler-rt/test/nsan/lit.cfg.py index 2d67911a7d5d8..8225c85c41b81 100644 --- a/compiler-rt/test/nsan/lit.cfg.py +++ b/compiler-rt/test/nsan/lit.cfg.py @@ -32,5 +32,5 @@ def build_invocation(compile_flags): ) # NSan tests are currently supported on Linux only. 
-if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/orc/lit.cfg.py b/compiler-rt/test/orc/lit.cfg.py index 7a6eb4e7de325..3c3badb642ff7 100644 --- a/compiler-rt/test/orc/lit.cfg.py +++ b/compiler-rt/test/orc/lit.cfg.py @@ -18,11 +18,11 @@ config.available_features.add("host-arch-compatible") # If the target OS hasn't been set then assume host. -if not config.target_os: - config.target_os = config.host_os +if not config.orc_test_target_os: + config.orc_test_target_os = config.target_os config.test_target_is_host_executable = ( - config.target_os == config.host_os and host_arch_compatible + config.orc_test_target_os == config.target_os and host_arch_compatible ) # Assume that llvm-jitlink is in the config.llvm_tools_dir. @@ -31,7 +31,7 @@ config.compiler_rt_obj_root, "lib/orc/tests/tools/orc-rt-executor" ) lli = os.path.join(config.llvm_tools_dir, "lli") -if config.host_os == "Darwin": +if config.target_os == "Darwin": orc_rt_path = "%s/liborc_rt_osx.a" % config.compiler_rt_libdir else: orc_rt_path = "%s/liborc_rt%s.a" % (config.compiler_rt_libdir, config.target_suffix) @@ -53,7 +53,7 @@ def build_invocation(compile_flags): config.substitutions.append( ("%clang_cl ", build_invocation(["--driver-mode=cl"] + [config.target_cflags])) ) -if config.host_os == "Windows": +if config.target_os == "Windows": config.substitutions.append( ( "%llvm_jitlink", @@ -86,7 +86,7 @@ def build_invocation(compile_flags): # Exclude Inputs directories. config.excludes = ["Inputs"] -if config.host_os not in ["Darwin", "FreeBSD", "Linux", "Windows"]: +if config.target_os not in ["Darwin", "FreeBSD", "Linux", "Windows"]: config.unsupported = True # Ask llvm-config about assertion mode. 
diff --git a/compiler-rt/test/orc/lit.site.cfg.py.in b/compiler-rt/test/orc/lit.site.cfg.py.in index a33ef3d7d7207..d0625f6ace15c 100644 --- a/compiler-rt/test/orc/lit.site.cfg.py.in +++ b/compiler-rt/test/orc/lit.site.cfg.py.in @@ -5,7 +5,8 @@ config.name_suffix = "@ORC_TEST_CONFIG_SUFFIX@" config.orc_lit_source_dir = "@ORC_LIT_SOURCE_DIR@" config.target_cflags = "@ORC_TEST_TARGET_CFLAGS@" config.target_arch = "@ORC_TEST_TARGET_ARCH@" -config.target_os = "@ORC_TEST_TARGET_OS@" +# FIXME: Remove this variable, the target OS is available in config.target_os. +config.orc_test_target_os = "@ORC_TEST_TARGET_OS@" config.built_with_llvm = ("@COMPILER_RT_STANDALONE_BUILD@" != "TRUE") config.libunwind_shared = "@LIBUNWIND_ENABLE_SHARED@" config.libunwind_install_dir = "@LLVM_BINARY_DIR@/@LIBUNWIND_INSTALL_LIBRARY_DIR@" diff --git a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py index c9a716abeccd8..df7f11e2b286b 100644 --- a/compiler-rt/test/profile/lit.cfg.py +++ b/compiler-rt/test/profile/lit.cfg.py @@ -30,7 +30,7 @@ def get_required_attr(config, attr_name): target_is_msvc = bool(re.match(r".*-windows-msvc$", config.target_triple)) -if config.host_os in ["Linux"]: +if config.target_os in ["Linux"]: extra_link_flags = ["-ldl"] elif target_is_msvc: # InstrProf is incompatible with incremental linking. 
Disable it as a @@ -154,7 +154,7 @@ def exclude_unsupported_files_for_aix(dirname): ) ) -if config.host_os not in [ +if config.target_os not in [ "Windows", "Darwin", "FreeBSD", @@ -167,10 +167,10 @@ def exclude_unsupported_files_for_aix(dirname): config.unsupported = True config.substitutions.append( - ("%shared_lib_flag", "-dynamiclib" if (config.host_os == "Darwin") else "-shared") + ("%shared_lib_flag", "-dynamiclib" if (config.target_os == "Darwin") else "-shared") ) -if config.host_os in ["AIX"]: +if config.target_os in ["AIX"]: config.available_features.add("system-aix") exclude_unsupported_files_for_aix(config.test_source_root) exclude_unsupported_files_for_aix(config.test_source_root + "/Posix") @@ -184,5 +184,5 @@ def exclude_unsupported_files_for_aix(dirname): if config.have_curl: config.available_features.add("curl") -if config.host_os in ("AIX", "Darwin", "Linux"): +if config.target_os in ("AIX", "Darwin", "Linux"): config.available_features.add("continuous-mode") diff --git a/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in b/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in index 59e1e10360b52..41fcb32e5009b 100644 --- a/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in +++ b/compiler-rt/test/rtsan/Unit/lit.site.cfg.py.in @@ -15,7 +15,7 @@ config.test_source_root = config.test_exec_root if not config.parallelism_group: config.parallelism_group = 'shadow-memory' -if config.host_os == 'Darwin': +if config.target_os == 'Darwin': # On Darwin, we default to ignore_noninstrumented_modules=1, which also # suppresses some races the tests are supposed to find. See rtsan/lit.cfg.py. 
if 'RTSAN_OPTIONS' in config.environment: diff --git a/compiler-rt/test/rtsan/lit.cfg.py b/compiler-rt/test/rtsan/lit.cfg.py index 7c75515a7608d..6d880c10ecd45 100644 --- a/compiler-rt/test/rtsan/lit.cfg.py +++ b/compiler-rt/test/rtsan/lit.cfg.py @@ -6,7 +6,7 @@ default_rtsan_opts = "atexit_sleep_ms=0" -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. default_rtsan_opts += ":abort_on_error=0" @@ -36,7 +36,7 @@ def build_invocation(compile_flags): llvm_rtsan = os.path.join(config.llvm_tools_dir, "llvm-rtsan") # Setup substitutions. -if config.host_os == "Linux": +if config.target_os == "Linux": libdl_flag = "-ldl" else: libdl_flag = "" @@ -52,7 +52,7 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD", "OpenBSD"]: +if config.target_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD", "OpenBSD"]: config.unsupported = True elif "64" not in config.host_arch: if "arm" in config.host_arch: @@ -61,5 +61,5 @@ def build_invocation(compile_flags): else: config.unsupported = True -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_nomprotect_prefix)) diff --git a/compiler-rt/test/safestack/lit.cfg.py b/compiler-rt/test/safestack/lit.cfg.py index 4ab9c1ce70bac..3f5565caa65c6 100644 --- a/compiler-rt/test/safestack/lit.cfg.py +++ b/compiler-rt/test/safestack/lit.cfg.py @@ -33,5 +33,5 @@ ) ) -if config.host_os not in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: +if config.target_os not in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/lit.common.cfg.py b/compiler-rt/test/sanitizer_common/lit.common.cfg.py index 88d3ea9bc5ad2..5614229d9a126 100644 --- 
a/compiler-rt/test/sanitizer_common/lit.common.cfg.py +++ b/compiler-rt/test/sanitizer_common/lit.common.cfg.py @@ -40,7 +40,7 @@ config.available_features.add(config.tool_name) if ( - config.host_os == "Linux" + config.target_os == "Linux" and config.tool_name == "lsan" and config.target_arch == "i386" ): @@ -49,7 +49,7 @@ if config.arm_thumb: config.available_features.add("thumb") -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. default_tool_options += ["abort_on_error=0"] @@ -68,7 +68,7 @@ extra_link_flags = [] -if config.host_os in ["Linux"]: +if config.target_os in ["Linux"]: extra_link_flags += ["-ldl"] clang_cflags = config.debug_info_flags + tool_cflags + [config.target_cflags] @@ -92,13 +92,13 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp"] -if config.host_os not in ["Linux", "Darwin", "NetBSD", "FreeBSD", "SunOS"]: +if config.target_os not in ["Linux", "Darwin", "NetBSD", "FreeBSD", "SunOS"]: config.unsupported = True if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) if os.path.exists("/etc/services"): diff --git a/compiler-rt/test/scudo/lit.cfg.py b/compiler-rt/test/scudo/lit.cfg.py index 5d45bd99804c7..b09c996e9ccc5 100644 --- a/compiler-rt/test/scudo/lit.cfg.py +++ b/compiler-rt/test/scudo/lit.cfg.py @@ -70,5 +70,5 @@ def build_invocation(compile_flags): ) # Hardened Allocator tests are currently supported on Linux only. 
-if config.host_os not in ["Linux"]: +if config.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/shadowcallstack/lit.cfg.py b/compiler-rt/test/shadowcallstack/lit.cfg.py index 70a6b16174c4b..5b95deb1b0986 100644 --- a/compiler-rt/test/shadowcallstack/lit.cfg.py +++ b/compiler-rt/test/shadowcallstack/lit.cfg.py @@ -32,5 +32,5 @@ ) ) -if config.host_os not in ["Linux"] or config.target_arch not in ["aarch64", "riscv64"]: +if config.target_os not in ["Linux"] or config.target_arch not in ["aarch64", "riscv64"]: config.unsupported = True diff --git a/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in b/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in index a9c6261ba48d4..b90af4f2c0d3a 100644 --- a/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in +++ b/compiler-rt/test/tsan/Unit/lit.site.cfg.py.in @@ -15,7 +15,7 @@ config.test_source_root = config.test_exec_root if not config.parallelism_group: config.parallelism_group = 'shadow-memory' -if config.host_os == 'Darwin': +if config.target_os == 'Darwin': # On Darwin, we default to ignore_noninstrumented_modules=1, which also # suppresses some races the tests are supposed to find. See tsan/lit.cfg.py. 
if 'TSAN_OPTIONS' in config.environment: diff --git a/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py b/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py index a7653f4305952..27edf611a0522 100644 --- a/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py +++ b/compiler-rt/test/tsan/libdispatch/lit.local.cfg.py @@ -14,5 +14,5 @@ def getRoot(config): else: config.unsupported = True -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.environment["TSAN_OPTIONS"] += ":ignore_noninstrumented_modules=1" diff --git a/compiler-rt/test/tsan/lit.cfg.py b/compiler-rt/test/tsan/lit.cfg.py index a93333e2e593d..8803a7bda9aa5 100644 --- a/compiler-rt/test/tsan/lit.cfg.py +++ b/compiler-rt/test/tsan/lit.cfg.py @@ -23,7 +23,7 @@ def get_required_attr(config, attr_name): # Setup environment variables for running ThreadSanitizer. default_tsan_opts = "atexit_sleep_ms=0" -if config.host_os == "Darwin": +if config.target_os == "Darwin": # On Darwin, we default to `abort_on_error=1`, which would make tests run # much slower. Let's override this and run lit tests with 'abort_on_error=0'. default_tsan_opts += ":abort_on_error=0" @@ -61,7 +61,7 @@ def get_required_attr(config, attr_name): ) # Add additional flags if we're using instrumented libc++. # Instrumented libcxx currently not supported on Darwin. -if config.has_libcxx and config.host_os != "Darwin": +if config.has_libcxx and config.target_os != "Darwin": # FIXME: Dehardcode this path somehow. libcxx_path = os.path.join( config.compiler_rt_obj_root, @@ -86,7 +86,7 @@ def build_invocation(compile_flags): config.substitutions.append(("%clangxx_tsan ", build_invocation(clang_tsan_cxxflags))) # Define CHECK-%os to check for OS-dependent output. -config.substitutions.append(("CHECK-%os", ("CHECK-" + config.host_os))) +config.substitutions.append(("CHECK-%os", ("CHECK-" + config.target_os))) config.substitutions.append( ( @@ -101,7 +101,7 @@ def build_invocation(compile_flags): # Default test suffixes. 
config.suffixes = [".c", ".cpp", ".m", ".mm"] -if config.host_os not in ["FreeBSD", "Linux", "Darwin", "NetBSD"]: +if config.target_os not in ["FreeBSD", "Linux", "Darwin", "NetBSD"]: config.unsupported = True if config.android: @@ -110,5 +110,5 @@ def build_invocation(compile_flags): if not config.parallelism_group: config.parallelism_group = "shadow-memory" -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/tysan/lit.cfg.py b/compiler-rt/test/tysan/lit.cfg.py index f38e0211639da..26846017b1957 100644 --- a/compiler-rt/test/tysan/lit.cfg.py +++ b/compiler-rt/test/tysan/lit.cfg.py @@ -71,7 +71,7 @@ def push_dynamic_library_lookup_path(config, new_path): # Setup source root. config.test_source_root = os.path.dirname(__file__) -if config.host_os not in ["FreeBSD", "NetBSD"]: +if config.target_os not in ["FreeBSD", "NetBSD"]: libdl_flag = "-ldl" else: libdl_flag = "" @@ -127,10 +127,10 @@ def build_invocation(compile_flags): # Default test suffixes. config.suffixes = [".c", ".cpp"] -if config.host_os == "Darwin": +if config.target_os == "Darwin": config.suffixes.append(".mm") -if config.host_os == "Windows": +if config.target_os == "Windows": config.substitutions.append(("%fPIC", "")) config.substitutions.append(("%fPIE", "")) config.substitutions.append(("%pie", "")) @@ -140,7 +140,7 @@ def build_invocation(compile_flags): config.substitutions.append(("%pie", "-pie")) # Only run the tests on supported OSs. 
-if config.host_os not in [ +if config.target_os not in [ "Linux", "Darwin", ]: diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py index e69d15f5b141c..4342649532865 100644 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/Function/lit.local.cfg.py @@ -1,4 +1,4 @@ -if config.host_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD"]: +if config.target_os not in ["Darwin", "FreeBSD", "Linux", "NetBSD"]: config.unsupported = True # Work around "Cannot represent a difference across sections" if config.target_arch == "powerpc64": diff --git a/compiler-rt/test/ubsan/lit.common.cfg.py b/compiler-rt/test/ubsan/lit.common.cfg.py index 04d6f24de5a9f..25e527903788e 100644 --- a/compiler-rt/test/ubsan/lit.common.cfg.py +++ b/compiler-rt/test/ubsan/lit.common.cfg.py @@ -74,7 +74,7 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp", ".m"] # Check that the host supports UndefinedBehaviorSanitizer tests -if config.host_os not in [ +if config.target_os not in [ "Linux", "Darwin", "FreeBSD", @@ -90,5 +90,5 @@ def build_invocation(compile_flags): if ubsan_lit_test_mode in ["AddressSanitizer", "MemorySanitizer", "ThreadSanitizer"]: if not config.parallelism_group: config.parallelism_group = "shadow-memory" - if config.host_os == "NetBSD": + if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_noaslr_prefix)) diff --git a/compiler-rt/test/ubsan_minimal/lit.common.cfg.py b/compiler-rt/test/ubsan_minimal/lit.common.cfg.py index 714241a580f9d..bcc0e46fbef91 100644 --- a/compiler-rt/test/ubsan_minimal/lit.common.cfg.py +++ b/compiler-rt/test/ubsan_minimal/lit.common.cfg.py @@ -35,7 +35,7 @@ def build_invocation(compile_flags): config.suffixes = [".c", ".cpp"] # Check that the host supports UndefinedBehaviorSanitizerMinimal tests -if config.host_os not in [ +if 
config.target_os not in [ "Linux", "FreeBSD", "NetBSD", diff --git a/compiler-rt/test/xray/lit.cfg.py b/compiler-rt/test/xray/lit.cfg.py index f73ae3acd7715..e56ed85d1d822 100644 --- a/compiler-rt/test/xray/lit.cfg.py +++ b/compiler-rt/test/xray/lit.cfg.py @@ -14,7 +14,7 @@ # If libc++ was used to build XRAY libraries, libc++ is needed. Fix applied # to Linux only since -rpath may not be portable. This can be extended to # other platforms. -if config.libcxx_used == "1" and config.host_os == "Linux": +if config.libcxx_used == "1" and config.target_os == "Linux": clang_xray_cflags = clang_xray_cflags + ( ["-L%s -lc++ -Wl,-rpath=%s" % (config.llvm_shlib_dir, config.llvm_shlib_dir)] ) @@ -30,7 +30,7 @@ def build_invocation(compile_flags): llvm_xray = os.path.join(config.llvm_tools_dir, "llvm-xray") # Setup substitutions. -if config.host_os == "Linux": +if config.target_os == "Linux": libdl_flag = "-ldl" else: libdl_flag = "" @@ -56,7 +56,7 @@ def build_invocation(compile_flags): # Default test suffixes. 
config.suffixes = [".c", ".cpp"] -if config.host_os not in ["FreeBSD", "Linux", "NetBSD", "OpenBSD"]: +if config.target_os not in ["FreeBSD", "Linux", "NetBSD", "OpenBSD"]: config.unsupported = True elif "64" not in config.host_arch: if "arm" in config.host_arch: @@ -65,5 +65,5 @@ def build_invocation(compile_flags): else: config.unsupported = True -if config.host_os == "NetBSD": +if config.target_os == "NetBSD": config.substitutions.insert(0, ("%run", config.netbsd_nomprotect_prefix)) diff --git a/compiler-rt/unittests/lit.common.unit.cfg.py b/compiler-rt/unittests/lit.common.unit.cfg.py index 557a42893ec15..93f417c1d50ae 100644 --- a/compiler-rt/unittests/lit.common.unit.cfg.py +++ b/compiler-rt/unittests/lit.common.unit.cfg.py @@ -42,7 +42,7 @@ def get_lit_conf(name, default=None): if "TEMP" in os.environ: config.environment["TEMP"] = os.environ["TEMP"] -if config.host_os == "Darwin": +if config.target_os == "Darwin": # Only run up to 3 processes that require shadow memory simultaneously on # 64-bit Darwin. Using more scales badly and hogs the system due to # inefficient handling of large mmap'd regions (terabytes) by the kernel. 
diff --git a/compiler-rt/unittests/lit.common.unit.configured.in b/compiler-rt/unittests/lit.common.unit.configured.in index 3e42e83c9e70a..30ccf452ac71f 100644 --- a/compiler-rt/unittests/lit.common.unit.configured.in +++ b/compiler-rt/unittests/lit.common.unit.configured.in @@ -10,7 +10,7 @@ config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@ config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@") config.host_arch = "@HOST_ARCH@" -config.host_os = "@HOST_OS@" +config.target_os = "@HOST_OS@" config.llvm_lib_dir = "@LLVM_LIBRARY_DIR@" config.gwp_asan = @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@ config.emulator = "@COMPILER_RT_EMULATOR@" diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 83713213ce1fe..7ab9749f6266d 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -130,14 +130,14 @@ def delete_module_cache(path): config.environment["MallocNanoZone"] = "0" if "Address" in config.llvm_use_sanitizer: config.environment["ASAN_OPTIONS"] = "detect_stack_use_after_return=1" - if "Darwin" in config.host_os: + if "Darwin" in config.target_os: config.environment["DYLD_INSERT_LIBRARIES"] = find_sanitizer_runtime( "libclang_rt.asan_osx_dynamic.dylib" ) if "Thread" in config.llvm_use_sanitizer: config.environment["TSAN_OPTIONS"] = "halt_on_error=1" - if "Darwin" in config.host_os: + if "Darwin" in config.target_os: config.environment["DYLD_INSERT_LIBRARIES"] = find_sanitizer_runtime( "libclang_rt.tsan_osx_dynamic.dylib" ) diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 86d58889cc4ad..c4e4352fe7915 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -13,7 +13,7 @@ config.lldb_src_root = "@LLDB_SOURCE_DIR@" config.lldb_libs_dir = lit_config.substitute("@LLDB_LIBS_DIR@") config.lldb_framework_dir = lit_config.substitute("@LLDB_FRAMEWORK_DIR@") 
config.cmake_cxx_compiler = "@CMAKE_CXX_COMPILER@" -config.host_os = "@HOST_OS@" +config.target_os = "@HOST_OS@" config.host_triple = "@LLVM_HOST_TRIPLE@" config.shared_libs = @LLVM_ENABLE_SHARED_LIBS@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index bd6e37c848d8c..1076456a4aef0 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -63,7 +63,7 @@ def get_asan_rtlib(): if ( not "Address" in config.llvm_use_sanitizer - or not "Darwin" in config.host_os + or not "Darwin" in config.target_os or not "x86" in config.host_triple ): return "" diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in index caee6c1db92ee..ee76beb51cce6 100644 --- a/llvm/test/lit.site.cfg.py.in +++ b/llvm/test/lit.site.cfg.py.in @@ -26,7 +26,7 @@ config.enable_assertions = @ENABLE_ASSERTIONS@ config.targets_to_build = "@TARGETS_TO_BUILD@" config.native_target = "@LLVM_NATIVE_ARCH@" config.llvm_bindings = "@LLVM_BINDINGS@".split(' ') -config.host_os = "@HOST_OS@" +config.target_os = "@HOST_OS@" config.host_cc = "@HOST_CC@" config.host_cxx = "@HOST_CXX@" # Note: ldflags can contain double-quoted paths, so must use single quotes here. From afff28e4cb4b56dc5c77ecdb5aad9ec10e170999 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Thu, 17 Jul 2025 11:17:52 -0700 Subject: [PATCH 219/813] [CI][Github] Enable CIR CI build and test (#147430) This change modifies CI scripts to add a pseudo-project for CIR and detect when CIR-specific files are modified. It also enables building clang with CIR enabled whenever both the clang and mlir projects are being built. Building and testing CIR is only enabled on Linux at this time, as CIR doesn't properly support Windows or MacOS yet. 
--- .ci/compute_projects.py | 19 ++++++++++++++++++ .ci/compute_projects_test.py | 34 ++++++++++++++++++++++++++++++++- .ci/monolithic-linux.sh | 2 ++ .github/workflows/premerge.yaml | 2 +- 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py index c3cf714ce6c10..8e25fd61d6b32 100644 --- a/.ci/compute_projects.py +++ b/.ci/compute_projects.py @@ -19,6 +19,7 @@ PROJECT_DEPENDENCIES = { "llvm": set(), "clang": {"llvm"}, + "CIR": {"clang", "mlir"}, "bolt": {"clang", "lld", "llvm"}, "clang-tools-extra": {"clang", "llvm"}, "compiler-rt": {"clang", "lld"}, @@ -55,6 +56,7 @@ ".ci": { "llvm", "clang", + "CIR", "lld", "lldb", "bolt", @@ -128,6 +130,7 @@ "lldb": "check-lldb", "llvm": "check-llvm", "clang": "check-clang", + "CIR": "check-clang-cir", "bolt": "check-bolt", "lld": "check-lld", "flang": "check-flang", @@ -247,6 +250,14 @@ def _get_modified_projects(modified_files: list[str]) -> Set[str]: # capacity. if len(path_parts) > 3 and path_parts[:3] == ("llvm", "utils", "gn"): continue + # If the file is in the clang/lib/CIR directory, add the CIR project. + if len(path_parts) > 3 and ( + path_parts[:3] == ("clang", "lib", "CIR") + or path_parts[:3] == ("clang", "test", "CIR") + or path_parts[:4] == ("clang", "include", "clang", "CIR") + ): + modified_projects.add("CIR") + # Fall through to add clang. modified_projects.add(pathlib.Path(modified_file).parts[0]) return modified_projects @@ -267,6 +278,13 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]: runtimes_check_targets_needs_reconfig = _compute_project_check_targets( runtimes_to_test_needs_reconfig ) + + # CIR is used as a pseudo-project in this script. It is built as part of the + # clang build, but it requires an explicit option to enable. We set that + # option here, and remove it from the projects_to_build list. 
+ enable_cir = "ON" if "CIR" in projects_to_build else "OFF" + projects_to_build.discard("CIR") + # We use a semicolon to separate the projects/runtimes as they get passed # to the CMake invocation and thus we need to use the CMake list separator # (;). We use spaces to separate the check targets as they end up getting @@ -279,6 +297,7 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]: "runtimes_check_targets_needs_reconfig": " ".join( sorted(runtimes_check_targets_needs_reconfig) ), + "enable_cir": enable_cir, } diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py index 6299931e1ec34..732514c96f5a6 100644 --- a/.ci/compute_projects_test.py +++ b/.ci/compute_projects_test.py @@ -104,6 +104,10 @@ def test_clang(self): env_variables["runtimes_check_targets_needs_reconfig"], "check-cxx check-cxxabi check-unwind", ) + self.assertEqual( + env_variables["enable_cir"], + "OFF", + ) def test_clang_windows(self): env_variables = compute_projects.get_env_variables( @@ -126,6 +130,32 @@ def test_clang_windows(self): env_variables["runtimes_check_targets_needs_reconfig"], "check-cxx check-cxxabi check-unwind", ) + self.assertEqual(env_variables["enable_cir"], "OFF") + + def test_cir(self): + env_variables = compute_projects.get_env_variables( + ["clang/lib/CIR/CMakeLists.txt"], "Linux" + ) + self.assertEqual( + env_variables["projects_to_build"], + "clang;clang-tools-extra;lld;llvm;mlir", + ) + self.assertEqual( + env_variables["project_check_targets"], + "check-clang check-clang-cir check-clang-tools", + ) + self.assertEqual( + env_variables["runtimes_to_build"], "compiler-rt;libcxx;libcxxabi;libunwind" + ) + self.assertEqual( + env_variables["runtimes_check_targets"], + "check-compiler-rt", + ) + self.assertEqual( + env_variables["runtimes_check_targets_needs_reconfig"], + "check-cxx check-cxxabi check-unwind", + ) + self.assertEqual(env_variables["enable_cir"], "ON") def test_bolt(self): env_variables = 
compute_projects.get_env_variables( @@ -158,6 +188,7 @@ def test_mlir(self): self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") + self.assertEqual(env_variables["enable_cir"], "OFF") def test_flang(self): env_variables = compute_projects.get_env_variables( @@ -168,6 +199,7 @@ def test_flang(self): self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") + self.assertEqual(env_variables["enable_cir"], "OFF") def test_invalid_subproject(self): env_variables = compute_projects.get_env_variables( @@ -237,7 +269,7 @@ def test_ci(self): ) self.assertEqual( env_variables["project_check_targets"], - "check-bolt check-clang check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly", + "check-bolt check-clang check-clang-cir check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly", ) self.assertEqual( env_variables["runtimes_to_build"], diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index d9f51ba9fd946..6db24d894eb73 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -48,6 +48,7 @@ targets="${2}" runtimes="${3}" runtime_targets="${4}" runtime_targets_needs_reconfig="${5}" +enable_cir="${6}" lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests" @@ -67,6 +68,7 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -G Ninja \ -D CMAKE_PREFIX_PATH="${HOME}/.local" \ -D CMAKE_BUILD_TYPE=Release \ + -D CLANG_ENABLE_CIR=${enable_cir} \ -D LLVM_ENABLE_ASSERTIONS=ON \ -D LLVM_BUILD_EXAMPLES=ON \ -D COMPILER_RT_BUILD_LIBFUZZER=OFF \ diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 7b5ecd62080f3..73943bc86eadd 
100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -62,7 +62,7 @@ jobs: export CC=/opt/llvm/bin/clang export CXX=/opt/llvm/bin/clang++ - ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}" + ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}" "${enable_cir}" - name: Upload Artifacts if: '!cancelled()' uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 From 413e71b700562f914b369c7eab6ad41c18910bdf Mon Sep 17 00:00:00 2001 From: Eugene Epshteyn Date: Thu, 17 Jul 2025 14:18:21 -0400 Subject: [PATCH 220/813] [flang] Main program symbol no longer conflicts with the other symbols (#149169) The following code is now accepted: ``` module m end program m use m end ``` The PROGRAM name doesn't really have an effect on the compilation result, so it shouldn't result in symbol name conflicts. This change makes the main program symbol name all uppercase in the cooked character stream. This makes it distinct from all other symbol names that are all lowercase in cooked character stream. Modified the tests that were checking for lower case main program name. 
--- flang/docs/Extensions.md | 18 ++++- flang/lib/Semantics/check-omp-structure.cpp | 3 +- flang/lib/Semantics/resolve-labels.cpp | 25 ++++-- flang/test/Driver/cuda-option.f90 | 2 +- flang/test/Driver/unparse-use-analyzed.f95 | 4 +- flang/test/Driver/unparse-with-modules.f90 | 2 +- .../test/Integration/debug-common-block-1.f90 | 2 +- flang/test/Integration/debug-local-var-2.f90 | 2 +- flang/test/Lower/CUDA/cuda-derived.cuf | 2 +- flang/test/Lower/CUDA/cuda-return01.cuf | 2 +- flang/test/Lower/CUDA/cuda-return02.cuf | 2 +- .../Lower/HLFIR/intrinsic-subroutines.f90 | 2 +- ...ointer-component-structure-constructor.f90 | 2 +- flang/test/Lower/OpenACC/acc-atomic-read.f90 | 2 +- flang/test/Lower/OpenACC/acc-atomic-write.f90 | 2 +- flang/test/Lower/OpenACC/acc-routine04.f90 | 2 +- flang/test/Lower/OpenMP/atomic-read.f90 | 2 +- flang/test/Lower/OpenMP/atomic-write.f90 | 2 +- .../Lower/OpenMP/common-atomic-lowering.f90 | 2 +- flang/test/Lower/OpenMP/cray-pointers02.f90 | 2 +- .../Lower/OpenMP/default-clause-byref.f90 | 2 +- flang/test/Lower/OpenMP/default-clause.f90 | 2 +- .../parallel-reduction-allocatable-array.f90 | 2 +- .../OpenMP/parallel-reduction-array-lb.f90 | 2 +- .../OpenMP/parallel-reduction-array2.f90 | 2 +- .../Lower/OpenMP/parallel-reduction-byref.f90 | 2 +- .../parallel-reduction-pointer-array.f90 | 2 +- .../OpenMP/parallel-reduction-rename.f90 | 2 +- .../test/Lower/OpenMP/parallel-reduction.f90 | 2 +- flang/test/Lower/OpenMP/sections.f90 | 2 +- .../threadprivate-host-association-2.f90 | 2 +- .../threadprivate-host-association-3.f90 | 2 +- .../OpenMP/threadprivate-host-association.f90 | 2 +- flang/test/Lower/OpenMP/wsloop-chunks.f90 | 2 +- flang/test/Lower/OpenMP/wsloop-collapse.f90 | 2 +- ...oop-reduction-allocatable-array-minmax.f90 | 2 +- .../OpenMP/wsloop-reduction-allocatable.f90 | 2 +- .../OpenMP/wsloop-reduction-array-lb.f90 | 2 +- .../OpenMP/wsloop-reduction-array-lb2.f90 | 2 +- .../Lower/OpenMP/wsloop-reduction-array.f90 | 2 +- 
.../Lower/OpenMP/wsloop-reduction-array2.f90 | 2 +- .../Lower/OpenMP/wsloop-reduction-min2.f90 | 2 +- .../wsloop-reduction-multiple-clauses.f90 | 2 +- .../Lower/OpenMP/wsloop-reduction-pointer.f90 | 2 +- flang/test/Lower/array-character.f90 | 2 +- flang/test/Lower/array-expression-slice-1.f90 | 2 +- flang/test/Lower/basic-program.f90 | 6 +- flang/test/Lower/big-integer-parameter.f90 | 2 +- .../test/Lower/derived-type-finalization.f90 | 2 +- flang/test/Lower/location.f90 | 2 +- flang/test/Lower/nested-where.f90 | 2 +- flang/test/Lower/polymorphic.f90 | 2 +- flang/test/Lower/pre-fir-tree02.f90 | 2 +- flang/test/Lower/pre-fir-tree03.f90 | 2 +- flang/test/Lower/pre-fir-tree06.f90 | 4 +- .../test/Lower/program-units-fir-mangling.f90 | 2 +- flang/test/Lower/return-statement.f90 | 2 +- flang/test/Lower/volatile-openmp1.f90 | 2 +- flang/test/Lower/volatile-string.f90 | 2 +- flang/test/Lower/volatile3.f90 | 2 +- flang/test/Parser/acc-unparse.f90 | 2 +- .../test/Semantics/OpenACC/acc-symbols01.f90 | 26 +++---- .../OpenMP/critical_within_default.f90 | 2 +- .../OpenMP/declare-mapper-symbols.f90 | 2 +- .../OpenMP/declare-reduction-mangled.f90 | 2 +- .../OpenMP/declare-reduction-operators.f90 | 2 +- .../OpenMP/declare-reduction-renamedop.f90 | 2 +- .../Semantics/OpenMP/declare-reduction.f90 | 2 +- .../Semantics/OpenMP/declare-target03.f90 | 6 +- flang/test/Semantics/OpenMP/do-schedule03.f90 | 32 ++++---- .../Semantics/OpenMP/do01-positivecase.f90 | 10 +-- .../Semantics/OpenMP/do04-positivecase.f90 | 18 ++--- .../Semantics/OpenMP/do05-positivecase.f90 | 36 ++++----- .../Semantics/OpenMP/do06-positivecases.f90 | 14 ++-- flang/test/Semantics/OpenMP/do11.f90 | 20 ++--- flang/test/Semantics/OpenMP/do12.f90 | 76 +++++++++---------- flang/test/Semantics/OpenMP/do14.f90 | 66 ++++++++-------- flang/test/Semantics/OpenMP/do17.f90 | 42 +++++----- .../Semantics/OpenMP/map-clause-symbols.f90 | 2 +- flang/test/Semantics/OpenMP/reduction08.f90 | 52 ++++++------- 
flang/test/Semantics/OpenMP/reduction09.f90 | 42 +++++----- flang/test/Semantics/OpenMP/reduction11.f90 | 2 +- flang/test/Semantics/OpenMP/scan2.f90 | 2 +- flang/test/Semantics/OpenMP/symbol01.f90 | 54 ++++++------- flang/test/Semantics/OpenMP/symbol05.f90 | 8 +- flang/test/Semantics/OpenMP/symbol07.f90 | 6 +- flang/test/Semantics/OpenMP/symbol09.f90 | 6 +- .../test/Semantics/OpenMP/threadprivate03.f90 | 6 +- flang/test/Semantics/getsymbols03-a.f90 | 2 +- flang/test/Semantics/long-name.f90 | 2 +- flang/test/Semantics/modproc01.f90 | 2 +- flang/test/Semantics/multi-programs04.f90 | 2 +- flang/test/Semantics/pointer01.f90 | 1 - flang/test/Semantics/procinterface01.f90 | 26 +++---- flang/test/Semantics/resolve05.f90 | 1 - flang/test/Semantics/resolve125.f90 | 4 +- flang/test/Semantics/symbol03.f90 | 20 ++--- flang/test/Semantics/symbol06.f90 | 72 +++++++++--------- flang/test/Semantics/symbol07.f90 | 48 ++++++------ flang/test/Semantics/symbol08.f90 | 16 ++-- flang/test/Semantics/symbol15.f90 | 12 +-- flang/test/Semantics/symbol16.f90 | 20 ++--- flang/test/Semantics/symbol17.f90 | 52 ++++++------- flang/test/Semantics/symbol18.f90 | 22 +++--- flang/test/Semantics/symbol20.f90 | 14 ++-- flang/test/Semantics/symbol25.f90 | 14 ++-- flang/test/Semantics/symbol26.f90 | 10 +-- .../Transforms/DoConcurrent/basic_host.f90 | 2 +- 108 files changed, 537 insertions(+), 513 deletions(-) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 72d12cd92600d..c167a55bc486d 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -1,9 +1,9 @@ - # Fortran Extensions supported by Flang @@ -170,6 +170,18 @@ end In the case of `DEFERRED` bindings in an `ABSTRACT` derived type, however, overrides are necessary, so they are permitted for inaccessible bindings with an optional warning. 
+* Main program name is allowed to be the same as the other symbols used + in the main program, for example: +``` +module m +end +program m +use m +end +``` + Note that internally the main program symbol name is all uppercase, unlike + the names of all other symbols, which are usually all lowercase. This + may make a difference in testing/debugging. ## Extensions, deletions, and legacy features supported by default diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 2425265e196c6..e4a94efcc6b55 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -1156,8 +1156,7 @@ void OmpStructureChecker::CheckThreadprivateOrDeclareTargetVar( (sym->has() || sym->has())) { context_.Say(name->source, - "The module name or main program name cannot be in a " - "%s " + "The module name cannot be in a %s " "directive"_err_en_US, ContextDirectiveAsFortran()); } else if (!IsSaved(*name->symbol) && diff --git a/flang/lib/Semantics/resolve-labels.cpp b/flang/lib/Semantics/resolve-labels.cpp index b0cbc4b56e889..27e259fab3873 100644 --- a/flang/lib/Semantics/resolve-labels.cpp +++ b/flang/lib/Semantics/resolve-labels.cpp @@ -489,15 +489,30 @@ class ParseTreeAnalyzer { // C1401 void Post(const parser::MainProgram &mainProgram) { + // Uppercase the name of the main program, so that its symbol name + // would be unique from similarly named non-main-program symbols. 
+ auto upperCaseCharBlock = [](const parser::CharBlock &cb) { + char *ch{const_cast(cb.begin())}; + char *endCh{ch + cb.size()}; + while (ch != endCh) { + *ch++ = parser::ToUpperCaseLetter(*ch); + } + }; + const parser::CharBlock *progName{nullptr}; + if (const auto &program{ + std::get>>( + mainProgram.t)}) { + progName = &program->statement.v.source; + upperCaseCharBlock(*progName); + } if (const parser::CharBlock * endName{GetStmtName(std::get>( mainProgram.t))}) { - if (const auto &program{ - std::get>>( - mainProgram.t)}) { - if (*endName != program->statement.v.source) { + upperCaseCharBlock(*endName); + if (progName) { + if (*endName != *progName) { context_.Say(*endName, "END PROGRAM name mismatch"_err_en_US) - .Attach(program->statement.v.source, "should be"_en_US); + .Attach(*progName, "should be"_en_US); } } else { context_.Say(*endName, diff --git a/flang/test/Driver/cuda-option.f90 b/flang/test/Driver/cuda-option.f90 index 0740ed509a077..f55e88dab20ce 100644 --- a/flang/test/Driver/cuda-option.f90 +++ b/flang/test/Driver/cuda-option.f90 @@ -8,7 +8,7 @@ program main integer, device :: dvar end program -! CHECK-LABEL: PROGRAM main +! CHECK-LABEL: PROGRAM MAIN ! CHECK: INTEGER :: var = 1 ! CHECK: INTEGER, DEVICE :: dvar diff --git a/flang/test/Driver/unparse-use-analyzed.f95 b/flang/test/Driver/unparse-use-analyzed.f95 index eb6046aebba54..4bcd72c9a5f50 100644 --- a/flang/test/Driver/unparse-use-analyzed.f95 +++ b/flang/test/Driver/unparse-use-analyzed.f95 @@ -6,12 +6,12 @@ ! RUN: %flang_fc1 -fdebug-unparse %s | FileCheck %s --check-prefix=DEFAULT ! RUN: %flang_fc1 -fdebug-unparse -fno-analyzed-objects-for-unparse %s | FileCheck %s --check-prefix=DISABLED -! DEFAULT: PROGRAM test +! DEFAULT: PROGRAM TEST ! DEFAULT-NEXT: REAL, PARAMETER :: val = 3.43e2_4 ! DEFAULT-NEXT: PRINT *, 3.47e2_4 ! DEFAULT-NEXT: END PROGRAM -! DISABLED: PROGRAM test +! DISABLED: PROGRAM TEST ! DISABLED-NEXT: REAL, PARAMETER :: val = 343.0 ! DISABLED-NEXT: PRINT *, val+4 ! 
DISABLED-NEXT: END PROGRAM diff --git a/flang/test/Driver/unparse-with-modules.f90 b/flang/test/Driver/unparse-with-modules.f90 index 53997f7804efa..f6444afbe47c1 100644 --- a/flang/test/Driver/unparse-with-modules.f90 +++ b/flang/test/Driver/unparse-with-modules.f90 @@ -25,7 +25,7 @@ program test !CHECK: implicit none !CHECK: real(kind=real32) x !CHECK: end module -!CHECK: program test +!CHECK: program TEST !CHECK: use :: m1 !CHECK: use :: basictestmoduletwo !CHECK: implicit none diff --git a/flang/test/Integration/debug-common-block-1.f90 b/flang/test/Integration/debug-common-block-1.f90 index 18217637be0fa..77f47daea4a91 100644 --- a/flang/test/Integration/debug-common-block-1.f90 +++ b/flang/test/Integration/debug-common-block-1.f90 @@ -89,7 +89,7 @@ program test ! CHECK-DAG: ![[CBF3]] = !DICommonBlock(scope: ![[F3]], declaration: null, name: "__BLNK__"{{.*}}) ! CHECK-DAG: ![[CBAF3]] = !DICommonBlock(scope: ![[F3]], declaration: null, name: "a"{{.*}}) -! CHECK-DAG: ![[MAIN:[0-9]+]] = {{.*}}!DISubprogram(name: "test"{{.*}}) +! CHECK-DAG: ![[MAIN:[0-9]+]] = {{.*}}!DISubprogram(name: "TEST"{{.*}}) ! CHECK-DAG: ![[CBM]] = !DICommonBlock(scope: ![[MAIN]], declaration: null, name: "__BLNK__"{{.*}}) ! CHECK-DAG: ![[CBAM]] = !DICommonBlock(scope: ![[MAIN]], declaration: null, name: "a"{{.*}}) diff --git a/flang/test/Integration/debug-local-var-2.f90 b/flang/test/Integration/debug-local-var-2.f90 index b97be141cc8d0..0ddac633a5b1e 100644 --- a/flang/test/Integration/debug-local-var-2.f90 +++ b/flang/test/Integration/debug-local-var-2.f90 @@ -37,7 +37,7 @@ ! BOTH-LABEL: } program mn -! BOTH-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "mn", {{.*}}) +! BOTH-DAG: ![[MAIN:.*]] = distinct !DISubprogram(name: "MN", {{.*}}) ! BOTH-DAG: ![[TYI32:.*]] = !DIBasicType(name: "integer", size: 32, encoding: DW_ATE_signed) ! 
BOTH-DAG: ![[TYI64:.*]] = !DIBasicType(name: "integer", size: 64, encoding: DW_ATE_signed) diff --git a/flang/test/Lower/CUDA/cuda-derived.cuf b/flang/test/Lower/CUDA/cuda-derived.cuf index 96250d88d81c4..d419ee074f7a0 100644 --- a/flang/test/Lower/CUDA/cuda-derived.cuf +++ b/flang/test/Lower/CUDA/cuda-derived.cuf @@ -25,6 +25,6 @@ program main type(t2) :: b end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} ! CHECK: %{{.*}} = cuf.alloc !fir.type<_QMm1Tty_device{x:!fir.box>>}> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFEa"} ! CHECK: %{{.*}} = cuf.alloc !fir.type<_QMm1Tt2{b:!fir.type<_QMm1Tt1{a:!fir.box>>}>}> {bindc_name = "b", data_attr = #cuf.cuda, uniq_name = "_QFEb"} diff --git a/flang/test/Lower/CUDA/cuda-return01.cuf b/flang/test/Lower/CUDA/cuda-return01.cuf index 47e69a903efd3..ed7c640a71082 100644 --- a/flang/test/Lower/CUDA/cuda-return01.cuf +++ b/flang/test/Lower/CUDA/cuda-return01.cuf @@ -28,6 +28,6 @@ program main return end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} ! CHECK: cuf.alloc !fir.box>> {bindc_name = "a", data_attr = #cuf.cuda, uniq_name = "_QFEa"} -> !fir.ref>>> ! CHECK-NOT: cuf.free diff --git a/flang/test/Lower/CUDA/cuda-return02.cuf b/flang/test/Lower/CUDA/cuda-return02.cuf index e450d7e796f22..e54818444e49c 100644 --- a/flang/test/Lower/CUDA/cuda-return02.cuf +++ b/flang/test/Lower/CUDA/cuda-return02.cuf @@ -13,7 +13,7 @@ program test return end -! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} { ! CHECK: %[[DECL:.*]]:2 = hlfir.declare ! CHECK: cf.cond_br %{{.*}}, ^bb1, ^bb2 ! 
CHECK-NEXT: ^bb1: diff --git a/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 b/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 index 07c4f012781d4..cbc56ca1e395b 100644 --- a/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 +++ b/flang/test/Lower/HLFIR/intrinsic-subroutines.f90 @@ -24,7 +24,7 @@ program main call mvbits(from, 2, 2, to, 0) if (any(to /= 5)) STOP 1 end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.array<3xi32> {bindc_name = "from", uniq_name = "_QFEfrom"} ! CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_0]] : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 b/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 index 7b64634d10d4b..a097b1522307e 100644 --- a/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 +++ b/flang/test/Lower/HLFIR/procedure-pointer-component-structure-constructor.f90 @@ -35,7 +35,7 @@ FUNCTION BAR() RESULT(res) END END -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = fir.alloca !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref) -> i32>}> ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.type<_QMmTdt{pp1:!fir.boxproc<(!fir.ref) -> i32>}> ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.boxproc<(!fir.ref) -> i32> {bindc_name = "pp2", uniq_name = "_QFEpp2"} diff --git a/flang/test/Lower/OpenACC/acc-atomic-read.f90 b/flang/test/Lower/OpenACC/acc-atomic-read.f90 index 639a98051e3a2..76751a0fa63a8 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-read.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-read.f90 @@ -8,7 +8,7 @@ program acc_atomic_test g = h end program acc_atomic_test -! 
CHECK: func @_QQmain() attributes {fir.bindc_name = "acc_atomic_test"} { +! CHECK: func @_QQmain() attributes {fir.bindc_name = "ACC_ATOMIC_TEST"} { ! CHECK: %[[VAR_G:.*]] = fir.alloca f32 {bindc_name = "g", uniq_name = "_QFEg"} ! CHECK: %[[G_DECL:.*]]:2 = hlfir.declare %[[VAR_G]] {uniq_name = "_QFEg"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAR_H:.*]] = fir.alloca f32 {bindc_name = "h", uniq_name = "_QFEh"} diff --git a/flang/test/Lower/OpenACC/acc-atomic-write.f90 b/flang/test/Lower/OpenACC/acc-atomic-write.f90 index 3c55394021abf..e0116e3281820 100644 --- a/flang/test/Lower/OpenACC/acc-atomic-write.f90 +++ b/flang/test/Lower/OpenACC/acc-atomic-write.f90 @@ -2,7 +2,7 @@ ! This test checks the lowering of atomic write -!CHECK: func @_QQmain() attributes {fir.bindc_name = "acc_atomic_write_test"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "ACC_ATOMIC_WRITE_TEST"} { !CHECK: %[[VAR_X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[VAR_X]] {uniq_name = "_QFEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[VAR_Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"} diff --git a/flang/test/Lower/OpenACC/acc-routine04.f90 b/flang/test/Lower/OpenACC/acc-routine04.f90 index f603376163901..655e2762b9694 100644 --- a/flang/test/Lower/OpenACC/acc-routine04.f90 +++ b/flang/test/Lower/OpenACC/acc-routine04.f90 @@ -30,5 +30,5 @@ subroutine sub2() ! CHECK: acc.routine @acc_routine_1 func(@_QFPsub2) seq ! CHECK: acc.routine @acc_routine_0 func(@_QMdummy_modPsub1) seq ! CHECK: func.func @_QMdummy_modPsub1(%arg0: !fir.ref {fir.bindc_name = "i"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} -! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test_acc_routine"} +! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "TEST_ACC_ROUTINE"} ! 
CHECK: func.func private @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>, fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage} diff --git a/flang/test/Lower/OpenMP/atomic-read.f90 b/flang/test/Lower/OpenMP/atomic-read.f90 index 68dcaac90eef5..30313e240efa3 100644 --- a/flang/test/Lower/OpenMP/atomic-read.f90 +++ b/flang/test/Lower/OpenMP/atomic-read.f90 @@ -4,7 +4,7 @@ ! This test checks the lowering of atomic read -!CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomic"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "OMPATOMIC"} { !CHECK: %[[A_REF:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_REF]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[B_REF:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"} diff --git a/flang/test/Lower/OpenMP/atomic-write.f90 b/flang/test/Lower/OpenMP/atomic-write.f90 index 6eded49b0b15d..742fd475c0f04 100644 --- a/flang/test/Lower/OpenMP/atomic-write.f90 +++ b/flang/test/Lower/OpenMP/atomic-write.f90 @@ -4,7 +4,7 @@ ! 
This test checks the lowering of atomic write -!CHECK: func @_QQmain() attributes {fir.bindc_name = "ompatomicwrite"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "OMPATOMICWRITE"} { !CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[Y_REF:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"} diff --git a/flang/test/Lower/OpenMP/common-atomic-lowering.f90 b/flang/test/Lower/OpenMP/common-atomic-lowering.f90 index a53cc101024c6..f729bbb00ac8e 100644 --- a/flang/test/Lower/OpenMP/common-atomic-lowering.f90 +++ b/flang/test/Lower/OpenMP/common-atomic-lowering.f90 @@ -1,6 +1,6 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "sample"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "SAMPLE"} { !CHECK: %[[val_0:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[val_1:.*]]:2 = hlfir.declare %[[val_0]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[val_2:.*]] = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFEb"} diff --git a/flang/test/Lower/OpenMP/cray-pointers02.f90 b/flang/test/Lower/OpenMP/cray-pointers02.f90 index 19e4cd09fe50a..79d838702e4b0 100644 --- a/flang/test/Lower/OpenMP/cray-pointers02.f90 +++ b/flang/test/Lower/OpenMP/cray-pointers02.f90 @@ -1,7 +1,7 @@ ! Test lowering of Cray pointee references. ! RUN: flang -fc1 -emit-hlfir -fopenmp %s -o - 2>&1 | FileCheck %s -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test_cray_pointers_02"} +! 
CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST_CRAY_POINTERS_02"} program test_cray_pointers_02 implicit none diff --git a/flang/test/Lower/OpenMP/default-clause-byref.f90 b/flang/test/Lower/OpenMP/default-clause-byref.f90 index c44c6bb966580..af51c4cc3e814 100644 --- a/flang/test/Lower/OpenMP/default-clause-byref.f90 +++ b/flang/test/Lower/OpenMP/default-clause-byref.f90 @@ -34,7 +34,7 @@ !CHECK: omp.yield(%[[PRIV_X]] : !fir.ref) !CHECK: } -!CHECK: func @_QQmain() attributes {fir.bindc_name = "default_clause_lowering"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "DEFAULT_CLAUSE_LOWERING"} { !CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFEw"} !CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFEw"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} diff --git a/flang/test/Lower/OpenMP/default-clause.f90 b/flang/test/Lower/OpenMP/default-clause.f90 index ee5f579f06b91..505fa4f0f5d63 100644 --- a/flang/test/Lower/OpenMP/default-clause.f90 +++ b/flang/test/Lower/OpenMP/default-clause.f90 @@ -8,7 +8,7 @@ ! RUN: | FileCheck %s -!CHECK: func @_QQmain() attributes {fir.bindc_name = "default_clause_lowering"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "DEFAULT_CLAUSE_LOWERING"} { !CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFEw"} !CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFEw"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"} diff --git a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 index 4bfd5d8d19261..0036670317a8e 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-allocatable-array.f90 @@ -80,7 +80,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! 
CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 index ec54294c7104f..ea0aa9ec3f53b 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array-lb.f90 @@ -68,7 +68,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.box> ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 2 : index diff --git a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 index 488ecc353af8e..eb0df2b3a17de 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-array2.f90 @@ -63,7 +63,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! 
CHECK: %[[VAL_2:.*]] = fir.shape %[[VAL_1]] : (index) -> !fir.shape<1> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 index 596276a99cafc..2caec0384a6ab 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 @@ -18,7 +18,7 @@ !CHECK: fir.store %[[CR]] to %[[C0]] : !fir.ref !CHECK: omp.yield(%[[C0]] : !fir.ref) !CHECK: } -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "mn"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MN"} { !CHECK: %[[RED_ACCUM_REF:[_a-z0-9]+]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} !CHECK: %[[RED_ACCUM_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[RED_ACCUM_REF]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[C0:[_a-z0-9]+]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 index f638688bc2cc1..3c1daa0eb983f 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-pointer-array.f90 @@ -82,7 +82,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref>>> diff --git a/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 b/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 index c06343e997bfd..2be154f4bbaf5 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction-rename.f90 @@ -25,7 +25,7 @@ end program main ! 
CHECK: omp.yield(%[[VAL_2]] : i32) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "n", uniq_name = "_QFEn"} ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/OpenMP/parallel-reduction.f90 b/flang/test/Lower/OpenMP/parallel-reduction.f90 index 612549fb32de5..15e8cc325916d 100644 --- a/flang/test/Lower/OpenMP/parallel-reduction.f90 +++ b/flang/test/Lower/OpenMP/parallel-reduction.f90 @@ -10,7 +10,7 @@ !CHECK: %[[CR:[_a-z0-9]+]] = arith.addi %[[C0]], %[[C1]] : i32 !CHECK: omp.yield(%[[CR]] : i32) !CHECK: } -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "mn"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MN"} { !CHECK: %[[RED_ACCUM_REF:[_a-z0-9]+]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} !CHECK: %[[RED_ACCUM_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[RED_ACCUM_REF]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[C0:[_a-z0-9]+]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/OpenMP/sections.f90 b/flang/test/Lower/OpenMP/sections.f90 index d11925cafdc12..3d5c0326fb6b9 100644 --- a/flang/test/Lower/OpenMP/sections.f90 +++ b/flang/test/Lower/OpenMP/sections.f90 @@ -5,7 +5,7 @@ ! RUN: %flang_fc1 -emit-hlfir %openmp_flags %s -o - | FileCheck %s ! 
RUN: bbc -hlfir -emit-hlfir %openmp_flags %s -o - | FileCheck %s -!CHECK: func @_QQmain() attributes {fir.bindc_name = "sample"} { +!CHECK: func @_QQmain() attributes {fir.bindc_name = "SAMPLE"} { !CHECK: %[[COUNT:.*]] = fir.address_of(@_QFEcount) : !fir.ref !CHECK: %[[COUNT_DECL:.*]]:2 = hlfir.declare %[[COUNT]] {uniq_name = "_QFEcount"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[ETA:.*]] = fir.alloca f32 {bindc_name = "eta", uniq_name = "_QFEeta"} diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 index 5e54cef8c29db..5c90ef7d84f89 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 @@ -3,7 +3,7 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { !CHECK: %[[A:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) : !fir.ref diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 index 21547b47cf381..0e61261e8853e 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association-3.f90 @@ -3,7 +3,7 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { !CHECK: %[[A:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, 
!fir.ref) !CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) : !fir.ref diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association.f90 index 7a27efa2f84aa..1887e8aa68fdc 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association.f90 @@ -3,7 +3,7 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +!CHECK: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { !CHECK: %[[A:.*]] = fir.address_of(@_QFEa) : !fir.ref !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TP_A:.*]] = omp.threadprivate %[[A_DECL]]#0 : !fir.ref -> !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-chunks.f90 b/flang/test/Lower/OpenMP/wsloop-chunks.f90 index 29c02a3b3c8d5..f3f11d8c4a6c2 100644 --- a/flang/test/Lower/OpenMP/wsloop-chunks.f90 +++ b/flang/test/Lower/OpenMP/wsloop-chunks.f90 @@ -7,7 +7,7 @@ program wsloop integer :: i integer :: chunk -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "wsloop"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "WSLOOP"} { ! CHECK: %[[CHUNK_REF:.*]] = fir.alloca i32 {bindc_name = "chunk", uniq_name = "_QFEchunk"} ! CHECK: %[[VAL_0:.*]]:2 = hlfir.declare %[[CHUNK_REF]] {uniq_name = "_QFEchunk"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-collapse.f90 b/flang/test/Lower/OpenMP/wsloop-collapse.f90 index a4d5cbdc03d3e..7ec40ab4b2f43 100644 --- a/flang/test/Lower/OpenMP/wsloop-collapse.f90 +++ b/flang/test/Lower/OpenMP/wsloop-collapse.f90 @@ -2,7 +2,7 @@ ! 
RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s -!CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "wsloop_collapse"} { +!CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "WSLOOP_COLLAPSE"} { program wsloop_collapse !CHECK: %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 index 58b68e5ec4cfd..e2f75bc8e4481 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable-array-minmax.f90 @@ -156,7 +156,7 @@ program reduce15 ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce15"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE15"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEarr) : !fir.ref>>> ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = {{.*}}, uniq_name = "_QFEarr"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 index 0a536eb34e7af..663851cba46c6 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-allocatable.f90 @@ -63,7 +63,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! 
CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box> {bindc_name = "r", uniq_name = "_QFEr"} diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 index 9f0dd16002baf..2233a74600948 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb.f90 @@ -31,5 +31,5 @@ program reduce ! CHECK: omp.yield(%[[ARG0]] : !fir.ref>>) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: omp.wsloop {{.*}} reduction(byref @add_reduction_byref_box_2xi32 %{{.*}} -> %{{.*}} : !fir.ref>>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 index 5ada623a0ed23..211bde19da8db 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array-lb2.f90 @@ -40,5 +40,5 @@ subroutine sub(a, lb, ub) ! CHECK: omp.yield(%[[ARG0]] : !fir.ref>>) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: omp.wsloop {{.*}} reduction(byref @add_reduction_byref_box_Uxi32 %{{.*}} -> %{{.*}} : !fir.ref>>) diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 index 21261da49710c..b7882bcbc0d13 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array.f90 @@ -65,7 +65,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! 
CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 index ab8dcf1f076c0..7d90335a13a87 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-array2.f90 @@ -65,7 +65,7 @@ program reduce ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref> diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 index 1e26f5a24d41e..d776bd7cfdd03 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 @@ -28,7 +28,7 @@ program reduce ! CHECK: omp.yield(%[[VAL_2]] : i32) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 index e0a3b469f40c1..5133db0347034 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-multiple-clauses.f90 @@ -93,7 +93,7 @@ program main ! 
CHECK: omp.yield(%[[VAL_2]] : f64) ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFEarray) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 3 : index diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 index 40b4302f24cd4..27b726376fbeb 100644 --- a/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 +++ b/flang/test/Lower/OpenMP/wsloop-reduction-pointer.f90 @@ -64,7 +64,7 @@ program reduce_pointer ! CHECK: omp.yield ! CHECK: } -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "reduce_pointer"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "REDUCE_POINTER"} { ! CHECK: %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"} ! CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.box> {bindc_name = "v", uniq_name = "_QFEv"} diff --git a/flang/test/Lower/array-character.f90 b/flang/test/Lower/array-character.f90 index 1bc73dae44235..e2899d967c80d 100644 --- a/flang/test/Lower/array-character.f90 +++ b/flang/test/Lower/array-character.f90 @@ -32,7 +32,7 @@ program p call charlit end program p -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_1:.*]] = arith.constant 3 : index ! 
CHECK: %[[VAL_2:.*]] = fir.alloca !fir.array<3x!fir.char<1,4>> {bindc_name = "c1", uniq_name = "_QFEc1"} diff --git a/flang/test/Lower/array-expression-slice-1.f90 b/flang/test/Lower/array-expression-slice-1.f90 index b597814bc0d9f..73943137cb18d 100644 --- a/flang/test/Lower/array-expression-slice-1.f90 +++ b/flang/test/Lower/array-expression-slice-1.f90 @@ -1,6 +1,6 @@ ! RUN: bbc -hlfir=false -fwrapv -o - --outline-intrinsics %s | FileCheck %s -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK-DAG: %[[VAL_0:.*]] = arith.constant 10 : index ! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 2 : index ! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : index diff --git a/flang/test/Lower/basic-program.f90 b/flang/test/Lower/basic-program.f90 index 5a0e4bdc7b4a1..7e5b40d9e2f0a 100644 --- a/flang/test/Lower/basic-program.f90 +++ b/flang/test/Lower/basic-program.f90 @@ -4,10 +4,10 @@ program basic end program -! CHECK: 1 Program basic +! CHECK: 1 Program BASIC ! CHECK: 1 EndProgramStmt: end program -! CHECK: End Program basic +! CHECK: End Program BASIC -! FIR-LABEL: func @_QQmain() attributes {fir.bindc_name = "basic"} { +! FIR-LABEL: func @_QQmain() attributes {fir.bindc_name = "BASIC"} { ! FIR: return ! FIR: } diff --git a/flang/test/Lower/big-integer-parameter.f90 b/flang/test/Lower/big-integer-parameter.f90 index a413b1224ebc2..ca90b8adfb318 100644 --- a/flang/test/Lower/big-integer-parameter.f90 +++ b/flang/test/Lower/big-integer-parameter.f90 @@ -13,7 +13,7 @@ program i128 print*,y end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "i128"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "I128"} { ! 
CHECK-COUNT-2: %{{.*}} = fir.call @_FortranAioOutputInteger128(%{{.*}}, %{{.*}}) {{.*}}: (!fir.ref, i128) -> i1 diff --git a/flang/test/Lower/derived-type-finalization.f90 b/flang/test/Lower/derived-type-finalization.f90 index 3ea58cd719f4a..71cef34899603 100644 --- a/flang/test/Lower/derived-type-finalization.f90 +++ b/flang/test/Lower/derived-type-finalization.f90 @@ -255,5 +255,5 @@ program p type(t1) :: t end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK-NOT: fir.call @_FortranADestroy diff --git a/flang/test/Lower/location.f90 b/flang/test/Lower/location.f90 index a6ece31bbebed..95bf2260fc107 100644 --- a/flang/test/Lower/location.f90 +++ b/flang/test/Lower/location.f90 @@ -5,7 +5,7 @@ program test end -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} { ! CHECK: fir.call @_FortranAioOutputAscii(%{{.*}}, %{{.*}}, %{{.*}}) fastmath : (!fir.ref, !fir.ref, i64) -> i1 loc(fused<#fir>["{{.*}}location1.inc":1:10, "{{.*}}location0.inc":1:1, "{{.*}}location.f90":4:1]) ! CHECK: return loc("{{.*}}location.f90":6:1) ! CHECK: } loc("{{.*}}location.f90":3:1) diff --git a/flang/test/Lower/nested-where.f90 b/flang/test/Lower/nested-where.f90 index ab457280b80ce..28aced2325813 100644 --- a/flang/test/Lower/nested-where.f90 +++ b/flang/test/Lower/nested-where.f90 @@ -1,6 +1,6 @@ ! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "nested_where"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "NESTED_WHERE"} { program nested_where ! 
CHECK: %[[VAL_0:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index b7be5f685d9e3..a84b495dd09d0 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -1146,7 +1146,7 @@ program test l = i < o%inner end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} { ! CHECK: %[[ADDR_O:.*]] = fir.address_of(@_QFEo) : !fir.ref}>>>> ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ADDR_O]] : (!fir.ref}>>>>) -> !fir.ref> ! CHECK: %{{.*}} = fir.call @_FortranAAllocatableAllocate(%[[BOX_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}} : (!fir.ref>, !fir.ref, i1, !fir.box, !fir.ref, i32) -> i32 diff --git a/flang/test/Lower/pre-fir-tree02.f90 b/flang/test/Lower/pre-fir-tree02.f90 index f4fa626ba6548..65c33e9b364fe 100644 --- a/flang/test/Lower/pre-fir-tree02.f90 +++ b/flang/test/Lower/pre-fir-tree02.f90 @@ -3,7 +3,7 @@ ! Test Pre-FIR Tree captures all the intended nodes from the parse-tree ! Coarray and OpenMP related nodes are tested in other files. -! CHECK: Program test_prog +! CHECK: Program TEST_PROG program test_prog ! Check specification part is not part of the tree. interface diff --git a/flang/test/Lower/pre-fir-tree03.f90 b/flang/test/Lower/pre-fir-tree03.f90 index 313dab4d6ec7c..1de66e3f8d016 100644 --- a/flang/test/Lower/pre-fir-tree03.f90 +++ b/flang/test/Lower/pre-fir-tree03.f90 @@ -2,7 +2,7 @@ ! Test Pre-FIR Tree captures OpenMP related constructs -! CHECK: Program test_omp +! CHECK: Program TEST_OMP program test_omp ! CHECK: PrintStmt print *, "sequential" diff --git a/flang/test/Lower/pre-fir-tree06.f90 b/flang/test/Lower/pre-fir-tree06.f90 index f84bcd8b58b2d..ed1e76cb375bd 100644 --- a/flang/test/Lower/pre-fir-tree06.f90 +++ b/flang/test/Lower/pre-fir-tree06.f90 @@ -25,13 +25,13 @@ subroutine sub2() end ! CHECK: End Module m2 -! 
CHECK: Program main +! CHECK: Program MAIN program main real :: y ! CHECK-NEXT: OpenMPDeclarativeConstruct !$omp threadprivate(y) end -! CHECK: End Program main +! CHECK: End Program MAIN ! CHECK: Subroutine sub1 subroutine sub1() diff --git a/flang/test/Lower/program-units-fir-mangling.f90 b/flang/test/Lower/program-units-fir-mangling.f90 index e0af6f065f34d..65940b4e1ff17 100644 --- a/flang/test/Lower/program-units-fir-mangling.f90 +++ b/flang/test/Lower/program-units-fir-mangling.f90 @@ -124,7 +124,7 @@ subroutine should_not_collide() ! CHECK: } end subroutine -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "test"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "TEST"} { program test ! CHECK: } contains diff --git a/flang/test/Lower/return-statement.f90 b/flang/test/Lower/return-statement.f90 index 6351a6859eb4f..8ab69e3146e2f 100644 --- a/flang/test/Lower/return-statement.f90 +++ b/flang/test/Lower/return-statement.f90 @@ -4,7 +4,7 @@ program basic return end program -! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "basic"} { +! CHECK-LABEL: func @_QQmain() attributes {fir.bindc_name = "BASIC"} { ! CHECK: return ! CHECK: } diff --git a/flang/test/Lower/volatile-openmp1.f90 b/flang/test/Lower/volatile-openmp1.f90 index 163db953b6b80..07d81a1aeb240 100644 --- a/flang/test/Lower/volatile-openmp1.f90 +++ b/flang/test/Lower/volatile-openmp1.f90 @@ -13,7 +13,7 @@ program main !$omp end parallel end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "main"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "MAIN"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 1 : i32 ! CHECK: %[[VAL_1:.*]] = arith.constant 1000 : i32 ! 
CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 diff --git a/flang/test/Lower/volatile-string.f90 b/flang/test/Lower/volatile-string.f90 index 88b21d7b245e9..f263db7abb5fc 100644 --- a/flang/test/Lower/volatile-string.f90 +++ b/flang/test/Lower/volatile-string.f90 @@ -21,7 +21,7 @@ subroutine assign_different_length(string) end subroutine end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 11 : i32 ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_2:.*]] = arith.constant true diff --git a/flang/test/Lower/volatile3.f90 b/flang/test/Lower/volatile3.f90 index 8825f8f3afbcb..a32f29d2bb9e7 100644 --- a/flang/test/Lower/volatile3.f90 +++ b/flang/test/Lower/volatile3.f90 @@ -70,7 +70,7 @@ subroutine sub_select_rank(arr) end program -! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "p"} { +! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "P"} { ! CHECK: %[[VAL_0:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index diff --git a/flang/test/Parser/acc-unparse.f90 b/flang/test/Parser/acc-unparse.f90 index 62e0d4487f3f7..12e6dec19f272 100644 --- a/flang/test/Parser/acc-unparse.f90 +++ b/flang/test/Parser/acc-unparse.f90 @@ -15,7 +15,7 @@ program bug47659 end do label1 end program -!CHECK-LABEL: PROGRAM bug47659 +!CHECK-LABEL: PROGRAM BUG47659 !CHECK: !$ACC PARALLEL LOOP diff --git a/flang/test/Semantics/OpenACC/acc-symbols01.f90 b/flang/test/Semantics/OpenACC/acc-symbols01.f90 index 375445bad13a5..51a7a3a23e8ce 100644 --- a/flang/test/Semantics/OpenACC/acc-symbols01.f90 +++ b/flang/test/Semantics/OpenACC/acc-symbols01.f90 @@ -1,24 +1,24 @@ ! 
RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenacc -!DEF: /mm MainProgram -program mm - !DEF: /mm/x ObjectEntity REAL(4) - !DEF: /mm/y ObjectEntity REAL(4) +!DEF: /MM MainProgram +program MM + !DEF: /MM/x ObjectEntity REAL(4) + !DEF: /MM/y ObjectEntity REAL(4) real x, y - !DEF: /mm/a ObjectEntity INTEGER(4) - !DEF: /mm/b ObjectEntity INTEGER(4) - !DEF: /mm/c ObjectEntity INTEGER(4) - !DEF: /mm/i ObjectEntity INTEGER(4) + !DEF: /MM/a ObjectEntity INTEGER(4) + !DEF: /MM/b ObjectEntity INTEGER(4) + !DEF: /MM/c ObjectEntity INTEGER(4) + !DEF: /MM/i ObjectEntity INTEGER(4) integer a(10), b(10), c(10), i - !REF: /mm/b + !REF: /MM/b b = 2 !$acc parallel present(c) firstprivate(b) private(a) !$acc loop - !REF: /mm/i + !REF: /MM/i do i=1,10 - !REF: /mm/a - !REF: /mm/i - !REF: /mm/b + !REF: /MM/a + !REF: /MM/i + !REF: /MM/b a(i) = b(i) end do !$acc end parallel diff --git a/flang/test/Semantics/OpenMP/critical_within_default.f90 b/flang/test/Semantics/OpenMP/critical_within_default.f90 index dd972e6e52949..a5fe30eeb7de0 100644 --- a/flang/test/Semantics/OpenMP/critical_within_default.f90 +++ b/flang/test/Semantics/OpenMP/critical_within_default.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols %s | FileCheck %s ! Test that we do not make a private copy of the critical name -!CHECK: MainProgram scope: mn +!CHECK: MainProgram scope: MN !CHECK-NEXT: j size=4 offset=0: ObjectEntity type: INTEGER(4) !CHECK-NEXT: OtherConstruct scope: !CHECK-NEXT: j (OmpPrivate): HostAssoc diff --git a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 index 06f41ab8ce76f..e57a5c0c1cea6 100644 --- a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 +++ b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 @@ -1,7 +1,7 @@ ! 
RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s program main -!CHECK-LABEL: MainProgram scope: main +!CHECK-LABEL: MainProgram scope: MAIN implicit none type ty diff --git a/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 b/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 index 9d0a097fb1991..fc977f2f1b839 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-mangled.f90 @@ -17,7 +17,7 @@ end function mymax end module mymod program omp_examples -!CHECK-LABEL: MainProgram scope: omp_examples +!CHECK-LABEL: MainProgram scope: OMP_EXAMPLES use mymod implicit none integer, parameter :: n = 100 diff --git a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 index d7a9f2fc0a36b..84dbe1af01877 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-operators.f90 @@ -49,7 +49,7 @@ function my_add(x, y) end module m1 program test_vector -!CHECK-LABEL: MainProgram scope: test_vector +!CHECK-LABEL: MainProgram scope: TEST_VECTOR use vector_mod !CHECK: add_vectors (Function): Use from add_vectors in vector_mod implicit none diff --git a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 index 12e80cbf7b327..9cd638d796091 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction-renamedop.f90 @@ -22,7 +22,7 @@ end function my_mul end module module1 program test_omp_reduction -!CHECK: MainProgram scope: test_omp_reduction +!CHECK: MainProgram scope: TEST_OMP_REDUCTION use module1, only: t1, operator(.modmul.) => operator(.mul.) !CHECK: .modmul. (Function): Use from .mul. 
in module1 diff --git a/flang/test/Semantics/OpenMP/declare-reduction.f90 b/flang/test/Semantics/OpenMP/declare-reduction.f90 index ddca38fd57812..1f39c57c54ad1 100644 --- a/flang/test/Semantics/OpenMP/declare-reduction.f90 +++ b/flang/test/Semantics/OpenMP/declare-reduction.f90 @@ -31,7 +31,7 @@ end subroutine initme end function func program main -!CHECK-LABEL: MainProgram scope: main +!CHECK-LABEL: MainProgram scope: MAIN !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) diff --git a/flang/test/Semantics/OpenMP/declare-target03.f90 b/flang/test/Semantics/OpenMP/declare-target03.f90 index 64a299d78224a..48cfc68393873 100644 --- a/flang/test/Semantics/OpenMP/declare-target03.f90 +++ b/flang/test/Semantics/OpenMP/declare-target03.f90 @@ -13,10 +13,10 @@ subroutine bar program main use mod1 - !ERROR: The module name or main program name cannot be in a DECLARE TARGET directive + !ERROR: The module name cannot be in a DECLARE TARGET directive !$omp declare target (mod1) - !PORTABILITY: Name 'main' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] - !ERROR: The module name or main program name cannot be in a DECLARE TARGET directive + ! This is now allowed: "main" is implicitly declared symbol separate + ! from the main program symbol !$omp declare target (main) end diff --git a/flang/test/Semantics/OpenMP/do-schedule03.f90 b/flang/test/Semantics/OpenMP/do-schedule03.f90 index 8787b094d581a..05602ca57e4a9 100644 --- a/flang/test/Semantics/OpenMP/do-schedule03.f90 +++ b/flang/test/Semantics/OpenMP/do-schedule03.f90 @@ -2,27 +2,27 @@ ! OpenMP Version 4.5 ! 2.7.1 Schedule Clause ! Test that does not catch non constant integer expressions like xx - xx. 
- !DEF: /ompdoschedule MainProgram -program ompdoschedule - !DEF: /ompdoschedule/a ObjectEntity REAL(4) - !DEF: /ompdoschedule/y ObjectEntity REAL(4) - !DEF: /ompdoschedule/z ObjectEntity REAL(4) + !DEF: /OMPDOSCHEDULE MainProgram +program OMPDOSCHEDULE + !DEF: /OMPDOSCHEDULE/a ObjectEntity REAL(4) + !DEF: /OMPDOSCHEDULE/y ObjectEntity REAL(4) + !DEF: /OMPDOSCHEDULE/z ObjectEntity REAL(4) real a(100),y(100),z(100) - !DEF: /ompdoschedule/b ObjectEntity INTEGER(4) - !DEF: /ompdoschedule/i ObjectEntity INTEGER(4) - !DEF: /ompdoschedule/n ObjectEntity INTEGER(4) + !DEF: /OMPDOSCHEDULE/b ObjectEntity INTEGER(4) + !DEF: /OMPDOSCHEDULE/i ObjectEntity INTEGER(4) + !DEF: /OMPDOSCHEDULE/n ObjectEntity INTEGER(4) integer b,i,n - !REF: /ompdoschedule/b + !REF: /OMPDOSCHEDULE/b b = 10 !$omp do schedule(static,b-b) - !DEF: /ompdoschedule/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) - !REF: /ompdoschedule/n + !DEF: /OMPDOSCHEDULE/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !REF: /OMPDOSCHEDULE/n do i = 2,n+1 - !REF: /ompdoschedule/y - !REF: /ompdoschedule/OtherConstruct1/i - !REF: /ompdoschedule/z - !REF: /ompdoschedule/a + !REF: /OMPDOSCHEDULE/y + !REF: /OMPDOSCHEDULE/OtherConstruct1/i + !REF: /OMPDOSCHEDULE/z + !REF: /OMPDOSCHEDULE/a y(i) = z(i-1) + a(i) end do !$omp end do -end program ompdoschedule +end program OMPDOSCHEDULE diff --git a/flang/test/Semantics/OpenMP/do01-positivecase.f90 b/flang/test/Semantics/OpenMP/do01-positivecase.f90 index 905fdbaf18476..50a6870f43896 100644 --- a/flang/test/Semantics/OpenMP/do01-positivecase.f90 +++ b/flang/test/Semantics/OpenMP/do01-positivecase.f90 @@ -4,16 +4,16 @@ ! The loop iteration variable may not appear in a firstprivate directive. ! 
A positive case -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) integer i !$omp do firstprivate(k) - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 print *, "Hello" end do !$omp end do -end program omp_do +end program OMP_DO diff --git a/flang/test/Semantics/OpenMP/do04-positivecase.f90 b/flang/test/Semantics/OpenMP/do04-positivecase.f90 index eb2d67bb8ceb2..51b69fce3c7cc 100644 --- a/flang/test/Semantics/OpenMP/do04-positivecase.f90 +++ b/flang/test/Semantics/OpenMP/do04-positivecase.f90 @@ -2,21 +2,21 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop Constructs -!DEF: /omp_do1 MainProgram -program omp_do1 - !DEF: /omp_do1/i ObjectEntity INTEGER(4) - !DEF: /omp_do1/j ObjectEntity INTEGER(4) - !DEF: /omp_do1/k (OmpThreadprivate) ObjectEntity INTEGER(4) - !DEF: /omp_do1/n (OmpThreadprivate) ObjectEntity INTEGER(4) +!DEF: /OMP_DO1 MainProgram +program OMP_DO1 + !DEF: /OMP_DO1/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO1/j ObjectEntity INTEGER(4) + !DEF: /OMP_DO1/k (OmpThreadprivate) ObjectEntity INTEGER(4) + !DEF: /OMP_DO1/n (OmpThreadprivate) ObjectEntity INTEGER(4) integer i, j, k, n !$omp threadprivate (k,n) !$omp do - !DEF: /omp_do1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !REF: /omp_do1/j + !REF: /OMP_DO1/j do j=1,10 print *, "Hello" end do end do !$omp end do -end program omp_do1 +end program OMP_DO1 diff --git a/flang/test/Semantics/OpenMP/do05-positivecase.f90 b/flang/test/Semantics/OpenMP/do05-positivecase.f90 index eda04610535c2..d4eb1fd6bc3da 100644 --- a/flang/test/Semantics/OpenMP/do05-positivecase.f90 +++ b/flang/test/Semantics/OpenMP/do05-positivecase.f90 @@ -3,13 +3,13 @@ ! 
2.7.1 Loop Construct restrictions on single directive. ! A positive case -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) - !DEF: /omp_do/n ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO/n ObjectEntity INTEGER(4) integer i,n !$omp parallel - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 !$omp single print *, "hello" @@ -19,13 +19,13 @@ program omp_do !$omp parallel default(shared) !$omp do - !DEF: /omp_do/OtherConstruct2/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) - !DEF: /omp_do/OtherConstruct2/OtherConstruct1/n HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct2/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct2/OtherConstruct1/n HostAssoc INTEGER(4) do i=1,n !$omp parallel !$omp single !DEF: /work EXTERNAL (Subroutine) ProcEntity - !DEF: /omp_do/OtherConstruct2/OtherConstruct1/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct2/OtherConstruct1/OtherConstruct1/OtherConstruct1/i HostAssoc INTEGER(4) call work(i, 1) !$omp end single !$omp end parallel @@ -34,7 +34,7 @@ program omp_do !$omp end parallel !$omp parallel private(i) - !DEF: /omp_do/OtherConstruct3/i (OmpPrivate, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct3/i (OmpPrivate, OmpExplicit) HostAssoc INTEGER(4) do i=1,10 !$omp single print *, "hello" @@ -43,32 +43,32 @@ program omp_do !$omp end parallel !$omp target teams distribute parallel do - !DEF:/omp_do/OtherConstruct4/i (OmpPrivate ,OmpPreDetermined) HostAssoc INTEGER(4) + !DEF:/OMP_DO/OtherConstruct4/i (OmpPrivate ,OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF:/omp_do/OtherConstruct4/i + !REF:/OMP_DO/OtherConstruct4/i if(i<10) cycle end do !$omp end target teams 
distribute parallel do !$omp target teams distribute parallel do simd - !DEF:/omp_do/OtherConstruct5/i (OmpLinear,OmpPreDetermined) HostAssoc INTEGER(4) + !DEF:/OMP_DO/OtherConstruct5/i (OmpLinear,OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF:/omp_do/OtherConstruct5/i + !REF:/OMP_DO/OtherConstruct5/i if(i<10) cycle end do !$omp end target teams distribute parallel do simd !$omp target teams distribute - !DEF: /omp_do/OtherConstruct6/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct6/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF: /omp_do/OtherConstruct6/i + !REF: /OMP_DO/OtherConstruct6/i if(i < 5) cycle end do !$omp target teams distribute simd - !DEF: /omp_do/OtherConstruct7/i (OmpLinear, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct7/i (OmpLinear, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,100 - !REF: /omp_do/OtherConstruct7/i + !REF: /OMP_DO/OtherConstruct7/i if(i < 5) cycle end do -end program omp_do +end program OMP_DO diff --git a/flang/test/Semantics/OpenMP/do06-positivecases.f90 b/flang/test/Semantics/OpenMP/do06-positivecases.f90 index 2713b55fa2ecb..dfb1d999bbc53 100644 --- a/flang/test/Semantics/OpenMP/do06-positivecases.f90 +++ b/flang/test/Semantics/OpenMP/do06-positivecases.f90 @@ -5,14 +5,14 @@ ! region ever binds to a loop region arising from the loop construct. ! 
A positive case -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) - !DEF: /omp_do/j ObjectEntity INTEGER(4) - !DEF: /omp_do/k ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO/j ObjectEntity INTEGER(4) + !DEF: /OMP_DO/k ObjectEntity INTEGER(4) integer i, j, k !$omp do ordered - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 !$omp ordered !DEF: /my_func EXTERNAL (Subroutine) ProcEntity @@ -20,4 +20,4 @@ program omp_do !$omp end ordered end do !$omp end do -end program omp_do +end program OMP_DO diff --git a/flang/test/Semantics/OpenMP/do11.f90 b/flang/test/Semantics/OpenMP/do11.f90 index faab457efff3c..472048d684276 100644 --- a/flang/test/Semantics/OpenMP/do11.f90 +++ b/flang/test/Semantics/OpenMP/do11.f90 @@ -2,24 +2,24 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop Constructs -!DEF: /omp_do MainProgram -program omp_do - !DEF: /omp_do/i ObjectEntity INTEGER(4) - !DEF: /omp_do/j ObjectEntity INTEGER(4) - !DEF: /omp_do/k ObjectEntity INTEGER(4) +!DEF: /OMP_DO MainProgram +program OMP_DO + !DEF: /OMP_DO/i ObjectEntity INTEGER(4) + !DEF: /OMP_DO/j ObjectEntity INTEGER(4) + !DEF: /OMP_DO/k ObjectEntity INTEGER(4) integer i, j, k !$omp do - !DEF: /omp_do/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_DO/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !REF: /omp_do/j + !REF: /OMP_DO/j do j=1,10 - !REF: /omp_do/OtherConstruct1/i - !REF: /omp_do/j + !REF: /OMP_DO/OtherConstruct1/i + !REF: /OMP_DO/j print *, "it", i, j end do end do !$omp end do -end program omp_do +end program OMP_DO !DEF: /omp_do2 (Subroutine)Subprogram subroutine omp_do2 diff --git a/flang/test/Semantics/OpenMP/do12.f90 b/flang/test/Semantics/OpenMP/do12.f90 index a057a246f7a99..06055b7572a60 100644 --- 
a/flang/test/Semantics/OpenMP/do12.f90 +++ b/flang/test/Semantics/OpenMP/do12.f90 @@ -2,20 +2,20 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop constructs. -!DEF: /omp_cycle MainProgram -program omp_cycle +!DEF: /OMP_CYCLE MainProgram +program OMP_CYCLE !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !REF: /omp_cycle/OtherConstruct1/i + !REF: /OMP_CYCLE/OtherConstruct1/i if (i<1) cycle - !DEF: /omp_cycle/j (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/j (Implicit) ObjectEntity INTEGER(4) do j=0,10 - !DEF: /omp_cycle/k (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/k (Implicit) ObjectEntity INTEGER(4) do k=0,10 - !REF: /omp_cycle/OtherConstruct1/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct1/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -23,17 +23,17 @@ program omp_cycle !$omp end do !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !REF: /omp_cycle/j + !REF: /OMP_CYCLE/j do j=0,10 - !REF: /omp_cycle/OtherConstruct2/i + !REF: /OMP_CYCLE/OtherConstruct2/i if (i<1) cycle - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 - !REF: /omp_cycle/OtherConstruct2/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct2/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -41,17 +41,17 @@ program omp_cycle !$omp end do !$omp do collapse(2) - !DEF: /omp_cycle/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: 
/OMP_CYCLE/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do j=0,10 - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 - !REF: /omp_cycle/OtherConstruct3/i + !REF: /OMP_CYCLE/OtherConstruct3/i if (i<1) cycle - !REF: /omp_cycle/OtherConstruct3/i - !REF: /omp_cycle/OtherConstruct3/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct3/i + !REF: /OMP_CYCLE/OtherConstruct3/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -59,17 +59,17 @@ program omp_cycle !$omp end do !$omp do collapse(3) - !DEF: /omp_cycle/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do j=0,10 - !DEF: /omp_cycle/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do k=0,10 - !REF: /omp_cycle/OtherConstruct4/i + !REF: /OMP_CYCLE/OtherConstruct4/i if (i<1) cycle - !REF: /omp_cycle/OtherConstruct4/i - !REF: /omp_cycle/OtherConstruct4/j - !REF: /omp_cycle/OtherConstruct4/k + !REF: /OMP_CYCLE/OtherConstruct4/i + !REF: /OMP_CYCLE/OtherConstruct4/j + !REF: /OMP_CYCLE/OtherConstruct4/k print *, i, j, k end do end do @@ -77,20 +77,20 @@ program omp_cycle !$omp end do !$omp do collapse(3) - !DEF: /omp_cycle/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo:do i=0,10 - !DEF: /omp_cycle/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1:do j=0,10 - !DEF: /omp_cycle/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: 
/OMP_CYCLE/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo2:do k=0,10 - !REF: /omp_cycle/OtherConstruct5/i + !REF: /OMP_CYCLE/OtherConstruct5/i if (i<1) cycle foo2 - !REF: /omp_cycle/OtherConstruct5/i - !REF: /omp_cycle/OtherConstruct5/j - !REF: /omp_cycle/OtherConstruct5/k + !REF: /OMP_CYCLE/OtherConstruct5/i + !REF: /OMP_CYCLE/OtherConstruct5/j + !REF: /OMP_CYCLE/OtherConstruct5/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp end do -end program omp_cycle +end program OMP_CYCLE diff --git a/flang/test/Semantics/OpenMP/do14.f90 b/flang/test/Semantics/OpenMP/do14.f90 index 5e8a5a64c2979..e17647394fff7 100644 --- a/flang/test/Semantics/OpenMP/do14.f90 +++ b/flang/test/Semantics/OpenMP/do14.f90 @@ -2,19 +2,19 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop constructs. -!DEF: /omp_cycle MainProgram -program omp_cycle +!DEF: /OMP_CYCLE MainProgram +program OMP_CYCLE !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 cycle - !DEF: /omp_cycle/j (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/j (Implicit) ObjectEntity INTEGER(4) do j=0,10 - !DEF: /omp_cycle/k (Implicit) ObjectEntity INTEGER(4) + !DEF: /OMP_CYCLE/k (Implicit) ObjectEntity INTEGER(4) do k=0,10 - !REF: /omp_cycle/OtherConstruct1/i - !REF: /omp_cycle/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct1/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -22,16 +22,16 @@ program omp_cycle !$omp end do !$omp do collapse(1) - !DEF: /omp_cycle/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !REF: /omp_cycle/j + !REF: /OMP_CYCLE/j do j=0,10 cycle - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 - !REF: /omp_cycle/OtherConstruct2/i - !REF: /omp_cycle/j - !REF: 
/omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct2/i + !REF: /OMP_CYCLE/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -39,16 +39,16 @@ program omp_cycle !$omp end do !$omp do collapse(2) - !DEF: /omp_cycle/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct3/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do j=0,10 - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/k do k=0,10 cycle - !REF: /omp_cycle/OtherConstruct3/i - !REF: /omp_cycle/OtherConstruct3/j - !REF: /omp_cycle/k + !REF: /OMP_CYCLE/OtherConstruct3/i + !REF: /OMP_CYCLE/OtherConstruct3/j + !REF: /OMP_CYCLE/k print *, i, j, k end do end do @@ -56,16 +56,16 @@ program omp_cycle !$omp end do !$omp do collapse(3) - !DEF: /omp_cycle/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=0,10 - !DEF: /omp_cycle/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do j=0,10 - !DEF: /omp_cycle/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct4/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do k=0,10 cycle - !REF: /omp_cycle/OtherConstruct4/i - !REF: /omp_cycle/OtherConstruct4/j - !REF: /omp_cycle/OtherConstruct4/k + !REF: /OMP_CYCLE/OtherConstruct4/i + !REF: /OMP_CYCLE/OtherConstruct4/j + !REF: /OMP_CYCLE/OtherConstruct4/k print *, i, j, k end do end do @@ -73,19 +73,19 @@ program omp_cycle !$omp end do !$omp do ordered(3) - !DEF: /omp_cycle/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) 
foo:do i=0,10 - !DEF: /omp_cycle/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1:do j=0,10 - !DEF: /omp_cycle/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_CYCLE/OtherConstruct5/k (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo2:do k=0,10 cycle foo2 - !REF: /omp_cycle/OtherConstruct5/i - !REF: /omp_cycle/OtherConstruct5/j - !REF: /omp_cycle/OtherConstruct5/k + !REF: /OMP_CYCLE/OtherConstruct5/i + !REF: /OMP_CYCLE/OtherConstruct5/j + !REF: /OMP_CYCLE/OtherConstruct5/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp end do -end program omp_cycle +end program OMP_CYCLE diff --git a/flang/test/Semantics/OpenMP/do17.f90 b/flang/test/Semantics/OpenMP/do17.f90 index c0c59f16dee1b..cac11f215f074 100644 --- a/flang/test/Semantics/OpenMP/do17.f90 +++ b/flang/test/Semantics/OpenMP/do17.f90 @@ -2,56 +2,56 @@ ! OpenMP Version 4.5 ! 2.7.1 Do Loop constructs. 
-!DEF: /test MainProgram -program test - !DEF: /test/i ObjectEntity INTEGER(4) - !DEF: /test/j ObjectEntity INTEGER(4) - !DEF: /test/k ObjectEntity INTEGER(4) +!DEF: /TEST MainProgram +program TEST + !DEF: /TEST/i ObjectEntity INTEGER(4) + !DEF: /TEST/j ObjectEntity INTEGER(4) + !DEF: /TEST/k ObjectEntity INTEGER(4) integer i, j, k !$omp do collapse(2) - !DEF: /test/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo: do i=0,10 - !DEF: /test/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct1/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1: do j=0,10 - !REF: /test/k + !REF: /TEST/k foo2: do k=0,10 - !REF: /test/OtherConstruct1/i + !REF: /TEST/OtherConstruct1/i select case (i) case (5) cycle foo1 case (7) cycle foo2 end select - !REF: /test/OtherConstruct1/i - !REF: /test/OtherConstruct1/j - !REF: /test/k + !REF: /TEST/OtherConstruct1/i + !REF: /TEST/OtherConstruct1/j + !REF: /TEST/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp do collapse(2) - !DEF: /test/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo: do i=0,10 - !DEF: /test/OtherConstruct2/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /TEST/OtherConstruct2/j (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) foo1: do j=0,10 - !REF: /test/k + !REF: /TEST/k foo2: do k=0,10 - !REF: /test/OtherConstruct2/i + !REF: /TEST/OtherConstruct2/i if (i<3) then cycle foo1 - !REF: /test/OtherConstruct2/i + !REF: /TEST/OtherConstruct2/i else if (i>8) then cycle foo1 else cycle foo2 end if - !REF: /test/OtherConstruct2/i - !REF: /test/OtherConstruct2/j - !REF: /test/k + !REF: /TEST/OtherConstruct2/i + !REF: /TEST/OtherConstruct2/j + !REF: /TEST/k print *, i, j, k end do foo2 end do foo1 end do foo !$omp end do -end program test 
+end program TEST diff --git a/flang/test/Semantics/OpenMP/map-clause-symbols.f90 b/flang/test/Semantics/OpenMP/map-clause-symbols.f90 index 8f984fcd2fa7e..1d6315b4a2312 100644 --- a/flang/test/Semantics/OpenMP/map-clause-symbols.f90 +++ b/flang/test/Semantics/OpenMP/map-clause-symbols.f90 @@ -1,6 +1,6 @@ ! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s program main -!CHECK-LABEL: MainProgram scope: main +!CHECK-LABEL: MainProgram scope: MAIN integer, parameter :: n = 256 real(8) :: a(256) !$omp target map(mapper(xx), from:a) diff --git a/flang/test/Semantics/OpenMP/reduction08.f90 b/flang/test/Semantics/OpenMP/reduction08.f90 index 01a06eb7d7414..b4a81e644c1e7 100644 --- a/flang/test/Semantics/OpenMP/reduction08.f90 +++ b/flang/test/Semantics/OpenMP/reduction08.f90 @@ -2,62 +2,62 @@ ! OpenMP Version 4.5 ! 2.15.3.6 Reduction Clause Positive cases -!DEF: /omp_reduction MainProgram -program omp_reduction - !DEF: /omp_reduction/i ObjectEntity INTEGER(4) +!DEF: /OMP_REDUCTION MainProgram +program OMP_REDUCTION + !DEF: /OMP_REDUCTION/i ObjectEntity INTEGER(4) integer i - !DEF: /omp_reduction/k ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/k ObjectEntity INTEGER(4) integer :: k = 10 - !DEF: /omp_reduction/m ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/m ObjectEntity INTEGER(4) integer :: m = 12 !$omp parallel do reduction(max:k) - !DEF: /omp_reduction/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/max ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct1/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/max ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: 
/OMP_REDUCTION/OtherConstruct1/m (OmpShared) HostAssoc INTEGER(4) k = max(k, m) end do !$omp end parallel do !$omp parallel do reduction(min:k) - !DEF: /omp_reduction/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct2/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct2/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/min ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct2/m (OmpShared) HostAssoc INTEGER(4) k = min(k, m) end do !$omp end parallel do !$omp parallel do reduction(iand:k) - !DEF: /omp_reduction/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct3/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/iand ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct3/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/iand ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct3/m (OmpShared) HostAssoc INTEGER(4) k = iand(k, m) end do !$omp end parallel do !$omp parallel do reduction(ior:k) - !DEF: /omp_reduction/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct4/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/ior ELEMENTAL, INTRINSIC, PURE 
(Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct4/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/ior ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct4/m (OmpShared) HostAssoc INTEGER(4) k = ior(k, m) end do !$omp end parallel do !$omp parallel do reduction(ieor:k) - !DEF: /omp_reduction/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct5/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) - !DEF: /omp_reduction/ieor ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !DEF: /omp_reduction/OtherConstruct5/m (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/ieor ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !DEF: /OMP_REDUCTION/OtherConstruct5/m (OmpShared) HostAssoc INTEGER(4) k = ieor(k,m) end do !$omp end parallel do -end program omp_reduction +end program OMP_REDUCTION diff --git a/flang/test/Semantics/OpenMP/reduction09.f90 b/flang/test/Semantics/OpenMP/reduction09.f90 index d6c71c30d2834..ca60805e8c416 100644 --- a/flang/test/Semantics/OpenMP/reduction09.f90 +++ b/flang/test/Semantics/OpenMP/reduction09.f90 @@ -1,22 +1,22 @@ ! RUN: %python %S/../test_symbols.py %s %flang_fc1 -fopenmp ! OpenMP Version 4.5 ! 2.15.3.6 Reduction Clause Positive cases. 
-!DEF: /omp_reduction MainProgram -program omp_reduction - !DEF: /omp_reduction/i ObjectEntity INTEGER(4) +!DEF: /OMP_REDUCTION MainProgram +program OMP_REDUCTION + !DEF: /OMP_REDUCTION/i ObjectEntity INTEGER(4) integer i - !DEF: /omp_reduction/k ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/k ObjectEntity INTEGER(4) integer :: k = 10 - !DEF: /omp_reduction/a ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/a ObjectEntity INTEGER(4) integer a(10) - !DEF: /omp_reduction/b ObjectEntity INTEGER(4) + !DEF: /OMP_REDUCTION/b ObjectEntity INTEGER(4) integer b(10,10,10) !$omp parallel shared(k) !$omp do reduction(+:k) - !DEF: /omp_reduction/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct1/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct1/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) k = k+1 end do !$omp end do @@ -24,53 +24,53 @@ program omp_reduction !$omp parallel do reduction(+:a(10)) - !DEF: /omp_reduction/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct2/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct2/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel do reduction(+:a(1:10:1)) - !DEF: /omp_reduction/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct3/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct3/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel do 
reduction(+:b(1:10:1,1:5,2)) - !DEF: /omp_reduction/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct4/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct4/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel do reduction(+:b(1:10:1,1:5,2:5:1)) - !DEF: /omp_reduction/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct5/k (OmpShared) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct5/k (OmpShared) HostAssoc INTEGER(4) k = k+1 end do !$omp end parallel do !$omp parallel private(i) !$omp do reduction(+:k) reduction(+:j) - !DEF: /omp_reduction/OtherConstruct6/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct6/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct6/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct6/OtherConstruct1/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) k = k+1 end do !$omp end do !$omp end parallel !$omp do reduction(+:k) reduction(*:j) reduction(+:l) - !DEF: /omp_reduction/OtherConstruct7/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct7/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /omp_reduction/OtherConstruct7/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) + !DEF: /OMP_REDUCTION/OtherConstruct7/k (OmpReduction, OmpExplicit) HostAssoc INTEGER(4) k = k+1 end do !$omp end do -end program omp_reduction +end program OMP_REDUCTION diff --git a/flang/test/Semantics/OpenMP/reduction11.f90 
b/flang/test/Semantics/OpenMP/reduction11.f90 index b2ad0f6a6ee11..dfb3986d37d78 100644 --- a/flang/test/Semantics/OpenMP/reduction11.f90 +++ b/flang/test/Semantics/OpenMP/reduction11.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols -o - %s 2>&1 | FileCheck %s ! Check intrinsic reduction symbols (in this case "max" are marked as INTRINSIC -! CHECK: MainProgram scope: omp_reduction +! CHECK: MainProgram scope: OMP_REDUCTION program omp_reduction ! CHECK: i size=4 offset=0: ObjectEntity type: INTEGER(4) integer i diff --git a/flang/test/Semantics/OpenMP/scan2.f90 b/flang/test/Semantics/OpenMP/scan2.f90 index ffe84910f88a2..1ae5e871595c4 100644 --- a/flang/test/Semantics/OpenMP/scan2.f90 +++ b/flang/test/Semantics/OpenMP/scan2.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fopenmp -fdebug-dump-symbols -o - %s 2>&1 | FileCheck %s ! Check scan reduction -! CHECK: MainProgram scope: omp_reduction +! CHECK: MainProgram scope: OMP_REDUCTION program omp_reduction ! CHECK: i size=4 offset=0: ObjectEntity type: INTEGER(4) integer i diff --git a/flang/test/Semantics/OpenMP/symbol01.f90 b/flang/test/Semantics/OpenMP/symbol01.f90 index fbd9a0286c79b..74fb420cc517e 100644 --- a/flang/test/Semantics/OpenMP/symbol01.f90 +++ b/flang/test/Semantics/OpenMP/symbol01.f90 @@ -16,53 +16,53 @@ module md integer :: b end type myty end module md -!DEF: /mm MainProgram -program mm +!DEF: /MM MainProgram +program MM !REF: /md use :: md - !DEF: /mm/c CommonBlockDetails - !DEF: /mm/x (InCommonBlock) ObjectEntity REAL(4) - !DEF: /mm/y (InCommonBlock) ObjectEntity REAL(4) + !DEF: /MM/c CommonBlockDetails + !DEF: /MM/x (InCommonBlock) ObjectEntity REAL(4) + !DEF: /MM/y (InCommonBlock) ObjectEntity REAL(4) common /c/x, y - !REF: /mm/x - !REF: /mm/y + !REF: /MM/x + !REF: /MM/y real x, y - !DEF: /mm/myty Use - !DEF: /mm/t ObjectEntity TYPE(myty) + !DEF: /MM/myty Use + !DEF: /MM/t ObjectEntity TYPE(myty) type(myty) :: t - !DEF: /mm/b ObjectEntity INTEGER(4) + !DEF: /MM/b ObjectEntity 
INTEGER(4) integer b(10) - !REF: /mm/t + !REF: /MM/t !REF: /md/myty/a t%a = 3.14 - !REF: /mm/t + !REF: /MM/t !REF: /md/myty/b t%b = 1 - !REF: /mm/b + !REF: /MM/b b = 2 - !DEF: /mm/a (Implicit) ObjectEntity REAL(4) + !DEF: /MM/a (Implicit) ObjectEntity REAL(4) a = 1.0 - !DEF: /mm/c (Implicit) ObjectEntity REAL(4) + !DEF: /MM/c (Implicit) ObjectEntity REAL(4) c = 2.0 !$omp parallel do private(a,t,/c/) shared(c) - !DEF: /mm/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + !DEF: /MM/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) do i=1,10 - !DEF: /mm/OtherConstruct1/a (OmpPrivate, OmpExplicit) HostAssoc REAL(4) - !DEF: /mm/OtherConstruct1/b (OmpShared) HostAssoc INTEGER(4) - !REF: /mm/OtherConstruct1/i + !DEF: /MM/OtherConstruct1/a (OmpPrivate, OmpExplicit) HostAssoc REAL(4) + !DEF: /MM/OtherConstruct1/b (OmpShared) HostAssoc INTEGER(4) + !REF: /MM/OtherConstruct1/i a = a+b(i) - !DEF: /mm/OtherConstruct1/t (OmpPrivate, OmpExplicit) HostAssoc TYPE(myty) + !DEF: /MM/OtherConstruct1/t (OmpPrivate, OmpExplicit) HostAssoc TYPE(myty) !REF: /md/myty/a - !REF: /mm/OtherConstruct1/i + !REF: /MM/OtherConstruct1/i t%a = i - !DEF: /mm/OtherConstruct1/y (OmpPrivate, OmpExplicit) HostAssoc REAL(4) + !DEF: /MM/OtherConstruct1/y (OmpPrivate, OmpExplicit) HostAssoc REAL(4) y = 0. 
- !DEF: /mm/OtherConstruct1/x (OmpPrivate, OmpExplicit) HostAssoc REAL(4) - !REF: /mm/OtherConstruct1/a - !REF: /mm/OtherConstruct1/i - !REF: /mm/OtherConstruct1/y + !DEF: /MM/OtherConstruct1/x (OmpPrivate, OmpExplicit) HostAssoc REAL(4) + !REF: /MM/OtherConstruct1/a + !REF: /MM/OtherConstruct1/i + !REF: /MM/OtherConstruct1/y x = a+i+y - !DEF: /mm/OtherConstruct1/c (OmpShared, OmpExplicit) HostAssoc REAL(4) + !DEF: /MM/OtherConstruct1/c (OmpShared, OmpExplicit) HostAssoc REAL(4) c = 3.0 end do end program diff --git a/flang/test/Semantics/OpenMP/symbol05.f90 b/flang/test/Semantics/OpenMP/symbol05.f90 index fe01f15d20aa3..4f3d1926013dc 100644 --- a/flang/test/Semantics/OpenMP/symbol05.f90 +++ b/flang/test/Semantics/OpenMP/symbol05.f90 @@ -31,10 +31,10 @@ subroutine foo end block end subroutine foo end module mm -!DEF: /tt MainProgram -program tt +!DEF: /TT MainProgram +program TT !REF: /mm use :: mm - !DEF: /tt/foo (Subroutine) Use + !DEF: /TT/foo (Subroutine) Use call foo -end program tt +end program TT diff --git a/flang/test/Semantics/OpenMP/symbol07.f90 b/flang/test/Semantics/OpenMP/symbol07.f90 index 86b7305411347..1b0c25b7a04b0 100644 --- a/flang/test/Semantics/OpenMP/symbol07.f90 +++ b/flang/test/Semantics/OpenMP/symbol07.f90 @@ -30,8 +30,8 @@ subroutine function_call_in_region !REF: /function_call_in_region/b print *, a, b end subroutine function_call_in_region -!DEF: /mm MainProgram -program mm +!DEF: /MM MainProgram +program MM !REF: /function_call_in_region call function_call_in_region -end program mm +end program MM diff --git a/flang/test/Semantics/OpenMP/symbol09.f90 b/flang/test/Semantics/OpenMP/symbol09.f90 index 86b7305411347..1b0c25b7a04b0 100644 --- a/flang/test/Semantics/OpenMP/symbol09.f90 +++ b/flang/test/Semantics/OpenMP/symbol09.f90 @@ -30,8 +30,8 @@ subroutine function_call_in_region !REF: /function_call_in_region/b print *, a, b end subroutine function_call_in_region -!DEF: /mm MainProgram -program mm +!DEF: /MM MainProgram +program MM 
!REF: /function_call_in_region call function_call_in_region -end program mm +end program MM diff --git a/flang/test/Semantics/OpenMP/threadprivate03.f90 b/flang/test/Semantics/OpenMP/threadprivate03.f90 index 81e26ee327a9d..fda2fe608ac3c 100644 --- a/flang/test/Semantics/OpenMP/threadprivate03.f90 +++ b/flang/test/Semantics/OpenMP/threadprivate03.f90 @@ -10,11 +10,11 @@ program main use mod1 integer, parameter :: i = 1 - !ERROR: The module name or main program name cannot be in a THREADPRIVATE directive + !ERROR: The module name cannot be in a THREADPRIVATE directive !$omp threadprivate(mod1) - !PORTABILITY: Name 'main' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] - !ERROR: The module name or main program name cannot be in a THREADPRIVATE directive + ! This is now allowed, since "main" is implicitly declared symbol, + ! separate from the main program symbol. !$omp threadprivate(main) !ERROR: The entity with PARAMETER attribute cannot be in a THREADPRIVATE directive diff --git a/flang/test/Semantics/getsymbols03-a.f90 b/flang/test/Semantics/getsymbols03-a.f90 index 95b7fb418367d..5c5e87575a9cb 100644 --- a/flang/test/Semantics/getsymbols03-a.f90 +++ b/flang/test/Semantics/getsymbols03-a.f90 @@ -8,7 +8,7 @@ program main end program ! RUN: %flang_fc1 -fget-symbols-sources %s 2>&1 | FileCheck %s +! CHECK:MAIN:{{.*}}getsymbols03-a.f90, 4, 9-13 ! CHECK:f:{{.*}}getsymbols03-b.f90, 2, 12-13 -! CHECK:main:{{.*}}getsymbols03-a.f90, 4, 9-13 ! CHECK:mm3:{{.*}}getsymbols03-a.f90, 5, 6-9 ! CHECK:x:{{.*}}getsymbols03-a.f90, 6, 13-14 diff --git a/flang/test/Semantics/long-name.f90 b/flang/test/Semantics/long-name.f90 index 44899b13edd5a..d5a795113e204 100644 --- a/flang/test/Semantics/long-name.f90 +++ b/flang/test/Semantics/long-name.f90 @@ -1,6 +1,6 @@ ! 
RUN: %python %S/test_errors.py %s %flang_fc1 -Werror -pedantic -!PORTABILITY: aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffggg1 has length 64, which is greater than the maximum name length 63 [-Wlong-names] +!PORTABILITY: AAAAAAAAAABBBBBBBBBBCCCCCCCCCCDDDDDDDDDDEEEEEEEEEEFFFFFFFFFFGGG1 has length 64, which is greater than the maximum name length 63 [-Wlong-names] program aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffggg1 !PORTABILITY: aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeffffffffffggg2 has length 64, which is greater than the maximum name length 63 [-Wlong-names] diff --git a/flang/test/Semantics/modproc01.f90 b/flang/test/Semantics/modproc01.f90 index 5f45362e95093..e565ddcfbe0b1 100644 --- a/flang/test/Semantics/modproc01.f90 +++ b/flang/test/Semantics/modproc01.f90 @@ -125,7 +125,7 @@ program test x = mf(3, "abc", pdt1(1,3)()) ! call ms(mf) end program -!CHECK: MainProgram scope: test size=88 alignment=8 +!CHECK: MainProgram scope: TEST size=88 alignment=8 !CHECK: mf, MODULE (Function): Use from mf in m !CHECK: pdt1: Use from pdt1 in m !CHECK: pdt2: Use from pdt2 in m diff --git a/flang/test/Semantics/multi-programs04.f90 b/flang/test/Semantics/multi-programs04.f90 index 54b0235aa78f0..e69ac7325278e 100644 --- a/flang/test/Semantics/multi-programs04.f90 +++ b/flang/test/Semantics/multi-programs04.f90 @@ -4,6 +4,6 @@ program m end !ERROR: A source file cannot contain more than one main program -!ERROR: 'm' is already declared in this scoping unit +!ERROR: 'M' is already declared in this scoping unit program m end diff --git a/flang/test/Semantics/pointer01.f90 b/flang/test/Semantics/pointer01.f90 index eaa2426dd77e3..79d6016a6af46 100644 --- a/flang/test/Semantics/pointer01.f90 +++ b/flang/test/Semantics/pointer01.f90 @@ -7,7 +7,6 @@ subroutine msubr end module program main use m - !PORTABILITY: Name 'main' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] pointer main 
!ERROR: Cannot change POINTER attribute on use-associated 'mobj' pointer mobj diff --git a/flang/test/Semantics/procinterface01.f90 b/flang/test/Semantics/procinterface01.f90 index 73040b0987bd0..70f4a889d6809 100644 --- a/flang/test/Semantics/procinterface01.f90 +++ b/flang/test/Semantics/procinterface01.f90 @@ -159,35 +159,35 @@ end function logical tan = "?" end function tan -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN !REF: /module1 use :: module1 - !DEF: /main/derived1 Use - !DEF: /main/instance ObjectEntity TYPE(derived1) + !DEF: /MAIN/derived1 Use + !DEF: /MAIN/instance ObjectEntity TYPE(derived1) type(derived1) :: instance - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p1 if (instance%p1(1.)/=2.) print *, "p1 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p2 if (instance%p2(1.)/=2.) print *, "p2 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p3 if (.not.instance%p3(1.)) print *, "p3 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p4 if (.not.instance%p4(1.)) print *, "p4 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p5 if (instance%p5(1.)/=(5.,6.)) print *, "p5 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p6 if (instance%p6(1.)/=2.) print *, "p6 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p7 if (instance%p7(0.)/=1.) print *, "p7 failed" - !REF: /main/instance + !REF: /MAIN/instance !REF: /module1/derived1/p8 if (instance%p8(1.)/="a") print *, "p8 failed" -end program main +end program MAIN diff --git a/flang/test/Semantics/resolve05.f90 b/flang/test/Semantics/resolve05.f90 index 0c9877af9b4e2..7b142d2ebd613 100644 --- a/flang/test/Semantics/resolve05.f90 +++ b/flang/test/Semantics/resolve05.f90 @@ -1,6 +1,5 @@ ! 
RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic program p - !PORTABILITY: Name 'p' declared in a main program should not have the same name as the main program [-Wbenign-name-clash] integer :: p end module m diff --git a/flang/test/Semantics/resolve125.f90 b/flang/test/Semantics/resolve125.f90 index e040c006ec179..620c7d65578cd 100644 --- a/flang/test/Semantics/resolve125.f90 +++ b/flang/test/Semantics/resolve125.f90 @@ -43,7 +43,7 @@ subroutine reset end subroutine reset end module m2 -!CHECK: MainProgram scope: main +!CHECK: MainProgram scope: MAIN !CHECK: i: Use from i in m2 !CHECK: i2: Use from i2 in m2 !CHECK: init (Subroutine): Use from init in m2 @@ -61,4 +61,4 @@ program main else print *, "fail" end if -end program main \ No newline at end of file +end program main diff --git a/flang/test/Semantics/symbol03.f90 b/flang/test/Semantics/symbol03.f90 index a6b4b0bd15937..62472495d9736 100644 --- a/flang/test/Semantics/symbol03.f90 +++ b/flang/test/Semantics/symbol03.f90 @@ -1,23 +1,23 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 ! Test host association in internal subroutine of main program. 
-!DEF: /main MainProgram -program main - !DEF: /main/x ObjectEntity INTEGER(4) +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/x ObjectEntity INTEGER(4) integer x - !DEF: /main/s (Subroutine) Subprogram + !DEF: /MAIN/s (Subroutine) Subprogram call s contains - !REF: /main/s + !REF: /MAIN/s subroutine s - !DEF: /main/s/y (Implicit) ObjectEntity REAL(4) - !DEF: /main/s/x HostAssoc INTEGER(4) + !DEF: /MAIN/s/y (Implicit) ObjectEntity REAL(4) + !DEF: /MAIN/s/x HostAssoc INTEGER(4) y = x contains - !DEF: /main/s/s2 (Subroutine) Subprogram + !DEF: /MAIN/s/s2 (Subroutine) Subprogram subroutine s2 - !DEF: /main/s/s2/z (Implicit) ObjectEntity REAL(4) - !DEF: /main/s/s2/x HostAssoc INTEGER(4) + !DEF: /MAIN/s/s2/z (Implicit) ObjectEntity REAL(4) + !DEF: /MAIN/s/s2/x HostAssoc INTEGER(4) z = x end subroutine end subroutine diff --git a/flang/test/Semantics/symbol06.f90 b/flang/test/Semantics/symbol06.f90 index bbd6d4d071c89..b45edabcd5318 100644 --- a/flang/test/Semantics/symbol06.f90 +++ b/flang/test/Semantics/symbol06.f90 @@ -1,56 +1,56 @@ ! 
RUN: %python %S/test_symbols.py %s %flang_fc1 -!DEF: /main MainProgram -program main - !DEF: /main/t1 DerivedType +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/t1 DerivedType type :: t1 - !DEF: /main/t1/a1 ObjectEntity INTEGER(4) + !DEF: /MAIN/t1/a1 ObjectEntity INTEGER(4) integer :: a1 end type - !REF: /main/t1 - !DEF: /main/t2 DerivedType + !REF: /MAIN/t1 + !DEF: /MAIN/t2 DerivedType type, extends(t1) :: t2 - !DEF: /main/t2/a2 ObjectEntity INTEGER(4) + !DEF: /MAIN/t2/a2 ObjectEntity INTEGER(4) integer :: a2 end type - !REF: /main/t2 - !DEF: /main/t3 DerivedType + !REF: /MAIN/t2 + !DEF: /MAIN/t3 DerivedType type, extends(t2) :: t3 - !DEF: /main/t3/a3 ObjectEntity INTEGER(4) + !DEF: /MAIN/t3/a3 ObjectEntity INTEGER(4) integer :: a3 end type - !REF: /main/t3 - !DEF: /main/x3 ObjectEntity TYPE(t3) + !REF: /MAIN/t3 + !DEF: /MAIN/x3 ObjectEntity TYPE(t3) type(t3) :: x3 - !DEF: /main/i ObjectEntity INTEGER(4) + !DEF: /MAIN/i ObjectEntity INTEGER(4) integer i - !REF: /main/i - !REF: /main/x3 - !REF: /main/t2/a2 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t2/a2 i = x3%a2 - !REF: /main/i - !REF: /main/x3 - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t1/a1 i = x3%a1 - !REF: /main/i - !REF: /main/x3 - !DEF: /main/t3/t2 (ParentComp) ObjectEntity TYPE(t2) - !REF: /main/t2/a2 + !REF: /MAIN/i + !REF: /MAIN/x3 + !DEF: /MAIN/t3/t2 (ParentComp) ObjectEntity TYPE(t2) + !REF: /MAIN/t2/a2 i = x3%t2%a2 - !REF: /main/i - !REF: /main/x3 - !REF: /main/t3/t2 - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t3/t2 + !REF: /MAIN/t1/a1 i = x3%t2%a1 - !REF: /main/i - !REF: /main/x3 - !DEF: /main/t2/t1 (ParentComp) ObjectEntity TYPE(t1) - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !DEF: /MAIN/t2/t1 (ParentComp) ObjectEntity TYPE(t1) + !REF: /MAIN/t1/a1 i = x3%t1%a1 - !REF: /main/i - !REF: /main/x3 - !REF: /main/t3/t2 - !REF: /main/t2/t1 - !REF: /main/t1/a1 + !REF: /MAIN/i + !REF: /MAIN/x3 + !REF: /MAIN/t3/t2 + !REF: /MAIN/t2/t1 + !REF: 
/MAIN/t1/a1 i = x3%t2%t1%a1 end program diff --git a/flang/test/Semantics/symbol07.f90 b/flang/test/Semantics/symbol07.f90 index f3cc934e51b16..e1d8257b9e190 100644 --- a/flang/test/Semantics/symbol07.f90 +++ b/flang/test/Semantics/symbol07.f90 @@ -1,40 +1,40 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN implicit complex(z) - !DEF: /main/t DerivedType + !DEF: /MAIN/t DerivedType type :: t - !DEF: /main/t/re ObjectEntity REAL(4) + !DEF: /MAIN/t/re ObjectEntity REAL(4) real :: re - !DEF: /main/t/im ObjectEntity REAL(4) + !DEF: /MAIN/t/im ObjectEntity REAL(4) real :: im end type - !DEF: /main/z1 ObjectEntity COMPLEX(4) + !DEF: /MAIN/z1 ObjectEntity COMPLEX(4) complex z1 - !REF: /main/t - !DEF: /main/w ObjectEntity TYPE(t) + !REF: /MAIN/t + !DEF: /MAIN/w ObjectEntity TYPE(t) type(t) :: w - !DEF: /main/x ObjectEntity REAL(4) - !DEF: /main/y ObjectEntity REAL(4) + !DEF: /MAIN/x ObjectEntity REAL(4) + !DEF: /MAIN/y ObjectEntity REAL(4) real x, y - !REF: /main/x - !REF: /main/z1 + !REF: /MAIN/x + !REF: /MAIN/z1 x = z1%re - !REF: /main/y - !REF: /main/z1 + !REF: /MAIN/y + !REF: /MAIN/z1 y = z1%im - !DEF: /main/z2 (Implicit) ObjectEntity COMPLEX(4) - !REF: /main/x + !DEF: /MAIN/z2 (Implicit) ObjectEntity COMPLEX(4) + !REF: /MAIN/x z2%re = x - !REF: /main/z2 - !REF: /main/y + !REF: /MAIN/z2 + !REF: /MAIN/y z2%im = y - !REF: /main/x - !REF: /main/w - !REF: /main/t/re + !REF: /MAIN/x + !REF: /MAIN/w + !REF: /MAIN/t/re x = w%re - !REF: /main/y - !REF: /main/w - !REF: /main/t/im + !REF: /MAIN/y + !REF: /MAIN/w + !REF: /MAIN/t/im y = w%im end program diff --git a/flang/test/Semantics/symbol08.f90 b/flang/test/Semantics/symbol08.f90 index 61dab798955c5..933ff6d0c2ba8 100644 --- a/flang/test/Semantics/symbol08.f90 +++ b/flang/test/Semantics/symbol08.f90 @@ -1,15 +1,15 @@ ! 
RUN: %python %S/test_symbols.py %s %flang_fc1 -!DEF: /main MainProgram -program main - !DEF: /main/x POINTER ObjectEntity REAL(4) +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/x POINTER ObjectEntity REAL(4) pointer :: x - !REF: /main/x + !REF: /MAIN/x real x - !DEF: /main/y EXTERNAL, POINTER (Function) ProcEntity REAL(4) + !DEF: /MAIN/y EXTERNAL, POINTER (Function) ProcEntity REAL(4) pointer :: y - !REF: /main/y + !REF: /MAIN/y procedure (real) :: y - !DEF: /main/z (Implicit) ObjectEntity REAL(4) - !REF: /main/y + !DEF: /MAIN/z (Implicit) ObjectEntity REAL(4) + !REF: /MAIN/y z = y() end program diff --git a/flang/test/Semantics/symbol15.f90 b/flang/test/Semantics/symbol15.f90 index df10942e6af2d..79a45491306ef 100644 --- a/flang/test/Semantics/symbol15.f90 +++ b/flang/test/Semantics/symbol15.f90 @@ -249,15 +249,15 @@ subroutine ext2 !DEF: /ext3 (Subroutine) Subprogram subroutine ext3 end subroutine -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN !REF: /m use :: m - !DEF: /main/pdt1 Use - !DEF: /main/pdt1y ObjectEntity TYPE(pdt1(k=2_4)) + !DEF: /MAIN/pdt1 Use + !DEF: /MAIN/pdt1y ObjectEntity TYPE(pdt1(k=2_4)) type(pdt1(2)) :: pdt1y - !DEF: /main/pdt2 Use - !DEF: /main/pdt2y ObjectEntity TYPE(pdt2(k=2_4)) + !DEF: /MAIN/pdt2 Use + !DEF: /MAIN/pdt2y ObjectEntity TYPE(pdt2(k=2_4)) type(pdt2(2)) :: pdt2y print *, "compiled" end program diff --git a/flang/test/Semantics/symbol16.f90 b/flang/test/Semantics/symbol16.f90 index 7a46092c36b53..547c4624d4cdb 100644 --- a/flang/test/Semantics/symbol16.f90 +++ b/flang/test/Semantics/symbol16.f90 @@ -1,18 +1,18 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 ! 
Statement functions -!DEF: /p1 MainProgram -program p1 - !DEF: /p1/f (Function, StmtFunction) Subprogram INTEGER(4) - !DEF: /p1/i ObjectEntity INTEGER(4) - !DEF: /p1/j ObjectEntity INTEGER(4) +!DEF: /P1 MainProgram +program P1 + !DEF: /P1/f (Function, StmtFunction) Subprogram INTEGER(4) + !DEF: /P1/i ObjectEntity INTEGER(4) + !DEF: /P1/j ObjectEntity INTEGER(4) integer f, i, j - !REF: /p1/f - !REF: /p1/i - !DEF: /p1/f/i ObjectEntity INTEGER(4) + !REF: /P1/f + !REF: /P1/i + !DEF: /P1/f/i ObjectEntity INTEGER(4) f(i) = i + 1 - !REF: /p1/j - !REF: /p1/f + !REF: /P1/j + !REF: /P1/f j = f(2) end program diff --git a/flang/test/Semantics/symbol17.f90 b/flang/test/Semantics/symbol17.f90 index 434f124509a32..a0d916e55cfa4 100644 --- a/flang/test/Semantics/symbol17.f90 +++ b/flang/test/Semantics/symbol17.f90 @@ -1,44 +1,44 @@ ! RUN: %python %S/test_symbols.py %s %flang_fc1 ! Forward references to derived types (non-error cases) -!DEF: /main MainProgram -program main - !DEF: /main/t1 DerivedType +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/t1 DerivedType type :: t1 - !DEF: /main/t2 DerivedType - !DEF: /main/t1/t1a ALLOCATABLE ObjectEntity TYPE(t2) + !DEF: /MAIN/t2 DerivedType + !DEF: /MAIN/t1/t1a ALLOCATABLE ObjectEntity TYPE(t2) type(t2), allocatable :: t1a - !REF: /main/t2 - !DEF: /main/t1/t1p POINTER ObjectEntity TYPE(t2) + !REF: /MAIN/t2 + !DEF: /MAIN/t1/t1p POINTER ObjectEntity TYPE(t2) type(t2), pointer :: t1p end type - !REF: /main/t2 + !REF: /MAIN/t2 type :: t2 - !REF: /main/t2 - !DEF: /main/t2/t2a ALLOCATABLE ObjectEntity TYPE(t2) + !REF: /MAIN/t2 + !DEF: /MAIN/t2/t2a ALLOCATABLE ObjectEntity TYPE(t2) type(t2), allocatable :: t2a - !REF: /main/t2 - !DEF: /main/t2/t2p POINTER ObjectEntity TYPE(t2) + !REF: /MAIN/t2 + !DEF: /MAIN/t2/t2p POINTER ObjectEntity TYPE(t2) type(t2), pointer :: t2p end type - !REF: /main/t1 - !DEF: /main/t1x TARGET ObjectEntity TYPE(t1) + !REF: /MAIN/t1 + !DEF: /MAIN/t1x TARGET ObjectEntity TYPE(t1) type(t1), target :: t1x - !REF: 
/main/t1x - !REF: /main/t1/t1a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1a allocate(t1x%t1a) - !REF: /main/t1x - !REF: /main/t1/t1p - !REF: /main/t1/t1a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1p + !REF: /MAIN/t1/t1a t1x%t1p => t1x%t1a - !REF: /main/t1x - !REF: /main/t1/t1a - !REF: /main/t2/t2a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1a + !REF: /MAIN/t2/t2a allocate(t1x%t1a%t2a) - !REF: /main/t1x - !REF: /main/t1/t1a - !REF: /main/t2/t2p - !REF: /main/t2/t2a + !REF: /MAIN/t1x + !REF: /MAIN/t1/t1a + !REF: /MAIN/t2/t2p + !REF: /MAIN/t2/t2a t1x%t1a%t2p => t1x%t1a%t2a end program !DEF: /f1/fwd DerivedType diff --git a/flang/test/Semantics/symbol18.f90 b/flang/test/Semantics/symbol18.f90 index a37792bce21d7..6e41bb5db91ee 100644 --- a/flang/test/Semantics/symbol18.f90 +++ b/flang/test/Semantics/symbol18.f90 @@ -2,21 +2,21 @@ ! Intrinsic function in type declaration statement: type is ignored -!DEF: /p1 MainProgram -program p1 - !DEF: /p1/cos ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity INTEGER(4) +!DEF: /P1 MainProgram +program P1 + !DEF: /P1/cos ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity INTEGER(4) integer cos - !DEF: /p1/y (Implicit) ObjectEntity REAL(4) - !REF: /p1/cos - !DEF: /p1/x (Implicit) ObjectEntity REAL(4) + !DEF: /P1/y (Implicit) ObjectEntity REAL(4) + !REF: /P1/cos + !DEF: /P1/x (Implicit) ObjectEntity REAL(4) y = cos(x) - !REF: /p1/y - !DEF: /p1/sin ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity - !REF: /p1/x + !REF: /P1/y + !DEF: /P1/sin ELEMENTAL, INTRINSIC, PURE (Function) ProcEntity + !REF: /P1/x y = sin(x) - !REF: /p1/y + !REF: /P1/y !DEF: /f EXTERNAL (Function, Implicit) ProcEntity REAL(4) - !REF: /p1/x + !REF: /P1/x y = f(x) end program diff --git a/flang/test/Semantics/symbol20.f90 b/flang/test/Semantics/symbol20.f90 index 8c82776933321..bf3aff489b3b9 100644 --- a/flang/test/Semantics/symbol20.f90 +++ b/flang/test/Semantics/symbol20.f90 @@ -32,16 +32,16 @@ subroutine bar print *, "in bar" end subroutine end module -!DEF: /demo MainProgram -program 
demo +!DEF: /DEMO MainProgram +program DEMO !REF: /m use :: m - !DEF: /demo/bar (Subroutine) Use - !DEF: /demo/p EXTERNAL, POINTER (Subroutine) ProcEntity + !DEF: /DEMO/bar (Subroutine) Use + !DEF: /DEMO/p EXTERNAL, POINTER (Subroutine) ProcEntity procedure(bar), pointer :: p - !REF: /demo/p - !DEF: /demo/foo (Function) Use + !REF: /DEMO/p + !DEF: /DEMO/foo (Function) Use p => foo() - !REF: /demo/p + !REF: /DEMO/p call p end program diff --git a/flang/test/Semantics/symbol25.f90 b/flang/test/Semantics/symbol25.f90 index ac3dd37ef92eb..ac47a19eae8cc 100644 --- a/flang/test/Semantics/symbol25.f90 +++ b/flang/test/Semantics/symbol25.f90 @@ -38,23 +38,23 @@ subroutine inner1 end subroutine inner1 end subroutine outer end module m -!DEF: /main MainProgram -program main +!DEF: /MAIN MainProgram +program MAIN !REF: /m use :: m !REF: /m/specific1 call generic - !DEF: /main/inner2 (Subroutine) Subprogram + !DEF: /MAIN/inner2 (Subroutine) Subprogram call inner2 contains - !REF: /main/inner2 + !REF: /MAIN/inner2 subroutine inner2 - !DEF: /main/inner2/generic (Subroutine) Generic + !DEF: /MAIN/inner2/generic (Subroutine) Generic interface generic - !DEF: /main/specific2 (Subroutine) Use + !DEF: /MAIN/specific2 (Subroutine) Use module procedure :: specific2 end interface - !REF: /main/specific2 + !REF: /MAIN/specific2 call generic end subroutine inner2 end program diff --git a/flang/test/Semantics/symbol26.f90 b/flang/test/Semantics/symbol26.f90 index f5e95853ca099..dded4b632c654 100644 --- a/flang/test/Semantics/symbol26.f90 +++ b/flang/test/Semantics/symbol26.f90 @@ -8,16 +8,16 @@ module m !DEF: /m/j PUBLIC (Implicit, InNamelist) ObjectEntity INTEGER(4) namelist/a/j end module m -!DEF: /main MainProgram -program main - !DEF: /main/j (Implicit) ObjectEntity INTEGER(4) +!DEF: /MAIN MainProgram +program MAIN + !DEF: /MAIN/j (Implicit) ObjectEntity INTEGER(4) j = 1 contains - !DEF: /main/inner (Subroutine) Subprogram + !DEF: /MAIN/inner (Subroutine) Subprogram subroutine inner 
!REF: /m use :: m - !DEF: /main/inner/j (Implicit, InNamelist) Use INTEGER(4) + !DEF: /MAIN/inner/j (Implicit, InNamelist) Use INTEGER(4) j = 2 end subroutine end program diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 index 12f63031cbaee..6f24b346e3fb9 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -5,7 +5,7 @@ ! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-to-openmp=host %s -o - \ ! RUN: | FileCheck %s -! CHECK-LABEL: do_concurrent_basic +! CHECK-LABEL: DO_CONCURRENT_BASIC program do_concurrent_basic ! CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) From 2c0c87be1258c36a177bfd47f272f8dffca366f4 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 17 Jul 2025 11:28:27 -0700 Subject: [PATCH 221/813] Speculative buildbot fix. --- compiler-rt/test/lit.common.configured.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index 04d1a4df5a54f..2b4b72bc895c5 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -10,7 +10,7 @@ set_default("target_triple", "@COMPILER_RT_DEFAULT_TARGET_TRIPLE@") set_default("target_cflags", "@COMPILER_RT_TEST_COMPILER_CFLAGS@") set_default("host_arch", "@HOST_ARCH@") set_default("target_arch", "@COMPILER_RT_DEFAULT_TARGET_ARCH@") -set_default("host_os", "@HOST_OS@") +set_default("target_os", "@HOST_OS@") set_default("llvm_build_mode", "@LLVM_BUILD_MODE@") set_default("llvm_src_root", "@LLVM_MAIN_SRC_DIR@") set_default("llvm_obj_root", "@LLVM_BINARY_DIR@") From 73d4cea68cce998b1349e3820dc5d80e1096b015 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 17 Jul 2025 13:41:04 -0500 Subject: [PATCH 222/813] [flang][OpenMP] Generalize 
isOpenMPPrivatizingConstruct (#148654) Instead of treating all block and all loop constructs as privatizing, actually check if the construct allows a privatizing clause. --- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 57 +++++++++++++++---- flang/lib/Lower/OpenMP/DataSharingProcessor.h | 12 +++- flang/test/Lower/OpenMP/taskgroup02.f90 | 5 +- llvm/include/llvm/Frontend/OpenMP/OMP.h | 4 ++ 4 files changed, 61 insertions(+), 17 deletions(-) diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 3fae3f3a0ddfd..675a58e4f35a1 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -26,6 +26,8 @@ #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Semantics/attr.h" #include "flang/Semantics/tools.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallSet.h" namespace Fortran { namespace lower { @@ -49,7 +51,7 @@ DataSharingProcessor::DataSharingProcessor( firOpBuilder(converter.getFirOpBuilder()), clauses(clauses), eval(eval), shouldCollectPreDeterminedSymbols(shouldCollectPreDeterminedSymbols), useDelayedPrivatization(useDelayedPrivatization), symTable(symTable), - visitor() { + visitor(semaCtx) { eval.visit([&](const auto &functionParserNode) { parser::Walk(functionParserNode, visitor); }); @@ -424,24 +426,55 @@ getSource(const semantics::SemanticsContext &semaCtx, return source; } +static void collectPrivatizingConstructs( + llvm::SmallSet &constructs, unsigned version) { + using Clause = llvm::omp::Clause; + using Directive = llvm::omp::Directive; + + static const Clause privatizingClauses[] = { + Clause::OMPC_private, + Clause::OMPC_lastprivate, + Clause::OMPC_firstprivate, + Clause::OMPC_in_reduction, + Clause::OMPC_reduction, + Clause::OMPC_linear, + // TODO: Clause::OMPC_induction, + Clause::OMPC_task_reduction, + Clause::OMPC_detach, + Clause::OMPC_use_device_ptr, + Clause::OMPC_is_device_ptr, + }; + + for (auto dir : 
llvm::enum_seq_inclusive(Directive::First_, + Directive::Last_)) { + bool allowsPrivatizing = llvm::any_of(privatizingClauses, [&](Clause cls) { + return llvm::omp::isAllowedClauseForDirective(dir, cls, version); + }); + if (allowsPrivatizing) + constructs.insert(dir); + } +} + bool DataSharingProcessor::isOpenMPPrivatizingConstruct( - const parser::OpenMPConstruct &omp) { - return common::visit( - [](auto &&s) { - using BareS = llvm::remove_cvref_t; - return std::is_same_v || - std::is_same_v || - std::is_same_v; - }, - omp.u); + const parser::OpenMPConstruct &omp, unsigned version) { + static llvm::SmallSet privatizing; + [[maybe_unused]] static bool init = + (collectPrivatizingConstructs(privatizing, version), true); + + // As of OpenMP 6.0, privatizing constructs (with the test being if they + // allow a privatizing clause) are: dispatch, distribute, do, for, loop, + // parallel, scope, sections, simd, single, target, target_data, task, + // taskgroup, taskloop, and teams. + return llvm::is_contained(privatizing, extractOmpDirective(omp)); } bool DataSharingProcessor::isOpenMPPrivatizingEvaluation( const pft::Evaluation &eval) const { - return eval.visit([](auto &&s) { + unsigned version = semaCtx.langOptions().OpenMPVersion; + return eval.visit([=](auto &&s) { using BareS = llvm::remove_cvref_t; if constexpr (std::is_same_v) { - return isOpenMPPrivatizingConstruct(s); + return isOpenMPPrivatizingConstruct(s, version); } else { return false; } diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h index ee2fc70d2e673..bc422f410403a 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -36,6 +36,8 @@ class DataSharingProcessor { /// at any point in time. This is used to track Symbol definition scopes in /// order to tell which OMP scope defined vs. references a certain Symbol. 
struct OMPConstructSymbolVisitor { + OMPConstructSymbolVisitor(semantics::SemanticsContext &ctx) + : version(ctx.langOptions().OpenMPVersion) {} template bool Pre(const T &) { return true; @@ -45,13 +47,13 @@ class DataSharingProcessor { bool Pre(const parser::OpenMPConstruct &omp) { // Skip constructs that may not have privatizations. - if (isOpenMPPrivatizingConstruct(omp)) + if (isOpenMPPrivatizingConstruct(omp, version)) constructs.push_back(&omp); return true; } void Post(const parser::OpenMPConstruct &omp) { - if (isOpenMPPrivatizingConstruct(omp)) + if (isOpenMPPrivatizingConstruct(omp, version)) constructs.pop_back(); } @@ -68,6 +70,9 @@ class DataSharingProcessor { /// construct that defines symbol. bool isSymbolDefineBy(const semantics::Symbol *symbol, lower::pft::Evaluation &eval) const; + + private: + unsigned version; }; mlir::OpBuilder::InsertPoint lastPrivIP; @@ -115,7 +120,8 @@ class DataSharingProcessor { mlir::OpBuilder::InsertPoint *lastPrivIP); void insertDeallocs(); - static bool isOpenMPPrivatizingConstruct(const parser::OpenMPConstruct &omp); + static bool isOpenMPPrivatizingConstruct(const parser::OpenMPConstruct &omp, + unsigned version); bool isOpenMPPrivatizingEvaluation(const pft::Evaluation &eval) const; public: diff --git a/flang/test/Lower/OpenMP/taskgroup02.f90 b/flang/test/Lower/OpenMP/taskgroup02.f90 index 1e996a030c23a..4c470b7aa82d1 100644 --- a/flang/test/Lower/OpenMP/taskgroup02.f90 +++ b/flang/test/Lower/OpenMP/taskgroup02.f90 @@ -3,8 +3,9 @@ ! Check that variables are not privatized twice when TASKGROUP is used. 
!CHECK-LABEL: func.func @_QPsub() { -!CHECK: omp.parallel { -!CHECK: %[[PAR_I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsubEi"} +!CHECK: omp.parallel private(@_QFsubEi_private_i32 %[[SUB_I:.*]]#0 -> %[[ARG:.*]] : !fir.ref) +!CHECK: %[[ALLOCA:.*]] = fir.alloca i32 +!CHECK: %[[PAR_I:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFsubEi"} !CHECK: omp.master { !CHECK: omp.taskgroup { !CHECK-NEXT: omp.task private(@_QFsubEi_firstprivate_i32 %[[PAR_I]]#0 -> %[[TASK_I:.*]] : !fir.ref) { diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.h b/llvm/include/llvm/Frontend/OpenMP/OMP.h index d44c33301bde7..9d0a55432e1ae 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.h @@ -51,13 +51,17 @@ static constexpr inline bool canHaveIterator(Clause C) { // Can clause C create a private copy of a variable. static constexpr inline bool isPrivatizingClause(Clause C) { switch (C) { + case OMPC_detach: case OMPC_firstprivate: + // TODO case OMPC_induction: case OMPC_in_reduction: + case OMPC_is_device_ptr: case OMPC_lastprivate: case OMPC_linear: case OMPC_private: case OMPC_reduction: case OMPC_task_reduction: + case OMPC_use_device_ptr: return true; default: return false; From fd5fc76c91538871771be2c3be2ca3a5f2dcac31 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 17 Jul 2025 14:43:34 -0400 Subject: [PATCH 223/813] [AMDGPU] Add support for `v_cos_bf16` on gfx1250 (#149355) Co-authored-by: Mekhanoshin, Stanislav --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19 ++++++ llvm/lib/Target/AMDGPU/VOP1Instructions.td | 2 + .../CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll | 33 ++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 45 +++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 48 ++++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 56 ++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 60 
+++++++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 12 ++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 16 +++++ .../gfx1250_asm_vop3_from_vop1-fake16.s | 45 +++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 48 ++++++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 56 ++++++++++++++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 60 +++++++++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 16 +++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 20 ++++++ .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 63 ++++++++++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 59 +++++++++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 15 +++++ .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 64 +++++++++++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 60 +++++++++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 20 ++++++ 23 files changed, 819 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index e7a45f0e4300d..3b6ad7d90be3c 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -675,6 +675,7 @@ TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_exp2_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_sin_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_cos_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 32cf622f20605..9f48149354255 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ 
b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -433,6 +433,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sin); case AMDGPU::BI__builtin_amdgcn_cosf: case AMDGPU::BI__builtin_amdgcn_cosh: + case AMDGPU::BI__builtin_amdgcn_cos_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_cos); case AMDGPU::BI__builtin_amdgcn_dispatch_ptr: return EmitAMDGPUDispatchPtr(*this, E); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 748b6455103ec..a1f984c129276 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -156,6 +156,25 @@ void test_sin_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_sin_bf16(a); } +// CHECK-LABEL: @test_cos_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.cos.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2 +// CHECK-NEXT: ret void +// +void test_cos_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_cos_bf16(a); +} + // CHECK-LABEL: @test_cvt_f16_fp8( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, 
addrspace(5) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index c91319eae7218..ff89b8badeed0 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -535,6 +535,7 @@ defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>; defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>; defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>; defm V_SIN_BF16 : VOP1Inst_t16 <"v_sin_bf16", VOP_BF16_BF16, AMDGPUsin>; +defm V_COS_BF16 : VOP1Inst_t16 <"v_cos_bf16", VOP_BF16_BF16, AMDGPUcos>; } } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>; @@ -1149,6 +1150,7 @@ defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>; defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>; defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>; defm V_SIN_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07e>; +defm V_COS_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07f>; //===----------------------------------------------------------------------===// // GFX10. 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll new file mode 100644 index 0000000000000..091859f3c9bf3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.bf16.ll @@ -0,0 +1,33 @@ +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s + +; FIXME: GlobalISel does not work with bf16 + +declare bfloat @llvm.amdgcn.cos.bf16(bfloat) #0 + +; GCN-LABEL: {{^}}cos_bf16: +; GCN: v_cos_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}} +define amdgpu_kernel void @cos_bf16(ptr addrspace(1) %out, bfloat %src) #1 { + %cos = call bfloat @llvm.amdgcn.cos.bf16(bfloat %src) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}cos_bf16_constant_4 +; GCN: v_cos_bf16_e32 v0, 4.0 +define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 { + %cos = call bfloat @llvm.amdgcn.cos.bf16(bfloat 4.0) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +; GCN-LABEL: {{^}}cos_bf16_constant_100 +; GCN: v_cos_bf16_e32 {{v[0-9]+}}, 0x42c8 +define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 { + %cos = call bfloat @llvm.amdgcn.cos.bf16(bfloat 100.0) #0 + store bfloat %cos, ptr addrspace(1) %out, align 2 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index f51d709a594a0..4b61064815ed5 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -343,6 +343,51 @@ v_sin_bf16 v5, src_scc v_sin_bf16 v127, 0x8000 // GFX1250: v_sin_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfc,0xfe,0x7e,0x00,0x80,0x00,0x00] +v_cos_bf16 v5, v1 +// GFX1250: v_cos_bf16_e32 v5, v1 ; encoding: [0x01,0xff,0x0a,0x7e] + +v_cos_bf16 v5, v127 +// GFX1250: 
v_cos_bf16_e32 v5, v127 ; encoding: [0x7f,0xff,0x0a,0x7e] + +v_cos_bf16 v5, s1 +// GFX1250: v_cos_bf16_e32 v5, s1 ; encoding: [0x01,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, s105 +// GFX1250: v_cos_bf16_e32 v5, s105 ; encoding: [0x69,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, vcc_lo +// GFX1250: v_cos_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, vcc_hi +// GFX1250: v_cos_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, ttmp15 +// GFX1250: v_cos_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, m0 +// GFX1250: v_cos_bf16_e32 v5, m0 ; encoding: [0x7d,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, exec_lo +// GFX1250: v_cos_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, exec_hi +// GFX1250: v_cos_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, null +// GFX1250: v_cos_bf16_e32 v5, null ; encoding: [0x7c,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, -1 +// GFX1250: v_cos_bf16_e32 v5, -1 ; encoding: [0xc1,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, 0.5 +// GFX1250: v_cos_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, src_scc +// GFX1250: v_cos_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfe,0x0a,0x7e] + +v_cos_bf16 v127, 0x8000 +// GFX1250: v_cos_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00] + v_cvt_f32_bf16 v5, v1 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index 39fc73d70cab2..40901618fce95 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -364,6 +364,54 @@ v_sin_bf16 v127, 0x8000 v_sin_bf16 v5.h, v1.h // GFX1250: v_sin_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfd,0x0a,0x7f] +v_cos_bf16 v5, v1 +// GFX1250: v_cos_bf16_e32 v5, v1 ; encoding: [0x01,0xff,0x0a,0x7e] + +v_cos_bf16 v5, v127 +// GFX1250: v_cos_bf16_e32 v5, v127 ; encoding: [0x7f,0xff,0x0a,0x7e] + +v_cos_bf16 v5, s1 +// GFX1250: 
v_cos_bf16_e32 v5, s1 ; encoding: [0x01,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, s105 +// GFX1250: v_cos_bf16_e32 v5, s105 ; encoding: [0x69,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, vcc_lo +// GFX1250: v_cos_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, vcc_hi +// GFX1250: v_cos_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, ttmp15 +// GFX1250: v_cos_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, m0 +// GFX1250: v_cos_bf16_e32 v5, m0 ; encoding: [0x7d,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, exec_lo +// GFX1250: v_cos_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, exec_hi +// GFX1250: v_cos_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, null +// GFX1250: v_cos_bf16_e32 v5, null ; encoding: [0x7c,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, -1 +// GFX1250: v_cos_bf16_e32 v5, -1 ; encoding: [0xc1,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, 0.5 +// GFX1250: v_cos_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfe,0x0a,0x7e] + +v_cos_bf16 v5, src_scc +// GFX1250: v_cos_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfe,0x0a,0x7e] + +v_cos_bf16 v127, 0x8000 +// GFX1250: v_cos_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00] + +v_cos_bf16 v5.h, v1.h +// GFX1250: v_cos_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xff,0x0a,0x7f] + v_cvt_f32_bf16 v5, v1 // GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s index 97058eb2e7c9f..ab5d55fad49ac 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s @@ -394,6 +394,62 @@ v_sin_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi // GFX1250: v_sin_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfc,0xfe,0x7e,0x7f,0x6f,0x35,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not 
supported on this GPU +v_cos_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_cos_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_mirror +// GFX1250: v_cos_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_half_mirror +// GFX1250: v_cos_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shl:1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shl:15 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shr:1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shr:15 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_ror:1 +// GFX1250: 
v_cos_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_ror:15 +// GFX1250: v_cos_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cos_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_cos_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_cos_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s index 6a293c19a79a4..dcb613c09a62d 100644 --- 
a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s @@ -422,6 +422,66 @@ v_sin_bf16 v5.h, v1.h quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7f,0x81,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_cos_bf16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_cos_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_mirror +// GFX1250: v_cos_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_half_mirror +// GFX1250: v_cos_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shl:1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shl:15 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shr:1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_shr:15 +// GFX1250: v_cos_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_ror:1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_ror:15 +// GFX1250: v_cos_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cos_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_cos_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_cos_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_cos_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe4,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s index d1f53c7b2065c..4b37d648a928c 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -86,6 +86,18 @@ v_sin_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_sin_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfc,0xfe,0x7e,0x7f,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_cos_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cos_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s 
b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s index dbee9f39df5f5..34489a1133abe 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -114,6 +114,22 @@ v_sin_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sin_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7f,0x81,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_cos_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cos_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe4,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 4257334444244..a61f1da5040d9 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -397,6 +397,51 @@ v_sin_bf16_e64 v5, src_scc mul:4 v_sin_bf16_e64 
v255, -|0x8000| clamp div:2 // GFX1250: v_sin_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xfe,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_cos_bf16_e64 v5, v1 +// GFX1250: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] + +v_cos_bf16_e64 v5, v255 +// GFX1250: v_cos_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00] + +v_cos_bf16_e64 v5, s1 +// GFX1250: v_cos_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, s105 +// GFX1250: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, vcc_lo +// GFX1250: v_cos_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, vcc_hi +// GFX1250: v_cos_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, ttmp15 +// GFX1250: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, m0 +// GFX1250: v_cos_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, exec_lo +// GFX1250: v_cos_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, exec_hi +// GFX1250: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, null +// GFX1250: v_cos_bf16_e64 v5, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, -1 +// GFX1250: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] + +v_cos_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] + +v_cos_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: 
[0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 83986a61fd572..dbd1552b84ac2 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -418,6 +418,54 @@ v_sin_bf16_e64 v255, -|0x8000| clamp div:2 v_sin_bf16 v5.h, v128.h // GFX1250: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] +v_cos_bf16_e64 v5, v1 +// GFX1250: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] + +v_cos_bf16_e64 v5, v255 +// GFX1250: v_cos_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00] + +v_cos_bf16_e64 v5, s1 +// GFX1250: v_cos_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, s105 +// GFX1250: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, vcc_lo +// GFX1250: v_cos_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, vcc_hi +// GFX1250: v_cos_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, ttmp15 +// GFX1250: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, m0 +// GFX1250: v_cos_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, exec_lo +// GFX1250: v_cos_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, exec_hi +// GFX1250: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, null +// GFX1250: v_cos_bf16_e64 v5, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, -1 +// GFX1250: 
v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] + +v_cos_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] + +v_cos_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] + +v_cos_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +v_cos_bf16_e64 v5.h, v128.h +// GFX1250: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00] + v_cvt_f32_bf16_e64 v5, v1 // GFX1250: v_cvt_f32_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf2,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index bb6739ec312a5..22ad29a7a8d05 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -394,6 +394,62 @@ v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask // GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfe,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not 
supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index 5f6f28e0f6edb..04cf346797845 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ 
b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -422,6 +422,66 @@ v_sin_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_mirror +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 037e7d650ad73..3ec947575f53a 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -114,6 +114,22 @@ v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_sin_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfe,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: 
v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 53fb0eb4e9517..643731f6d46e7 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -142,6 +142,26 @@ v_sin_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: 
v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cos_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index fec2207d70a8e..05c18cbf724ba 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -470,6 +470,69 @@ 0x81,0xfd,0x0a,0x7f # GFX1250-REAL16: v_sin_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xfd,0x0a,0x7f] +0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e32 v127, 0x8000 ; encoding: 
[0xff,0xfe,0xfe,0x7e,0x00,0x80,0x00,0x00] + +0xc1,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, -1 ; encoding: [0xc1,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, -1 ; encoding: [0xc1,0xfe,0x0a,0x7e] + +0xf0,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, 0.5 ; encoding: [0xf0,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xfe,0x0a,0x7e] + +0x7f,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, exec_hi ; encoding: [0x7f,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xfe,0x0a,0x7e] + +0x7e,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, exec_lo ; encoding: [0x7e,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xfe,0x0a,0x7e] + +0x7d,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, m0 ; encoding: [0x7d,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, m0 ; encoding: [0x7d,0xfe,0x0a,0x7e] + +0x7c,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, null ; encoding: [0x7c,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, null ; encoding: [0x7c,0xfe,0x0a,0x7e] + +0x01,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, s1 ; encoding: [0x01,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, s1 ; encoding: [0x01,0xfe,0x0a,0x7e] + +0x69,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, s105 ; encoding: [0x69,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, s105 ; encoding: [0x69,0xfe,0x0a,0x7e] + +0xfd,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, src_scc ; encoding: [0xfd,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, src_scc ; encoding: [0xfd,0xfe,0x0a,0x7e] + +0x7b,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, ttmp15 ; encoding: [0x7b,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xfe,0x0a,0x7e] + +0x01,0xff,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, v1.l ; encoding: [0x01,0xff,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, v1 ; encoding: 
[0x01,0xff,0x0a,0x7e] + +0x7f,0xff,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, v127.l ; encoding: [0x7f,0xff,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, v127 ; encoding: [0x7f,0xff,0x0a,0x7e] + +0x6b,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, vcc_hi ; encoding: [0x6b,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xfe,0x0a,0x7e] + +0x6a,0xfe,0x0a,0x7e +# GFX1250-REAL16: v_cos_bf16_e32 v5.l, vcc_lo ; encoding: [0x6a,0xfe,0x0a,0x7e] +# GFX1250-FAKE16: v_cos_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xfe,0x0a,0x7e] + +0x81,0xff,0x0a,0x7f +# GFX1250-REAL16: v_cos_bf16_e32 v5.h, v1.h ; encoding: [0x81,0xff,0x0a,0x7f] + 0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00 # GFX1250: v_cvt_f32_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xe4,0xfe,0x7e,0x00,0x80,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt index dc8c6b15dd1bb..2aad85e5ac539 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -415,6 +415,65 @@ 0xfa,0xfc,0x0a,0x7f,0x81,0x1b,0x00,0xff # GFX1250-REAL16: v_sin_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfc,0x0a,0x7f,0x81,0x1b,0x00,0xff] +0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_cos_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_cos_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0xe4,0x00,0xff] 
+ +0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l 
row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0xfe,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0xfe,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_cos_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xfe,0x0a,0x7f,0x81,0x1b,0x00,0xff] + 0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30 # GFX1250: v_cvt_f32_bf16_dpp v127, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xe4,0xfe,0x7e,0x7f,0x6f,0x35,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index 741bf3fd34d32..f67e104c7dc20 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -110,6 +110,21 @@ 0xe9,0xfc,0x0a,0x7f,0x81,0x77,0x39,0x05 # GFX1250-REAL16: v_sin_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfc,0x0a,0x7f,0x81,0x77,0x39,0x05] +0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_cos_bf16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cos_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0xfe,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_cos_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7f,0x81,0x77,0x39,0x05] + 0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf16_dpp v127, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xe4,0xfe,0x7e,0x7f,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index cd9b7120ca966..641e0872eafe8 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -450,6 +450,70 @@ # GFX1250-REAL16: v_sin_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xfe,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_sin_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xfe,0xd5,0x80,0x01,0x00,0x00] +0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xff,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xff,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xff,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xff,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xff,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xff,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, null ; encoding: [0x05,0x00,0xff,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xff,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xff,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xff,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xff,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xff,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xff,0xd5,0x6b,0x00,0x00,0x00] + 
+0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xff,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xff,0xd5,0x80,0x01,0x00,0x00] + 0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index ed07393d18b18..0314ab3b59718 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -242,6 +242,66 @@ # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfe,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xff,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: 
v_cos_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + 0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index a6d6713c1b00d..ead589195ff50 100644 --- 
a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -82,6 +82,26 @@ # GFX1250-REAL16: v_sin_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_sin_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfe,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xff,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xff,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + 
+0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + 0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] From d994487db780d5b3ec4286391598684d99e9c9c3 Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Thu, 17 Jul 2025 12:03:56 -0700 Subject: [PATCH 224/813] [IR2Vec] Add llvm-ir2vec tool for generating triplet embeddings (#147842) Add a new LLVM tool `llvm-ir2vec`. This tool is primarily intended to generate triplets for training the vocabulary (#141834) and to potentially generate the embeddings in a stand alone manner. This PR introduces the tool with triplet generation functionality. In the upcoming PRs I'll add scripts under `utils/mlgo` to complete the vocabulary tooling. #147844 adds embedding generation logic to the tool. 
(Tracking issue - #141817) --- llvm/test/CMakeLists.txt | 1 + llvm/test/lit.cfg.py | 1 + llvm/test/tools/llvm-ir2vec/triplets.ll | 38 ++++++ llvm/tools/llvm-ir2vec/CMakeLists.txt | 10 ++ llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 154 ++++++++++++++++++++++++ 5 files changed, 204 insertions(+) create mode 100644 llvm/test/tools/llvm-ir2vec/triplets.ll create mode 100644 llvm/tools/llvm-ir2vec/CMakeLists.txt create mode 100644 llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 2a6135da9a61e..3426b6ff8d24d 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -97,6 +97,7 @@ set(LLVM_TEST_DEPENDS llvm-exegesis llvm-extract llvm-gsymutil + llvm-ir2vec llvm-isel-fuzzer llvm-ifs llvm-install-name-tool diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 1076456a4aef0..672382364a8ec 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -197,6 +197,7 @@ def get_asan_rtlib(): "llvm-dlltool", "llvm-exegesis", "llvm-extract", + "llvm-ir2vec", "llvm-isel-fuzzer", "llvm-ifs", "llvm-install-name-tool", diff --git a/llvm/test/tools/llvm-ir2vec/triplets.ll b/llvm/test/tools/llvm-ir2vec/triplets.ll new file mode 100644 index 0000000000000..fa5aaa895406f --- /dev/null +++ b/llvm/test/tools/llvm-ir2vec/triplets.ll @@ -0,0 +1,38 @@ +; RUN: llvm-ir2vec %s | FileCheck %s -check-prefix=TRIPLETS + +define i32 @simple_add(i32 %a, i32 %b) { +entry: + %add = add i32 %a, %b + ret i32 %add +} + +define i32 @simple_mul(i32 %x, i32 %y) { +entry: + %mul = mul i32 %x, %y + ret i32 %mul +} + +define i32 @test_function(i32 %arg1, i32 %arg2) { +entry: + %local1 = alloca i32, align 4 + %local2 = alloca i32, align 4 + store i32 %arg1, ptr %local1, align 4 + store i32 %arg2, ptr %local2, align 4 + %load1 = load i32, ptr %local1, align 4 + %load2 = load i32, ptr %local2, align 4 + %result = add i32 %load1, %load2 + ret i32 %result +} + +; TRIPLETS: Add IntegerTy Variable Variable +; TRIPLETS-NEXT: Ret VoidTy 
Variable +; TRIPLETS-NEXT: Mul IntegerTy Variable Variable +; TRIPLETS-NEXT: Ret VoidTy Variable +; TRIPLETS-NEXT: Alloca PointerTy Constant +; TRIPLETS-NEXT: Alloca PointerTy Constant +; TRIPLETS-NEXT: Store VoidTy Variable Pointer +; TRIPLETS-NEXT: Store VoidTy Variable Pointer +; TRIPLETS-NEXT: Load IntegerTy Pointer +; TRIPLETS-NEXT: Load IntegerTy Pointer +; TRIPLETS-NEXT: Add IntegerTy Variable Variable +; TRIPLETS-NEXT: Ret VoidTy Variable diff --git a/llvm/tools/llvm-ir2vec/CMakeLists.txt b/llvm/tools/llvm-ir2vec/CMakeLists.txt new file mode 100644 index 0000000000000..a4cf9690e86b5 --- /dev/null +++ b/llvm/tools/llvm-ir2vec/CMakeLists.txt @@ -0,0 +1,10 @@ +set(LLVM_LINK_COMPONENTS + Analysis + Core + IRReader + Support + ) + +add_llvm_tool(llvm-ir2vec + llvm-ir2vec.cpp + ) diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp new file mode 100644 index 0000000000000..aef843d5d9d82 --- /dev/null +++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp @@ -0,0 +1,154 @@ +//===- llvm-ir2vec.cpp - IR2Vec Embedding Generation Tool -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file implements the IR2Vec embedding generation tool. +/// +/// Currently supports triplet generation for vocabulary training. +/// Future updates will support embedding generation using trained vocabulary. 
+/// +/// Usage: llvm-ir2vec input.bc -o triplets.txt +/// +/// TODO: Add embedding generation mode with vocabulary support +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/IR2Vec.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace ir2vec; + +#define DEBUG_TYPE "ir2vec" + +static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options"); + +static cl::opt InputFilename(cl::Positional, + cl::desc(""), + cl::Required, + cl::cat(IR2VecToolCategory)); + +static cl::opt OutputFilename("o", cl::desc("Output filename"), + cl::value_desc("filename"), + cl::init("-"), + cl::cat(IR2VecToolCategory)); + +namespace { + +/// Helper class for collecting IR information and generating triplets +class IR2VecTool { +private: + Module &M; + +public: + explicit IR2VecTool(Module &M) : M(M) {} + + /// Generate triplets for the entire module + void generateTriplets(raw_ostream &OS) const { + for (const Function &F : M) + generateTriplets(F, OS); + } + + /// Generate triplets for a single function + void generateTriplets(const Function &F, raw_ostream &OS) const { + if (F.isDeclaration()) + return; + + std::string LocalOutput; + raw_string_ostream LocalOS(LocalOutput); + + for (const BasicBlock &BB : F) + traverseBasicBlock(BB, LocalOS); + + LocalOS.flush(); + OS << LocalOutput; + } + +private: + /// Process a single basic block for triplet generation + void traverseBasicBlock(const BasicBlock &BB, raw_string_ostream &OS) const { + // Consider only non-debug and non-pseudo instructions + for 
(const auto &I : BB.instructionsWithoutDebug()) { + StringRef OpcStr = Vocabulary::getVocabKeyForOpcode(I.getOpcode()); + StringRef TypeStr = + Vocabulary::getVocabKeyForTypeID(I.getType()->getTypeID()); + + OS << '\n' << OpcStr << ' ' << TypeStr << ' '; + + LLVM_DEBUG({ + I.print(dbgs()); + dbgs() << "\n"; + I.getType()->print(dbgs()); + dbgs() << " Type\n"; + }); + + for (const Use &U : I.operands()) + OS << Vocabulary::getVocabKeyForOperandKind( + Vocabulary::getOperandKind(U.get())) + << ' '; + } + } +}; + +Error processModule(Module &M, raw_ostream &OS) { + IR2VecTool Tool(M); + Tool.generateTriplets(OS); + + return Error::success(); +} + +} // anonymous namespace + +int main(int argc, char **argv) { + InitLLVM X(argc, argv); + cl::HideUnrelatedOptions(IR2VecToolCategory); + cl::ParseCommandLineOptions( + argc, argv, + "IR2Vec - Triplet Generation Tool\n" + "Generates triplets for vocabulary training from LLVM IR.\n" + "Future updates will support embedding generation.\n\n" + "Usage:\n" + " llvm-ir2vec input.bc -o triplets.txt\n"); + + // Parse the input LLVM IR file + SMDiagnostic Err; + LLVMContext Context; + std::unique_ptr M = parseIRFile(InputFilename, Err, Context); + if (!M) { + Err.print(argv[0], errs()); + return 1; + } + + std::error_code EC; + raw_fd_ostream OS(OutputFilename, EC); + if (EC) { + errs() << "Error opening output file: " << EC.message() << "\n"; + return 1; + } + + if (Error Err = processModule(*M, OS)) { + handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EIB) { + errs() << "Error: " << EIB.message() << "\n"; + }); + return 1; + } + + return 0; +} From 70e2319e9a0f65d8cac666a16a432501261e16a8 Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Thu, 17 Jul 2025 12:06:52 -0700 Subject: [PATCH 225/813] [IR2Vec] Add embeddings mode to llvm-ir2vec tool (#147844) Add embedding generation functionality to the llvm-ir2vec tool, complementing the existing triplet generation mode. 
This change completes the IR2Vec tool by adding the embedding generation functionality, which was previously mentioned as a TODO item. The tool now supports both triplet generation for vocabulary training and embedding generation using a trained vocabulary. --- llvm/test/lit.cfg.py | 7 + llvm/test/tools/llvm-ir2vec/embeddings.ll | 73 ++++++++ llvm/test/tools/llvm-ir2vec/triplets.ll | 2 +- llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 194 ++++++++++++++++++++-- 4 files changed, 260 insertions(+), 16 deletions(-) create mode 100644 llvm/test/tools/llvm-ir2vec/embeddings.ll diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 672382364a8ec..143cc3817bd08 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -93,6 +93,13 @@ def get_asan_rtlib(): config.substitutions.append(("%exeext", config.llvm_exe_ext)) config.substitutions.append(("%llvm_src_root", config.llvm_src_root)) +# Add IR2Vec test vocabulary path substitution +config.substitutions.append( + ( + "%ir2vec_test_vocab_dir", + os.path.join(config.test_source_root, "Analysis", "IR2Vec", "Inputs"), + ) +) lli_args = [] # The target triple used by default by lli is the process target triple (some diff --git a/llvm/test/tools/llvm-ir2vec/embeddings.ll b/llvm/test/tools/llvm-ir2vec/embeddings.ll new file mode 100644 index 0000000000000..993ea865170f9 --- /dev/null +++ b/llvm/test/tools/llvm-ir2vec/embeddings.ll @@ -0,0 +1,73 @@ +; RUN: llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-DEFAULT +; RUN: llvm-ir2vec --mode=embeddings --level=func --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL +; RUN: llvm-ir2vec --mode=embeddings --level=func --function=abc --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-FUNC-LEVEL-ABC +; RUN: not llvm-ir2vec --mode=embeddings --level=func 
--function=def --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s 2>&1 | FileCheck %s -check-prefix=CHECK-FUNC-DEF +; RUN: llvm-ir2vec --mode=embeddings --level=bb --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL +; RUN: llvm-ir2vec --mode=embeddings --level=bb --function=abc_repeat --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-BB-LEVEL-ABC-REPEAT +; RUN: llvm-ir2vec --mode=embeddings --level=inst --function=abc_repeat --ir2vec-vocab-path=%ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json %s | FileCheck %s -check-prefix=CHECK-INST-LEVEL-ABC-REPEAT + +define dso_local noundef float @abc(i32 noundef %a, float noundef %b) #0 { +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca float, align 4 + store i32 %a, ptr %a.addr, align 4 + store float %b, ptr %b.addr, align 4 + %0 = load i32, ptr %a.addr, align 4 + %1 = load i32, ptr %a.addr, align 4 + %mul = mul nsw i32 %0, %1 + %conv = sitofp i32 %mul to float + %2 = load float, ptr %b.addr, align 4 + %add = fadd float %conv, %2 + ret float %add +} + +define dso_local noundef float @abc_repeat(i32 noundef %a, float noundef %b) #0 { +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca float, align 4 + store i32 %a, ptr %a.addr, align 4 + store float %b, ptr %b.addr, align 4 + %0 = load i32, ptr %a.addr, align 4 + %1 = load i32, ptr %a.addr, align 4 + %mul = mul nsw i32 %0, %1 + %conv = sitofp i32 %mul to float + %2 = load float, ptr %b.addr, align 4 + %add = fadd float %conv, %2 + ret float %add +} + +; CHECK-DEFAULT: Function: abc +; CHECK-DEFAULT-NEXT: [ 878.00 889.00 900.00 ] +; CHECK-DEFAULT-NEXT: Function: abc_repeat +; CHECK-DEFAULT-NEXT: [ 878.00 889.00 900.00 ] + +; CHECK-FUNC-LEVEL: Function: abc +; CHECK-FUNC-LEVEL-NEXT: [ 878.00 889.00 900.00 ] +; CHECK-FUNC-LEVEL-NEXT: Function: abc_repeat +; CHECK-FUNC-LEVEL-NEXT: [ 878.00 889.00 
900.00 ] + +; CHECK-FUNC-LEVEL-ABC: Function: abc +; CHECK-FUNC-LEVEL-NEXT-ABC: [ 878.00 889.00 900.00 ] + +; CHECK-FUNC-DEF: Error: Function 'def' not found + +; CHECK-BB-LEVEL: Function: abc +; CHECK-BB-LEVEL-NEXT: entry: [ 878.00 889.00 900.00 ] +; CHECK-BB-LEVEL-NEXT: Function: abc_repeat +; CHECK-BB-LEVEL-NEXT: entry: [ 878.00 889.00 900.00 ] + +; CHECK-BB-LEVEL-ABC-REPEAT: Function: abc_repeat +; CHECK-BB-LEVEL-ABC-REPEAT-NEXT: entry: [ 878.00 889.00 900.00 ] + +; CHECK-INST-LEVEL-ABC-REPEAT: Function: abc_repeat +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %a.addr = alloca i32, align 4 [ 91.00 92.00 93.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %b.addr = alloca float, align 4 [ 91.00 92.00 93.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store i32 %a, ptr %a.addr, align 4 [ 97.00 98.00 99.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: store float %b, ptr %b.addr, align 4 [ 97.00 98.00 99.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %0 = load i32, ptr %a.addr, align 4 [ 94.00 95.00 96.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %1 = load i32, ptr %a.addr, align 4 [ 94.00 95.00 96.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %mul = mul nsw i32 %0, %1 [ 49.00 50.00 51.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %conv = sitofp i32 %mul to float [ 130.00 131.00 132.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %2 = load float, ptr %b.addr, align 4 [ 94.00 95.00 96.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: %add = fadd float %conv, %2 [ 40.00 41.00 42.00 ] +; CHECK-INST-LEVEL-ABC-REPEAT-NEXT: ret float %add [ 1.00 2.00 3.00 ] diff --git a/llvm/test/tools/llvm-ir2vec/triplets.ll b/llvm/test/tools/llvm-ir2vec/triplets.ll index fa5aaa895406f..d1ef5b388e258 100644 --- a/llvm/test/tools/llvm-ir2vec/triplets.ll +++ b/llvm/test/tools/llvm-ir2vec/triplets.ll @@ -1,4 +1,4 @@ -; RUN: llvm-ir2vec %s | FileCheck %s -check-prefix=TRIPLETS +; RUN: llvm-ir2vec --mode=triplets %s | FileCheck %s -check-prefix=TRIPLETS define i32 @simple_add(i32 %a, i32 %b) { entry: diff --git 
a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp index aef843d5d9d82..d60d88596c2e0 100644 --- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp +++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp @@ -9,12 +9,18 @@ /// \file /// This file implements the IR2Vec embedding generation tool. /// -/// Currently supports triplet generation for vocabulary training. -/// Future updates will support embedding generation using trained vocabulary. +/// This tool provides two main functionalities: /// -/// Usage: llvm-ir2vec input.bc -o triplets.txt +/// 1. Triplet Generation Mode (--mode=triplets): +/// Generates triplets (opcode, type, operands) for vocabulary training. +/// Usage: llvm-ir2vec --mode=triplets input.bc -o triplets.txt /// -/// TODO: Add embedding generation mode with vocabulary support +/// 2. Embedding Generation Mode (--mode=embeddings): +/// Generates IR2Vec embeddings using a trained vocabulary. +/// Usage: llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=vocab.json +/// --level=func input.bc -o embeddings.txt Levels: --level=inst +/// (instructions), --level=bb (basic blocks), --level=func (functions) +/// (See IR2Vec.cpp for more embedding generation options) /// //===----------------------------------------------------------------------===// @@ -24,6 +30,8 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassInstrumentation.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IRReader/IRReader.h" #include "llvm/Support/CommandLine.h" @@ -33,11 +41,11 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" -using namespace llvm; -using namespace ir2vec; - #define DEBUG_TYPE "ir2vec" +namespace llvm { +namespace ir2vec { + static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options"); static cl::opt InputFilename(cl::Positional, @@ -50,16 +58,63 @@ static cl::opt OutputFilename("o", cl::desc("Output filename"), 
cl::init("-"), cl::cat(IR2VecToolCategory)); +enum ToolMode { + TripletMode, // Generate triplets for vocabulary training + EmbeddingMode // Generate embeddings using trained vocabulary +}; + +static cl::opt + Mode("mode", cl::desc("Tool operation mode:"), + cl::values(clEnumValN(TripletMode, "triplets", + "Generate triplets for vocabulary training"), + clEnumValN(EmbeddingMode, "embeddings", + "Generate embeddings using trained vocabulary")), + cl::init(EmbeddingMode), cl::cat(IR2VecToolCategory)); + +static cl::opt + FunctionName("function", cl::desc("Process specific function only"), + cl::value_desc("name"), cl::Optional, cl::init(""), + cl::cat(IR2VecToolCategory)); + +enum EmbeddingLevel { + InstructionLevel, // Generate instruction-level embeddings + BasicBlockLevel, // Generate basic block-level embeddings + FunctionLevel // Generate function-level embeddings +}; + +static cl::opt + Level("level", cl::desc("Embedding generation level (for embedding mode):"), + cl::values(clEnumValN(InstructionLevel, "inst", + "Generate instruction-level embeddings"), + clEnumValN(BasicBlockLevel, "bb", + "Generate basic block-level embeddings"), + clEnumValN(FunctionLevel, "func", + "Generate function-level embeddings")), + cl::init(FunctionLevel), cl::cat(IR2VecToolCategory)); + namespace { -/// Helper class for collecting IR information and generating triplets +/// Helper class for collecting IR triplets and generating embeddings class IR2VecTool { private: Module &M; + ModuleAnalysisManager MAM; + const Vocabulary *Vocab = nullptr; public: explicit IR2VecTool(Module &M) : M(M) {} + /// Initialize the IR2Vec vocabulary analysis + bool initializeVocabulary() { + // Register and run the IR2Vec vocabulary analysis + // The vocabulary file path is specified via --ir2vec-vocab-path global + // option + MAM.registerPass([&] { return PassInstrumentationAnalysis(); }); + MAM.registerPass([&] { return IR2VecVocabAnalysis(); }); + Vocab = &MAM.getResult(M); + return 
Vocab->isValid(); + } + /// Generate triplets for the entire module void generateTriplets(raw_ostream &OS) const { for (const Function &F : M) @@ -81,6 +136,68 @@ class IR2VecTool { OS << LocalOutput; } + /// Generate embeddings for the entire module + void generateEmbeddings(raw_ostream &OS) const { + if (!Vocab->isValid()) { + OS << "Error: Vocabulary is not valid. IR2VecTool not initialized.\n"; + return; + } + + for (const Function &F : M) + generateEmbeddings(F, OS); + } + + /// Generate embeddings for a single function + void generateEmbeddings(const Function &F, raw_ostream &OS) const { + if (F.isDeclaration()) { + OS << "Function " << F.getName() << " is a declaration, skipping.\n"; + return; + } + + // Create embedder for this function + assert(Vocab->isValid() && "Vocabulary is not valid"); + auto Emb = Embedder::create(IR2VecKind::Symbolic, F, *Vocab); + if (!Emb) { + OS << "Error: Failed to create embedder for function " << F.getName() + << "\n"; + return; + } + + OS << "Function: " << F.getName() << "\n"; + + // Generate embeddings based on the specified level + switch (Level) { + case FunctionLevel: { + Emb->getFunctionVector().print(OS); + break; + } + case BasicBlockLevel: { + const auto &BBVecMap = Emb->getBBVecMap(); + for (const BasicBlock &BB : F) { + auto It = BBVecMap.find(&BB); + if (It != BBVecMap.end()) { + OS << BB.getName() << ":"; + It->second.print(OS); + } + } + break; + } + case InstructionLevel: { + const auto &InstMap = Emb->getInstVecMap(); + for (const BasicBlock &BB : F) { + for (const Instruction &I : BB) { + auto It = InstMap.find(&I); + if (It != InstMap.end()) { + I.print(OS); + It->second.print(OS); + } + } + } + break; + } + } + } + private: /// Process a single basic block for triplet generation void traverseBasicBlock(const BasicBlock &BB, raw_string_ostream &OS) const { @@ -109,23 +226,70 @@ class IR2VecTool { Error processModule(Module &M, raw_ostream &OS) { IR2VecTool Tool(M); - Tool.generateTriplets(OS); + if (Mode == 
EmbeddingMode) { + // Initialize vocabulary for embedding generation + // Note: Requires --ir2vec-vocab-path option to be set + if (!Tool.initializeVocabulary()) + return createStringError( + errc::invalid_argument, + "Failed to initialize IR2Vec vocabulary. " + "Make sure to specify --ir2vec-vocab-path for embedding mode."); + + if (!FunctionName.empty()) { + // Process single function + if (const Function *F = M.getFunction(FunctionName)) + Tool.generateEmbeddings(*F, OS); + else + return createStringError(errc::invalid_argument, + "Function '%s' not found", + FunctionName.c_str()); + } else { + // Process all functions + Tool.generateEmbeddings(OS); + } + } else { + // Triplet generation mode - no vocabulary needed + if (!FunctionName.empty()) + // Process single function + if (const Function *F = M.getFunction(FunctionName)) + Tool.generateTriplets(*F, OS); + else + return createStringError(errc::invalid_argument, + "Function '%s' not found", + FunctionName.c_str()); + else + // Process all functions + Tool.generateTriplets(OS); + } return Error::success(); } - -} // anonymous namespace +} // namespace +} // namespace ir2vec +} // namespace llvm int main(int argc, char **argv) { + using namespace llvm; + using namespace llvm::ir2vec; + InitLLVM X(argc, argv); cl::HideUnrelatedOptions(IR2VecToolCategory); cl::ParseCommandLineOptions( argc, argv, - "IR2Vec - Triplet Generation Tool\n" - "Generates triplets for vocabulary training from LLVM IR.\n" - "Future updates will support embedding generation.\n\n" + "IR2Vec - Embedding Generation Tool\n" + "Generates embeddings for a given LLVM IR and " + "supports triplet generation for vocabulary " + "training and embedding generation.\n\n" "Usage:\n" - " llvm-ir2vec input.bc -o triplets.txt\n"); + " Triplet mode: llvm-ir2vec --mode=triplets input.bc\n" + " Embedding mode: llvm-ir2vec --mode=embeddings " + "--ir2vec-vocab-path=vocab.json --level=func input.bc\n" + " Levels: --level=inst (instructions), --level=bb (basic 
blocks), " + "--level=func (functions)\n"); + + // Validate command line options + if (Mode == TripletMode && Level.getNumOccurrences() > 0) + errs() << "Warning: --level option is ignored in triplet mode\n"; // Parse the input LLVM IR file SMDiagnostic Err; From f2956173aea4ff0fe0b823be1953d1968f91fb98 Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Thu, 17 Jul 2025 12:09:50 -0700 Subject: [PATCH 226/813] [IR2Vec] Adding documentation for llvm-ir2vec tool (#148719) Tracking issues - #141817, #141834 --- llvm/docs/CommandGuide/index.rst | 1 + llvm/docs/CommandGuide/llvm-ir2vec.rst | 170 +++++++++++++++++++++++++ llvm/docs/MLGO.rst | 12 +- llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 8 +- 4 files changed, 184 insertions(+), 7 deletions(-) create mode 100644 llvm/docs/CommandGuide/llvm-ir2vec.rst diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst index 88fc1fd326b76..f85f32a1fdd51 100644 --- a/llvm/docs/CommandGuide/index.rst +++ b/llvm/docs/CommandGuide/index.rst @@ -27,6 +27,7 @@ Basic Commands llvm-dis llvm-dwarfdump llvm-dwarfutil + llvm-ir2vec llvm-lib llvm-libtool-darwin llvm-link diff --git a/llvm/docs/CommandGuide/llvm-ir2vec.rst b/llvm/docs/CommandGuide/llvm-ir2vec.rst new file mode 100644 index 0000000000000..13fe4996b968f --- /dev/null +++ b/llvm/docs/CommandGuide/llvm-ir2vec.rst @@ -0,0 +1,170 @@ +llvm-ir2vec - IR2Vec Embedding Generation Tool +============================================== + +.. program:: llvm-ir2vec + +SYNOPSIS +-------- + +:program:`llvm-ir2vec` [*options*] *input-file* + +DESCRIPTION +----------- + +:program:`llvm-ir2vec` is a standalone command-line tool for IR2Vec. It +generates IR2Vec embeddings for LLVM IR and supports triplet generation +for vocabulary training. It provides two main operation modes: + +1. **Triplet Mode**: Generates triplets (opcode, type, operands) for vocabulary + training from LLVM IR. + +2. 
**Embedding Mode**: Generates IR2Vec embeddings using a trained vocabulary + at different granularity levels (instruction, basic block, or function). + +The tool is designed to facilitate machine learning applications that work with +LLVM IR by converting the IR into numerical representations that can be used by +ML models. + +.. note:: + + For information about using IR2Vec programmatically within LLVM passes and + the C++ API, see the `IR2Vec Embeddings `_ + section in the MLGO documentation. + +OPERATION MODES +--------------- + +Triplet Generation Mode +~~~~~~~~~~~~~~~~~~~~~~~ + +In triplet mode, :program:`llvm-ir2vec` analyzes LLVM IR and extracts triplets +consisting of opcodes, types, and operands. These triplets can be used to train +vocabularies for embedding generation. + +Usage: + +.. code-block:: bash + + llvm-ir2vec --mode=triplets input.bc -o triplets.txt + +Embedding Generation Mode +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In embedding mode, :program:`llvm-ir2vec` uses a pre-trained vocabulary to +generate numerical embeddings for LLVM IR at different levels of granularity. + +Example Usage: + +.. code-block:: bash + + llvm-ir2vec --mode=embeddings --ir2vec-vocab-path=vocab.json --level=func input.bc -o embeddings.txt + +OPTIONS +------- + +.. option:: --mode= + + Specify the operation mode. Valid values are: + + * ``triplets`` - Generate triplets for vocabulary training + * ``embeddings`` - Generate embeddings using trained vocabulary (default) + +.. option:: --level= + + Specify the embedding generation level. Valid values are: + + * ``inst`` - Generate instruction-level embeddings + * ``bb`` - Generate basic block-level embeddings + * ``func`` - Generate function-level embeddings (default) + +.. option:: --function= + + Process only the specified function instead of all functions in the module. + +.. option:: --ir2vec-vocab-path= + + Specify the path to the vocabulary file (required for embedding mode). 
+ The vocabulary file should be in JSON format and contain the trained + vocabulary for embedding generation. See `llvm/lib/Analysis/models` + for pre-trained vocabulary files. + +.. option:: --ir2vec-opc-weight= + + Specify the weight for opcode embeddings (default: 1.0). This controls + the relative importance of instruction opcodes in the final embedding. + +.. option:: --ir2vec-type-weight= + + Specify the weight for type embeddings (default: 0.5). This controls + the relative importance of type information in the final embedding. + +.. option:: --ir2vec-arg-weight= + + Specify the weight for argument embeddings (default: 0.2). This controls + the relative importance of operand information in the final embedding. + +.. option:: -o + + Specify the output filename. Use ``-`` to write to standard output (default). + +.. option:: --help + + Print a summary of command line options. + +.. note:: + + ``--level``, ``--function``, ``--ir2vec-vocab-path``, ``--ir2vec-opc-weight``, + ``--ir2vec-type-weight``, and ``--ir2vec-arg-weight`` are only used in embedding + mode. These options are ignored in triplet mode. + +INPUT FILE FORMAT +----------------- + +:program:`llvm-ir2vec` accepts LLVM bitcode files (``.bc``) and LLVM IR files +(``.ll``) as input. The input file should contain valid LLVM IR. + +OUTPUT FORMAT +------------- + +Triplet Mode Output +~~~~~~~~~~~~~~~~~~~ + +In triplet mode, the output consists of lines containing space-separated triplets: + +.. code-block:: text + + ... + +Each line represents the information of one instruction, with the opcode, type, +and operands. 
+ +Embedding Mode Output +~~~~~~~~~~~~~~~~~~~~~ + +In embedding mode, the output format depends on the specified level: + +* **Function Level**: One embedding vector per function +* **Basic Block Level**: One embedding vector per basic block, grouped by function +* **Instruction Level**: One embedding vector per instruction, grouped by basic block and function + +Each embedding is represented as a floating point vector. + +EXIT STATUS +----------- + +:program:`llvm-ir2vec` returns 0 on success, and a non-zero value on failure. + +Common failure cases include: + +* Invalid or missing input file +* Missing or invalid vocabulary file (in embedding mode) +* Specified function not found in the module +* Invalid command line options + +SEE ALSO +-------- + +:doc:`../MLGO` + +For more information about the IR2Vec algorithm and approach, see: +`IR2Vec: LLVM IR Based Scalable Program Embeddings `_. diff --git a/llvm/docs/MLGO.rst b/llvm/docs/MLGO.rst index ed0769bebeac3..965a21b8c84b8 100644 --- a/llvm/docs/MLGO.rst +++ b/llvm/docs/MLGO.rst @@ -468,6 +468,13 @@ The core components are: Using IR2Vec ------------ +.. note:: + + This section describes how to use IR2Vec within LLVM passes. A standalone + tool :doc:`CommandGuide/llvm-ir2vec` is available for generating the + embeddings and triplets from LLVM IR files, which can be useful for + training vocabularies and generating embeddings outside of compiler passes. + For generating embeddings, first the vocabulary should be obtained. Then, the embeddings can be computed and accessed via an ``ir2vec::Embedder`` instance. @@ -524,6 +531,10 @@ Further Details For more detailed information about the IR2Vec algorithm, its parameters, and advanced usage, please refer to the original paper: `IR2Vec: LLVM IR Based Scalable Program Embeddings `_. + +For information about using IR2Vec tool for generating embeddings and +triplets from LLVM IR, see :doc:`CommandGuide/llvm-ir2vec`. 
+ The LLVM source code for ``IR2Vec`` can also be explored to understand the implementation details. @@ -595,4 +606,3 @@ optimizations that are currently MLGO-enabled, it may be used as follows: where the ``name`` is a path fragment. We will expect to find 2 files, ``.in`` (readable, data incoming from the managing process) and ``.out`` (writable, the model runner sends data to the managing process) - diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp index d60d88596c2e0..4e88282e85c14 100644 --- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp +++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp @@ -280,12 +280,8 @@ int main(int argc, char **argv) { "Generates embeddings for a given LLVM IR and " "supports triplet generation for vocabulary " "training and embedding generation.\n\n" - "Usage:\n" - " Triplet mode: llvm-ir2vec --mode=triplets input.bc\n" - " Embedding mode: llvm-ir2vec --mode=embeddings " - "--ir2vec-vocab-path=vocab.json --level=func input.bc\n" - " Levels: --level=inst (instructions), --level=bb (basic blocks), " - "--level=func (functions)\n"); + "See https://llvm.org/docs/CommandGuide/llvm-ir2vec.html for more " + "information.\n"); // Validate command line options if (Mode == TripletMode && Level.getNumOccurrences() > 0) From e68efed71ba818a9eb18a2baede922e1e2ff9a46 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 17 Jul 2025 12:33:59 -0700 Subject: [PATCH 227/813] Fix more compiler-rt tests after #149015. 
--- compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py | 2 +- compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py | 2 +- compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py | 2 +- compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py | 2 +- compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py | 2 +- compiler-rt/test/cfi/cross-dso/lit.local.cfg.py | 2 +- compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py | 2 +- compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py | 2 +- compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py | 2 +- compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py | 2 +- compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py | 2 +- compiler-rt/test/msan/Linux/lit.local.cfg.py | 2 +- compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py | 2 +- compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py | 2 +- compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py | 2 +- compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py | 2 +- compiler-rt/test/profile/AIX/lit.local.cfg.py | 2 +- compiler-rt/test/profile/Darwin/lit.local.cfg.py | 2 +- compiler-rt/test/profile/Linux/lit.local.cfg.py | 2 +- compiler-rt/test/profile/Posix/lit.local.cfg.py | 4 ++-- compiler-rt/test/profile/Windows/lit.local.cfg.py | 2 +- .../test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py | 2 +- .../test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py | 2 +- .../test/sanitizer_common/TestCases/Linux/lit.local.cfg.py | 2 +- .../test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py | 2 +- .../test/sanitizer_common/TestCases/Posix/lit.local.cfg.py | 2 +- compiler-rt/test/tsan/Darwin/lit.local.cfg.py | 2 +- compiler-rt/test/tsan/Linux/lit.local.cfg.py | 2 +- compiler-rt/test/tsan/libcxx/lit.local.cfg.py | 2 +- compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py | 2 +- .../test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py | 2 +- 31 files changed, 32 insertions(+), 32 deletions(-) diff --git 
a/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py b/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py index 57c0979e60962..b622e072bcbfb 100644 --- a/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py +++ b/compiler-rt/test/asan/TestCases/Windows/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Windows"]: +if root.target_os not in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- 
a/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/builtins/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py b/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py index dceb7cde7218b..2778d8c995fd1 100644 --- a/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py +++ b/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py @@ -6,7 +6,7 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux", "FreeBSD", "NetBSD"]: +if root.target_os not in ["Linux", "FreeBSD", "NetBSD"]: config.unsupported = True # Android O (API level 26) has support for cross-dso cfi in libdl.so. diff --git a/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/hwasan/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/hwasan/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py +++ 
b/compiler-rt/test/lsan/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/lsan/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/lsan/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/msan/Linux/lit.local.cfg.py b/compiler-rt/test/msan/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/msan/Linux/lit.local.cfg.py +++ b/compiler-rt/test/msan/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py index b455a936e7cc1..2e3d36c446714 100644 --- a/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/Darwin/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "Darwin": +if config.root.target_os != "Darwin": config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py 
b/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py index e9b1b38ccacd1..0efdb55dc77f4 100644 --- a/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/FreeBSD/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "FreeBSD": +if config.root.target_os != "FreeBSD": config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py index 7d85fa3fce392..32e5cfdb141ae 100644 --- a/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/Linux/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "Linux": +if config.root.target_os != "Linux": config.unsupported = True diff --git a/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py b/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py index 6d4e7da813641..99d4464cf9e77 100644 --- a/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py +++ b/compiler-rt/test/orc/TestCases/Windows/lit.local.cfg.py @@ -1,2 +1,2 @@ -if config.root.host_os != "Windows": +if config.root.target_os != "Windows": config.unsupported = True diff --git a/compiler-rt/test/profile/AIX/lit.local.cfg.py b/compiler-rt/test/profile/AIX/lit.local.cfg.py index 55462708e3b6c..3337c692bd0d7 100644 --- a/compiler-rt/test/profile/AIX/lit.local.cfg.py +++ b/compiler-rt/test/profile/AIX/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["AIX"]: +if root.target_os not in ["AIX"]: config.unsupported = True diff --git a/compiler-rt/test/profile/Darwin/lit.local.cfg.py b/compiler-rt/test/profile/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/profile/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/profile/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff 
--git a/compiler-rt/test/profile/Linux/lit.local.cfg.py b/compiler-rt/test/profile/Linux/lit.local.cfg.py index c1e89581a1ab9..4bce33db9bbf7 100644 --- a/compiler-rt/test/profile/Linux/lit.local.cfg.py +++ b/compiler-rt/test/profile/Linux/lit.local.cfg.py @@ -42,7 +42,7 @@ def is_gold_linker_available(): root = getRoot(config) -if root.host_os not in ["Linux"] or not is_gold_linker_available(): +if root.target_os not in ["Linux"] or not is_gold_linker_available(): config.unsupported = True if config.have_curl: diff --git a/compiler-rt/test/profile/Posix/lit.local.cfg.py b/compiler-rt/test/profile/Posix/lit.local.cfg.py index 17a67689192d0..62ee3cbb466c4 100644 --- a/compiler-rt/test/profile/Posix/lit.local.cfg.py +++ b/compiler-rt/test/profile/Posix/lit.local.cfg.py @@ -6,12 +6,12 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True # AIX usually usually makes use of an explicit export list when linking a shared # object, since the linker doesn't export anything by default. 
-if root.host_os in ["AIX"]: +if root.target_os in ["AIX"]: config.substitutions.append(("%shared_linker_xopts", "-Wl,-bE:shr.exp")) else: config.substitutions.append(("%shared_linker_xopts", "")) diff --git a/compiler-rt/test/profile/Windows/lit.local.cfg.py b/compiler-rt/test/profile/Windows/lit.local.cfg.py index 57c0979e60962..b622e072bcbfb 100644 --- a/compiler-rt/test/profile/Windows/lit.local.cfg.py +++ b/compiler-rt/test/profile/Windows/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Windows"]: +if root.target_os not in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py index 520a963d01198..af82d30cf4de9 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/Darwin/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py index 0102001660cf1..d4948f04ef64e 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/FreeBSD/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["FreeBSD"]: +if root.target_os not in ["FreeBSD"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def 
getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py index 3cd1aa667343c..aa4438d04380a 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/NetBSD/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["NetBSD"]: +if root.target_os not in ["NetBSD"]: config.unsupported = True diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py b/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/tsan/Darwin/lit.local.cfg.py b/compiler-rt/test/tsan/Darwin/lit.local.cfg.py index 7bf80ac5e1375..876f0cd638bd2 100644 --- a/compiler-rt/test/tsan/Darwin/lit.local.cfg.py +++ b/compiler-rt/test/tsan/Darwin/lit.local.cfg.py @@ -6,7 +6,7 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Darwin"]: +if root.target_os not in ["Darwin"]: config.unsupported = True config.environment["TSAN_OPTIONS"] += ":ignore_noninstrumented_modules=1" diff --git a/compiler-rt/test/tsan/Linux/lit.local.cfg.py b/compiler-rt/test/tsan/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/tsan/Linux/lit.local.cfg.py +++ b/compiler-rt/test/tsan/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: 
config.unsupported = True diff --git a/compiler-rt/test/tsan/libcxx/lit.local.cfg.py b/compiler-rt/test/tsan/libcxx/lit.local.cfg.py index f4820dccb0109..b8d054e2de976 100644 --- a/compiler-rt/test/tsan/libcxx/lit.local.cfg.py +++ b/compiler-rt/test/tsan/libcxx/lit.local.cfg.py @@ -8,5 +8,5 @@ def getRoot(config): # Only run if we have an instrumented libcxx. On Darwin, run always (we have # interceptors to support the system-provided libcxx). -if not root.has_libcxx and root.host_os != "Darwin": +if not root.has_libcxx and root.target_os != "Darwin": config.unsupported = True diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py b/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py index 63240c3962565..c43790b98f38a 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py +++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os in ["Windows"]: +if root.target_os in ["Windows"]: config.unsupported = True diff --git a/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py b/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py index 603ca0365068f..3ea05fa044356 100644 --- a/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py +++ b/compiler-rt/test/ubsan/TestCases/TypeCheck/Linux/lit.local.cfg.py @@ -6,5 +6,5 @@ def getRoot(config): root = getRoot(config) -if root.host_os not in ["Linux"]: +if root.target_os not in ["Linux"]: config.unsupported = True From 7e105fbdbe3167d0724a64601a0e72923ed5e021 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 17 Jul 2025 15:42:35 -0400 Subject: [PATCH 228/813] [AMDGPU] Add support for `v_tanh_f32` on gfx1250 (#149360) Co-authored-by: Mekhanoshin, Stanislav --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 + clang/test/CodeGenOpenCL/amdgpu-features.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19 
+++++ llvm/lib/Target/AMDGPU/AMDGPU.td | 10 +++ llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 4 + llvm/lib/TargetParser/TargetParser.cpp | 1 + llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll | 84 +++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 45 ++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 45 ++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 56 +++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 56 +++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 12 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 12 +++ .../gfx1250_asm_vop3_from_vop1-fake16.s | 45 ++++++++++ .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 45 ++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 56 +++++++++++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 56 +++++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 16 ++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 16 ++++ .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 45 ++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 42 ++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 9 ++ .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 45 ++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 42 ++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 12 +++ 27 files changed, 779 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 3b6ad7d90be3c..4111837d962b5 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -669,6 +669,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_monitor_sleep, "vIs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_tanhf, "ff", "nc", "tanh-insts") TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") 
TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index 9f48149354255..bcdb488f11639 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -503,6 +503,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, { ResultType }); return Builder.CreateCall(F, { Src }); } + case AMDGPU::BI__builtin_amdgcn_tanhf: case AMDGPU::BI__builtin_amdgcn_tanh_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_tanh); diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 42768ac8def1f..75e9710f96705 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -108,7 +108,7 @@ // GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+transpose-load-f4f6-insts,+wavefrontsize32" +// GFX1250: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf16-trans-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+transpose-load-f4f6-insts,+wavefrontsize32 // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index a1f984c129276..e120a46c6327b 100644 --- 
a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -42,6 +42,25 @@ void test_s_wait_tensorcnt() { __builtin_amdgcn_s_wait_tensorcnt(0); } +// CHECK-LABEL: @test_tanh_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca float, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.amdgcn.tanh.f32(float [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store float [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: ret void +// +void test_tanh_f32(global float* out, float a) +{ + *out = __builtin_amdgcn_tanhf(a); +} + // CHECK-LABEL: @test_tanh_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index faf59c1541fc0..0e0e83b7a6b54 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1118,6 +1118,12 @@ def FeatureBitOp3Insts : SubtargetFeature<"bitop3-insts", "Has v_bitop3_b32/v_bitop3_b16 instructions" >; +def FeatureTanhInsts : SubtargetFeature<"tanh-insts", + "HasTanhInsts", + "true", + "Has v_tanh_f32/f16 instructions" +>; + def FeatureTransposeLoadF4F6Insts : SubtargetFeature<"transpose-load-f4f6-insts", "HasTransposeLoadF4F6Insts", "true", @@ -1979,6 +1985,7 @@ def FeatureISAVersion12_50 : FeatureSet< FeatureScalarDwordx3Loads, 
FeatureDPPSrc1SGPR, FeatureBitOp3Insts, + FeatureTanhInsts, FeatureTransposeLoadF4F6Insts, FeatureBF16TransInsts, FeatureBF16ConversionInsts, @@ -2703,6 +2710,9 @@ def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, def HasBitOp3Insts : Predicate<"Subtarget->hasBitOp3Insts()">, AssemblerPredicate<(all_of FeatureBitOp3Insts)>; +def HasTanhInsts : Predicate<"Subtarget->hasTanhInsts()">, + AssemblerPredicate<(all_of FeatureTanhInsts)>; + def HasTransposeLoadF4F6Insts : Predicate<"Subtarget->hasTransposeLoadF4F6Insts()">, AssemblerPredicate<(all_of FeatureTransposeLoadF4F6Insts)>; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 67c6daaa24c2a..268162bcada47 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -234,6 +234,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasRestrictedSOffset = false; bool Has64BitLiterals = false; bool HasBitOp3Insts = false; + bool HasTanhInsts = false; bool HasTransposeLoadF4F6Insts = false; bool HasPrngInst = false; bool HasBVHDualAndBVH8Insts = false; @@ -1380,6 +1381,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return HasMinimum3Maximum3F16; } + bool hasTanhInsts() const { return HasTanhInsts; } + bool hasAddPC64Inst() const { return GFX1250Insts; } bool hasMinimum3Maximum3PKF16() const { diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index ff89b8badeed0..8c35fea8259f4 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -366,6 +366,9 @@ defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>; let TRANS = 1, SchedRW = [WriteTrans32] in { defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>; defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>; + +let SubtargetPredicate = HasTanhInsts in +defm V_TANH_F32 : VOP1Inst <"v_tanh_f32", 
VOP_F32_F32, int_amdgcn_tanh>; } // End TRANS = 1, SchedRW = [WriteTrans32] defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>; @@ -1138,6 +1141,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL ; +defm V_TANH_F32 : VOP1_Real_FULL; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index d7e206ef8cd4f..4ca7444a73b35 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -443,6 +443,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx1250-insts"] = true; Features["bitop3-insts"] = true; Features["prng-inst"] = true; + Features["tanh-insts"] = true; Features["transpose-load-f4f6-insts"] = true; Features["bf16-trans-insts"] = true; Features["fp8-conversion-insts"] = true; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll index 344c0112e4a54..91a2a0b651132 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll @@ -7,8 +7,92 @@ ; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select. 
; FIXME: GlobalISel does not work with bf16 +declare float @llvm.amdgcn.tanh.f32(float) #0 declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0 +define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 { +; SDAG-REAL16-LABEL: tanh_f32: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, s2 +; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f32: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, s2 +; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float %src) #0 + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + +; TODO: Really these should be constant folded +define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_f32_constant_4.0: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 4.0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f32_constant_4.0: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 4.0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float 4.0) #0 + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: 
tanh_f32_constant_100.0: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_tanh_f32_e32 v0, 0x42c80000 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f32_constant_100.0: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f32_e32 v0, 0x42c80000 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b32 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float 100.0) #0 + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_undef_f32: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_undef_f32: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call float @llvm.amdgcn.tanh.f32(float undef) + store float %tanh, ptr addrspace(1) %out, align 4 + ret void +} + define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; SDAG-REAL16-LABEL: tanh_bf16: ; SDAG-REAL16: ; %bb.0: diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index 4b61064815ed5..f9e217d1f0361 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -28,6 +28,51 @@ v_mov_b64 v[4:5], 0.5 v_mov_b64 v[254:255], 0xaf123456 // GFX1250: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +v_tanh_f32 v5, v1 +// GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e] + +v_tanh_f32 v5, v255 +// GFX1250: v_tanh_f32_e32 v5, v255 ; encoding: [0xff,0x3d,0x0a,0x7e] + +v_tanh_f32 v5, s1 +// GFX1250: v_tanh_f32_e32 v5, s1 ; 
encoding: [0x01,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, s105 +// GFX1250: v_tanh_f32_e32 v5, s105 ; encoding: [0x69,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, vcc_lo +// GFX1250: v_tanh_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, vcc_hi +// GFX1250: v_tanh_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, ttmp15 +// GFX1250: v_tanh_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, m0 +// GFX1250: v_tanh_f32_e32 v5, m0 ; encoding: [0x7d,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, exec_lo +// GFX1250: v_tanh_f32_e32 v5, exec_lo ; encoding: [0x7e,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, exec_hi +// GFX1250: v_tanh_f32_e32 v5, exec_hi ; encoding: [0x7f,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, null +// GFX1250: v_tanh_f32_e32 v5, null ; encoding: [0x7c,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, -1 +// GFX1250: v_tanh_f32_e32 v5, -1 ; encoding: [0xc1,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, 0.5 +// GFX1250: v_tanh_f32_e32 v5, 0.5 ; encoding: [0xf0,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, src_scc +// GFX1250: v_tanh_f32_e32 v5, src_scc ; encoding: [0xfd,0x3c,0x0a,0x7e] + +v_tanh_f32 v255, 0xaf123456 +// GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf] + v_tanh_bf16 v5, v1 // GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index 40901618fce95..d51ef68bf1e19 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -28,6 +28,51 @@ v_mov_b64 v[4:5], 0.5 v_mov_b64 v[254:255], 0xaf123456 // GFX1250: v_mov_b64_e32 v[254:255], lit64(0xaf123456) ; encoding: [0xfe,0x3a,0xfc,0x7f,0x56,0x34,0x12,0xaf,0x00,0x00,0x00,0x00] +v_tanh_f32 v5, v1 +// GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e] + +v_tanh_f32 v5, v255 +// GFX1250: v_tanh_f32_e32 v5, v255 ; encoding: [0xff,0x3d,0x0a,0x7e] + +v_tanh_f32 v5, s1 +// GFX1250: v_tanh_f32_e32 v5, s1 ; encoding: 
[0x01,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, s105 +// GFX1250: v_tanh_f32_e32 v5, s105 ; encoding: [0x69,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, vcc_lo +// GFX1250: v_tanh_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, vcc_hi +// GFX1250: v_tanh_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, ttmp15 +// GFX1250: v_tanh_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, m0 +// GFX1250: v_tanh_f32_e32 v5, m0 ; encoding: [0x7d,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, exec_lo +// GFX1250: v_tanh_f32_e32 v5, exec_lo ; encoding: [0x7e,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, exec_hi +// GFX1250: v_tanh_f32_e32 v5, exec_hi ; encoding: [0x7f,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, null +// GFX1250: v_tanh_f32_e32 v5, null ; encoding: [0x7c,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, -1 +// GFX1250: v_tanh_f32_e32 v5, -1 ; encoding: [0xc1,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, 0.5 +// GFX1250: v_tanh_f32_e32 v5, 0.5 ; encoding: [0xf0,0x3c,0x0a,0x7e] + +v_tanh_f32 v5, src_scc +// GFX1250: v_tanh_f32_e32 v5, src_scc ; encoding: [0xfd,0x3c,0x0a,0x7e] + +v_tanh_f32 v255, 0xaf123456 +// GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf] + v_tanh_bf16 v5, v1 // GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s index ab5d55fad49ac..ae22f68e54835 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s @@ -2,6 +2,62 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_mirror +// GFX1250: v_tanh_f32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_half_mirror +// GFX1250: v_tanh_f32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_shl:1 +// GFX1250: v_tanh_f32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_shl:15 +// GFX1250: v_tanh_f32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_shr:1 +// GFX1250: v_tanh_f32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_shr:15 +// GFX1250: v_tanh_f32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_ror:1 +// GFX1250: v_tanh_f32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_ror:15 +// GFX1250: v_tanh_f32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s index dcb613c09a62d..37ecb66bfe809 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s @@ -2,6 +2,62 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 
-mattr=+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_mirror +// GFX1250: v_tanh_f32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_half_mirror +// GFX1250: v_tanh_f32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_shl:1 +// GFX1250: v_tanh_f32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_shl:15 +// GFX1250: v_tanh_f32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_shr:1 +// GFX1250: v_tanh_f32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_shr:15 +// GFX1250: 
v_tanh_f32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_ror:1 +// GFX1250: v_tanh_f32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_ror:15 +// GFX1250: v_tanh_f32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s index 4b37d648a928c..f24122e24b70e 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -2,6 +2,18 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s index 34489a1133abe..34abc829d4eb1 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -2,6 +2,18 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefixes=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn 
-mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index a61f1da5040d9..340a7857419c4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -127,6 +127,51 @@ v_cvt_f32_fp8 v1, v3 byte_sel:1 clamp v_cvt_f32_fp8 v1, v3 byte_sel:2 clamp // GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00] +v_tanh_f32_e64 v5, v1 +// GFX1250: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00] + +v_tanh_f32_e64 v5, v255 +// GFX1250: v_tanh_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x9e,0xd5,0xff,0x01,0x00,0x00] + +v_tanh_f32_e64 v5, s1 +// GFX1250: v_tanh_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, s105 +// GFX1250: v_tanh_f32_e64 v5, s105 
; encoding: [0x05,0x00,0x9e,0xd5,0x69,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, vcc_lo +// GFX1250: v_tanh_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x6a,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, vcc_hi +// GFX1250: v_tanh_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x6b,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, ttmp15 +// GFX1250: v_tanh_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9e,0xd5,0x7b,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, m0 +// GFX1250: v_tanh_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x9e,0xd5,0x7d,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, exec_lo +// GFX1250: v_tanh_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x7e,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, exec_hi +// GFX1250: v_tanh_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x7f,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, null +// GFX1250: v_tanh_f32_e64 v5, null ; encoding: [0x05,0x00,0x9e,0xd5,0x7c,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, -1 +// GFX1250: v_tanh_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x9e,0xd5,0xc1,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_tanh_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9e,0xd5,0xf0,0x00,0x00,0x08] + +v_tanh_f32_e64 v5, src_scc mul:4 +// GFX1250: v_tanh_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9e,0xd5,0xfd,0x00,0x00,0x10] + +v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + v_rcp_bf16_e64 v5, v1 // GFX1250: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index dbd1552b84ac2..579a467b41052 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -130,6 +130,51 @@ v_cvt_f32_fp8 v1, v3 byte_sel:1 clamp v_cvt_f32_fp8 v1, v3 byte_sel:2 clamp // GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; 
encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00] +v_tanh_f32_e64 v5, v1 +// GFX1250: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00] + +v_tanh_f32_e64 v5, v255 +// GFX1250: v_tanh_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x9e,0xd5,0xff,0x01,0x00,0x00] + +v_tanh_f32_e64 v5, s1 +// GFX1250: v_tanh_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, s105 +// GFX1250: v_tanh_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x9e,0xd5,0x69,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, vcc_lo +// GFX1250: v_tanh_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x6a,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, vcc_hi +// GFX1250: v_tanh_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x6b,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, ttmp15 +// GFX1250: v_tanh_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9e,0xd5,0x7b,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, m0 +// GFX1250: v_tanh_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x9e,0xd5,0x7d,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, exec_lo +// GFX1250: v_tanh_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x7e,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, exec_hi +// GFX1250: v_tanh_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x7f,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, null +// GFX1250: v_tanh_f32_e64 v5, null ; encoding: [0x05,0x00,0x9e,0xd5,0x7c,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, -1 +// GFX1250: v_tanh_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x9e,0xd5,0xc1,0x00,0x00,0x00] + +v_tanh_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_tanh_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9e,0xd5,0xf0,0x00,0x00,0x08] + +v_tanh_f32_e64 v5, src_scc mul:4 +// GFX1250: v_tanh_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9e,0xd5,0xfd,0x00,0x00,0x10] + +v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + v_rcp_bf16_e64 v5, v1 // GFX1250: v_rcp_bf16_e64 v5, v1 ; encoding: 
[0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index 22ad29a7a8d05..423340cc90b30 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -2,6 +2,62 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_mirror +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_tanh_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index 04cf346797845..7968b39839a78 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -2,6 +2,62 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_mirror +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s 
b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 3ec947575f53a..dd469c2eef850 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -2,6 +2,22 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git 
a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 643731f6d46e7..9fce77916b66e 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -2,6 +2,22 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding < %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +v_tanh_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index 05c18cbf724ba..0a6fc391e63a5 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -29,6 +29,51 @@ 0x6a,0x3a,0x08,0x7e # GFX1250: v_mov_b64_e32 v[4:5], vcc ; encoding: [0x6a,0x3a,0x08,0x7e] +0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf +# GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf] + +0xc1,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, -1 ; encoding: [0xc1,0x3c,0x0a,0x7e] + +0xf0,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, 0.5 ; encoding: [0xf0,0x3c,0x0a,0x7e] + +0x7f,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, exec_hi ; encoding: [0x7f,0x3c,0x0a,0x7e] + +0x7e,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, exec_lo ; encoding: [0x7e,0x3c,0x0a,0x7e] + +0x7d,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, m0 ; encoding: [0x7d,0x3c,0x0a,0x7e] + +0x7c,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, null ; encoding: [0x7c,0x3c,0x0a,0x7e] + +0x01,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, s1 ; encoding: [0x01,0x3c,0x0a,0x7e] + +0x69,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, s105 ; encoding: [0x69,0x3c,0x0a,0x7e] + +0xfd,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, src_scc ; encoding: [0xfd,0x3c,0x0a,0x7e] + +0x7b,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, ttmp15 ; encoding: [0x7b,0x3c,0x0a,0x7e] + +0x01,0x3d,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, v1 ; encoding: [0x01,0x3d,0x0a,0x7e] + +0xff,0x3d,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, v255 ; encoding: [0xff,0x3d,0x0a,0x7e] + +0x6b,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, vcc_hi ; encoding: [0x6b,0x3c,0x0a,0x7e] + +0x6a,0x3c,0x0a,0x7e +# GFX1250: v_tanh_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x3c,0x0a,0x7e] + 0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00] # GFX1250-FAKE16: 
v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt index 2aad85e5ac539..f099ffcba36e4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -2,6 +2,48 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30 +# GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30] + +0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250: v_tanh_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250: v_tanh_f32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250: v_tanh_f32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250: v_tanh_f32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250: v_tanh_f32_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250: 
v_tanh_f32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250: v_tanh_f32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0x3c,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250: v_tanh_f32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250: v_tanh_f32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250: v_tanh_f32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250: v_tanh_f32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250: v_tanh_f32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13] + 0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30 # GFX1250-REAL16: v_tanh_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30] # GFX1250-FAKE16: v_tanh_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index f67e104c7dc20..d86d4630c48ea 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -2,6 +2,15 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn 
-mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00] + +0xe9,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] + 0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 641e0872eafe8..4dc7ed4237f53 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -2,6 +2,51 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x9e,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x9e,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_tanh_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9e,0xd5,0xf0,0x00,0x00,0x08] + 
+0x05,0x00,0x9e,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x9e,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, null ; encoding: [0x05,0x00,0x9e,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x9e,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_tanh_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9e,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x9e,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9e,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x9e,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9e,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x6a,0x00,0x00,0x00] + 0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index 0314ab3b59718..1f03a43cd8bd4 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -2,6 +2,48 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s +0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + 
+0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + 0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| 
clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index ead589195ff50..e673f9fdfc7bb 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -2,6 +2,18 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s +0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250: v_tanh_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9e,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x00,0x9e,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + 0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] From 
4e6b843cf59735ffb7092edd178e4b03433a44df Mon Sep 17 00:00:00 2001 From: Jake Egan Date: Thu, 17 Jul 2025 15:50:44 -0400 Subject: [PATCH 229/813] [asan] Revert global check for non-AIX (#149245) 287b24e1899eb6ce62eb9daef5a24faae5e66c1e moved the `GetGlobalAddressInformation` call earlier, but this broke a chromium test, so make this workaround for AIX only. --- compiler-rt/lib/asan/asan_descriptions.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/compiler-rt/lib/asan/asan_descriptions.cpp b/compiler-rt/lib/asan/asan_descriptions.cpp index 0c30959b23e28..18c2a6c571c1f 100644 --- a/compiler-rt/lib/asan/asan_descriptions.cpp +++ b/compiler-rt/lib/asan/asan_descriptions.cpp @@ -449,10 +449,12 @@ AddressDescription::AddressDescription(uptr addr, uptr access_size, // are put to the STACK region for unknown reasons. Check global first can // workaround this issue. // TODO: Look into whether there's a different solution to this problem. +#if SANITIZER_AIX if (GetGlobalAddressInformation(addr, access_size, &data.global)) { data.kind = kAddressKindGlobal; return; } +#endif if (GetHeapAddressInformation(addr, access_size, &data.heap)) { data.kind = kAddressKindHeap; @@ -471,6 +473,14 @@ AddressDescription::AddressDescription(uptr addr, uptr access_size, return; } +// GetGlobalAddressInformation is called earlier on AIX due to a workaround +#if !SANITIZER_AIX + if (GetGlobalAddressInformation(addr, access_size, &data.global)) { + data.kind = kAddressKindGlobal; + return; + } +#endif + data.kind = kAddressKindWild; data.wild.addr = addr; data.wild.access_size = access_size; From a8880265e1755b346fe6c3e7e93b57381d8c9eb8 Mon Sep 17 00:00:00 2001 From: Jeremy Kun Date: Thu, 17 Jul 2025 12:52:37 -0700 Subject: [PATCH 230/813] [mlir] Fix CI breakage from https://github.com/llvm/llvm-project/pull/146228 (#149378) Some platforms print `{anonymous}` instead of the other two forms accepted by the test regex. 
This PR just removes the attempt to guess how the anonymous namespace will be printed. @Kewen12 is there a way to trigger the particular CIs that failed in https://github.com/llvm/llvm-project/pull/146228 on this PR? Co-authored-by: Jeremy Kun --- .../IR/test-pattern-logging-listener.mlir | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/mlir/test/IR/test-pattern-logging-listener.mlir b/mlir/test/IR/test-pattern-logging-listener.mlir index a1d27741a0723..e8f0d61a75960 100644 --- a/mlir/test/IR/test-pattern-logging-listener.mlir +++ b/mlir/test/IR/test-pattern-logging-listener.mlir @@ -2,14 +2,21 @@ // RUN: --allow-unregistered-dialect --debug-only=pattern-logging-listener 2>&1 | FileCheck %s // Check that when replacing an op with a new op, we get appropriate -// pattern-logging lines. The regex is because the anonymous namespace is -// printed differently on different platforms. +// pattern-logging lines. The use of check same is to avoid the complexity of +// matching the anonymous namespace prefix, which can be one of {anonymous} vs +// {anonymous_namespace} vs `anonymous_namespace` (and maybe others?) on the +// various platforms. 
-// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationInserted | test.new_op -// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationReplaced (with values) | test.replace_with_new_op -// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationModified | arith.addi -// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationModified | arith.addi -// CHECK: [pattern-logging-listener] {{.anonymous.namespace.}}::ReplaceWithNewOp | notifyOperationErased | test.replace_with_new_op +// CHECK: [pattern-logging-listener] +// CHECK-SAME: ::ReplaceWithNewOp | notifyOperationInserted | test.new_op +// CHECK: [pattern-logging-listener] +// CHECK-SAME: ::ReplaceWithNewOp | notifyOperationReplaced (with values) | test.replace_with_new_op +// CHECK: [pattern-logging-listener] +// CHECK-SAME: ::ReplaceWithNewOp | notifyOperationModified | arith.addi +// CHECK: [pattern-logging-listener] +// CHECK-SAME: ::ReplaceWithNewOp | notifyOperationModified | arith.addi +// CHECK: [pattern-logging-listener] +// CHECK-SAME: ::ReplaceWithNewOp | notifyOperationErased | test.replace_with_new_op func.func @replace_with_new_op() -> i32 { %a = "test.replace_with_new_op"() : () -> (i32) %res = arith.addi %a, %a : i32 From 48cd22c5661ea454f4ff189c21a8f01c426eb1aa Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Thu, 17 Jul 2025 12:54:56 -0700 Subject: [PATCH 231/813] [NFC] simplify LowerAllowCheckPass::printPipeline (#149374) --- .../Instrumentation/LowerAllowCheckPass.cpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp index e7a6fa48e9004..55f3239ada100 100644 --- a/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp +++ 
b/llvm/lib/Transforms/Instrumentation/LowerAllowCheckPass.cpp @@ -10,6 +10,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -184,22 +185,14 @@ void LowerAllowCheckPass::printPipeline( // correctness. // TODO: print shorter output by combining adjacent runs, etc. int i = 0; - bool printed = false; + ListSeparator LS(";"); for (unsigned int cutoff : Opts.cutoffs) { - if (cutoff > 0) { - if (printed) - OS << ";"; - OS << "cutoffs[" << i << "]=" << cutoff; - printed = true; - } - + if (cutoff > 0) + OS << LS << "cutoffs[" << i << "]=" << cutoff; i++; } - if (Opts.runtime_check) { - if (printed) - OS << ";"; - OS << "runtime_check=" << Opts.runtime_check; - } + if (Opts.runtime_check) + OS << LS << "runtime_check=" << Opts.runtime_check; OS << '>'; } From e8182fb501622840e7b0a981506f71188fdaeb61 Mon Sep 17 00:00:00 2001 From: Prabhu Rajasekaran Date: Thu, 17 Jul 2025 13:06:04 -0700 Subject: [PATCH 232/813] [libc] add wctype.h header (#149202) Add basic configurations to generate wctype.h header file. To begin with this header file just exposes one function iswalpha. 
--- libc/config/baremetal/arm/entrypoints.txt | 3 ++ libc/config/baremetal/arm/headers.txt | 1 + libc/config/baremetal/riscv/entrypoints.txt | 3 ++ libc/config/baremetal/riscv/headers.txt | 1 + libc/config/darwin/aarch64/entrypoints.txt | 3 ++ libc/config/darwin/aarch64/headers.txt | 1 + libc/config/linux/aarch64/entrypoints.txt | 3 ++ libc/config/linux/aarch64/headers.txt | 1 + libc/config/linux/arm/entrypoints.txt | 3 ++ libc/config/linux/arm/headers.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 3 ++ libc/config/linux/riscv/headers.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 2 + libc/config/linux/x86_64/headers.txt | 1 + libc/config/windows/entrypoints.txt | 3 ++ libc/config/windows/headers.txt | 1 + libc/include/CMakeLists.txt | 9 ++++ libc/include/wctype.yaml | 10 ++++ libc/src/CMakeLists.txt | 1 + libc/src/wctype/CMakeLists.txt | 9 ++++ libc/src/wctype/iswalpha.cpp | 19 ++++++++ libc/src/wctype/iswalpha.h | 21 ++++++++ libc/test/src/CMakeLists.txt | 1 + libc/test/src/wctype/CMakeLists.txt | 11 +++++ libc/test/src/wctype/iswalpha_test.cpp | 54 +++++++++++++++++++++ 25 files changed, 166 insertions(+) create mode 100644 libc/include/wctype.yaml create mode 100644 libc/src/wctype/CMakeLists.txt create mode 100644 libc/src/wctype/iswalpha.cpp create mode 100644 libc/src/wctype/iswalpha.h create mode 100644 libc/test/src/wctype/CMakeLists.txt create mode 100644 libc/test/src/wctype/iswalpha_test.cpp diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index de7549c57ff44..80cd15eebc91f 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -278,6 +278,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wcslen libc.src.wchar.wctob + # wctype.h entrypoints + libc.src.wctype.iswalpha + # internal entrypoints libc.startup.baremetal.init libc.startup.baremetal.fini diff --git a/libc/config/baremetal/arm/headers.txt b/libc/config/baremetal/arm/headers.txt index 
5666ef7e0012d..1f64afebdaaa7 100644 --- a/libc/config/baremetal/arm/headers.txt +++ b/libc/config/baremetal/arm/headers.txt @@ -23,4 +23,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.time libc.include.uchar libc.include.wchar + libc.include.wctype ) diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 7e8c186d52469..c9f8118f6e800 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -278,6 +278,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wcslen libc.src.wchar.wctob + # wctype.h entrypoints + libc.src.wctype.iswalpha + # internal entrypoints libc.startup.baremetal.init libc.startup.baremetal.fini diff --git a/libc/config/baremetal/riscv/headers.txt b/libc/config/baremetal/riscv/headers.txt index 5666ef7e0012d..1f64afebdaaa7 100644 --- a/libc/config/baremetal/riscv/headers.txt +++ b/libc/config/baremetal/riscv/headers.txt @@ -23,4 +23,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.time libc.include.uchar libc.include.wchar + libc.include.wctype ) diff --git a/libc/config/darwin/aarch64/entrypoints.txt b/libc/config/darwin/aarch64/entrypoints.txt index 4674a9309115b..3bfdcdbee555e 100644 --- a/libc/config/darwin/aarch64/entrypoints.txt +++ b/libc/config/darwin/aarch64/entrypoints.txt @@ -99,6 +99,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.calloc libc.src.stdlib.realloc libc.src.stdlib.free + + # wctype.h entrypoints + libc.src.wctype.iswalpha ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/config/darwin/aarch64/headers.txt b/libc/config/darwin/aarch64/headers.txt index 8f3d6029c9b6a..55a112c0c3ad3 100644 --- a/libc/config/darwin/aarch64/headers.txt +++ b/libc/config/darwin/aarch64/headers.txt @@ -11,4 +11,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.stdlib libc.include.string libc.include.strings + libc.include.wctype ) diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index cff5b7f8312d6..b2abebee017d8 
100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -363,6 +363,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wcslen libc.src.wchar.wctob + # wctype.h entrypoints + libc.src.wctype.iswalpha + # sys/uio.h entrypoints libc.src.sys.uio.writev libc.src.sys.uio.readv diff --git a/libc/config/linux/aarch64/headers.txt b/libc/config/linux/aarch64/headers.txt index 01b0bf36498ce..6d3bc9188583b 100644 --- a/libc/config/linux/aarch64/headers.txt +++ b/libc/config/linux/aarch64/headers.txt @@ -57,4 +57,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.uchar libc.include.unistd libc.include.wchar + libc.include.wctype ) diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index a1203cc4991af..5865dc93a9aef 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -191,6 +191,9 @@ set(TARGET_LIBC_ENTRYPOINTS # sys/time.h entrypoints libc.src.sys.time.setitimer libc.src.sys.time.getitimer + + # wctype.h entrypoints + libc.src.wctype.iswalpha ) if(LLVM_LIBC_FULL_BUILD) diff --git a/libc/config/linux/arm/headers.txt b/libc/config/linux/arm/headers.txt index 9aabac5dea33c..14c730e2b77b1 100644 --- a/libc/config/linux/arm/headers.txt +++ b/libc/config/linux/arm/headers.txt @@ -17,6 +17,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.strings libc.include.uchar libc.include.wchar + libc.include.wctype # Disabled due to epoll_wait syscalls not being available on this platform. 
# libc.include.sys_epoll diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 14361f5b6beff..79077a5e66ef5 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -368,6 +368,9 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wcslen libc.src.wchar.wctob + # wctype.h entrypoints + libc.src.wctype.iswalpha + # sys/uio.h entrypoints libc.src.sys.uio.writev libc.src.sys.uio.readv diff --git a/libc/config/linux/riscv/headers.txt b/libc/config/linux/riscv/headers.txt index 01b0bf36498ce..6d3bc9188583b 100644 --- a/libc/config/linux/riscv/headers.txt +++ b/libc/config/linux/riscv/headers.txt @@ -57,4 +57,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.uchar libc.include.unistd libc.include.wchar + libc.include.wctype ) diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 9223911f04a93..381359cec6f1d 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -396,6 +396,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wcstoul libc.src.wchar.wcstoull + # wctype.h entrypoints + libc.src.wctype.iswalpha # sys/uio.h entrypoints libc.src.sys.uio.writev diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index 01b0bf36498ce..6d3bc9188583b 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -57,4 +57,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.uchar libc.include.unistd libc.include.wchar + libc.include.wctype ) diff --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt index 8898fd74c302f..18027298acc18 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -105,6 +105,9 @@ set(TARGET_LIBC_ENTRYPOINTS # unistd.h entrypoints libc.src.unistd.getentropy + + # wctype.h entrypoints + libc.src.wctype.iswalpha ) set(TARGET_LIBM_ENTRYPOINTS diff --git 
a/libc/config/windows/headers.txt b/libc/config/windows/headers.txt index 6d9aae9276924..d4a0947d867bb 100644 --- a/libc/config/windows/headers.txt +++ b/libc/config/windows/headers.txt @@ -7,4 +7,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.fenv libc.include.math libc.include.unistd + libc.include.wctype ) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 55268d19529c7..984b960acb2d7 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -720,6 +720,15 @@ add_header_macro( .llvm-libc-types.wchar_t ) +add_header_macro( + wctype + ../libc/include/wctype.yaml + wctype.h + DEPENDS + .llvm_libc_common_h + .llvm-libc-types.wint_t +) + add_header_macro( locale ../libc/include/locale.yaml diff --git a/libc/include/wctype.yaml b/libc/include/wctype.yaml new file mode 100644 index 0000000000000..fb4f96f7d17e4 --- /dev/null +++ b/libc/include/wctype.yaml @@ -0,0 +1,10 @@ +header: wctype.h +types: + - type_name: wint_t +functions: + - name: iswalpha + standards: + - stdc + return_type: int + arguments: + - type: wint_t diff --git a/libc/src/CMakeLists.txt b/libc/src/CMakeLists.txt index a665253c4cc03..d7a1e1f49e6ff 100644 --- a/libc/src/CMakeLists.txt +++ b/libc/src/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(strings) add_subdirectory(time) add_subdirectory(unistd) add_subdirectory(wchar) +add_subdirectory(wctype) if(${LIBC_TARGET_OS} STREQUAL "linux") add_subdirectory(dirent) diff --git a/libc/src/wctype/CMakeLists.txt b/libc/src/wctype/CMakeLists.txt new file mode 100644 index 0000000000000..3ac5eaef8ed8b --- /dev/null +++ b/libc/src/wctype/CMakeLists.txt @@ -0,0 +1,9 @@ +add_entrypoint_object( + iswalpha + SRCS + iswalpha.cpp + HDRS + iswalpha.h + DEPENDS + libc.src.__support.wctype_utils +) diff --git a/libc/src/wctype/iswalpha.cpp b/libc/src/wctype/iswalpha.cpp new file mode 100644 index 0000000000000..e18f29370fbd0 --- /dev/null +++ b/libc/src/wctype/iswalpha.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of iswalpha 
----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wctype/iswalpha.h" +#include "src/__support/common.h" +#include "src/__support/wctype_utils.h" + +#include "hdr/types/wint_t.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(bool, iswalpha, (wint_t c)) { return internal::iswalpha(c); } + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wctype/iswalpha.h b/libc/src/wctype/iswalpha.h new file mode 100644 index 0000000000000..681fc6ba79a54 --- /dev/null +++ b/libc/src/wctype/iswalpha.h @@ -0,0 +1,21 @@ +//===-- Implementation header for iswalpha ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCTYPE_ISWALPHA_H +#define LLVM_LIBC_SRC_WCTYPE_ISWALPHA_H + +#include "hdr/types/wint_t.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE_DECL { + +bool iswalpha(wint_t c); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCTYPE_ISWALPHA_H diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt index 6dca47b5343e6..b3eba43582074 100644 --- a/libc/test/src/CMakeLists.txt +++ b/libc/test/src/CMakeLists.txt @@ -70,6 +70,7 @@ add_subdirectory(stdlib) add_subdirectory(string) add_subdirectory(strings) add_subdirectory(wchar) +add_subdirectory(wctype) add_subdirectory(time) add_subdirectory(unistd) diff --git a/libc/test/src/wctype/CMakeLists.txt b/libc/test/src/wctype/CMakeLists.txt new file mode 100644 index 0000000000000..5459cdb4a9b71 --- /dev/null +++ b/libc/test/src/wctype/CMakeLists.txt @@ -0,0 +1,11 @@ +add_custom_target(libc_wctype_unittests) + +add_libc_test( + iswalpha_test + SUITE + libc_wctype_unittests + SRCS + iswalpha_test.cpp + DEPENDS + libc.src.wctype.iswalpha +) diff --git a/libc/test/src/wctype/iswalpha_test.cpp b/libc/test/src/wctype/iswalpha_test.cpp new file mode 100644 index 0000000000000..f3f75f4dc7aa5 --- /dev/null +++ b/libc/test/src/wctype/iswalpha_test.cpp @@ -0,0 +1,54 @@ +//===-- Unittests for iswalpha --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/span.h" +#include "src/wctype/iswalpha.h" + +#include "test/UnitTest/LibcTest.h" +#include "test/UnitTest/Test.h" + +namespace { + +// TODO: Merge the wctype tests using this framework. +constexpr char WALPHA_ARRAY[] = { + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', + 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', + 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', +}; + +bool in_span(int ch, LIBC_NAMESPACE::cpp::span arr) { + for (size_t i = 0; i < arr.size(); ++i) + if (static_cast(arr[i]) == ch) + return true; + return false; +} + +} // namespace + +TEST(LlvmLibciswalpha, SimpleTest) { + EXPECT_TRUE(LIBC_NAMESPACE::iswalpha('a')); + EXPECT_TRUE(LIBC_NAMESPACE::iswalpha('B')); + + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha('3')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha(' ')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha('?')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha('\0')); + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha(-1)); +} + +TEST(LlvmLibciswalpha, DefaultLocale) { + // Loops through all characters, verifying that letters return + // true and everything else returns false. + for (int ch = -255; ch < 255; ++ch) { + if (in_span(ch, WALPHA_ARRAY)) + EXPECT_TRUE(LIBC_NAMESPACE::iswalpha(ch)); + else + EXPECT_FALSE(LIBC_NAMESPACE::iswalpha(ch)); + } +} From 6a60f18997d62b0e2842a921fcb6beb3e52ed823 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Thu, 17 Jul 2025 13:14:34 -0700 Subject: [PATCH 233/813] [clang] Fix potential constant expression checking with constexpr-unknown. (#149227) 071765749a70b22fb62f2efc07a3f242ff5b4c52 improved constexpr-unknown diagnostics, but potential constant expression checking broke in the process: we produce diagnostics in more cases. 
Suppress the diagnostics as appropriate. This fix affects -Winvalid-constexpr and the enable_if attribute. (The -Winvalid-constexpr diagnostic isn't really important right now, but it will become important if we allow constexpr-unknown with pre-C++23 standards.) Fixes #149041. Fixes #149188. --- clang/lib/AST/ExprConstant.cpp | 11 +++++--- .../SemaCXX/constant-expression-p2280r4.cpp | 26 +++++++++++++++++++ .../test/SemaCXX/constexpr-never-constant.cpp | 7 +++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 767cc4c3b19eb..8797eaddd0e18 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -4450,7 +4450,8 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, } } else if (!IsAccess) { return CompleteObject(LVal.getLValueBase(), nullptr, BaseType); - } else if (IsConstant && Info.checkingPotentialConstantExpression() && + } else if ((IsConstant || BaseType->isReferenceType()) && + Info.checkingPotentialConstantExpression() && BaseType->isLiteralType(Info.Ctx) && !VD->hasDefinition()) { // This variable might end up being constexpr. Don't diagnose it yet. } else if (IsConstant) { @@ -4491,9 +4492,11 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E, // a null BaseVal. Any constexpr-unknown variable seen here is an error: // we can't access a constexpr-unknown object. 
if (AK != clang::AK_Dereference && !BaseVal) { - Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1) - << AK << VD; - Info.Note(VD->getLocation(), diag::note_declared_at); + if (!Info.checkingPotentialConstantExpression()) { + Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1) + << AK << VD; + Info.Note(VD->getLocation(), diag::note_declared_at); + } return CompleteObject(); } } else if (DynamicAllocLValue DA = LVal.Base.dyn_cast()) { diff --git a/clang/test/SemaCXX/constant-expression-p2280r4.cpp b/clang/test/SemaCXX/constant-expression-p2280r4.cpp index 03fea91169787..16f5f823d26c1 100644 --- a/clang/test/SemaCXX/constant-expression-p2280r4.cpp +++ b/clang/test/SemaCXX/constant-expression-p2280r4.cpp @@ -357,3 +357,29 @@ namespace pointer_comparisons { static_assert(!f4()); // expected-error {{static assertion expression is not an integral constant expression}} \ // expected-note {{in call to 'f4()'}} } + +namespace GH149188 { +namespace enable_if_1 { + template <__SIZE_TYPE__ N> + constexpr void foo(const char (&Str)[N]) + __attribute((enable_if(__builtin_strlen(Str), ""))) {} + + void x() { + foo("1234"); + } +} + +namespace enable_if_2 { + constexpr const char (&f())[]; + extern const char (&Str)[]; + constexpr int foo() + __attribute((enable_if(__builtin_strlen(Str), ""))) + {return __builtin_strlen(Str);} + + constexpr const char (&f())[] {return "a";} + constexpr const char (&Str)[] = f(); + void x() { + constexpr int x = foo(); + } +} +} diff --git a/clang/test/SemaCXX/constexpr-never-constant.cpp b/clang/test/SemaCXX/constexpr-never-constant.cpp index 307810ee263dd..5756bb647ce88 100644 --- a/clang/test/SemaCXX/constexpr-never-constant.cpp +++ b/clang/test/SemaCXX/constexpr-never-constant.cpp @@ -24,3 +24,10 @@ constexpr void other_func() { throw 12; } + +namespace GH149041 { + // Make sure these don't trigger the diagnostic. 
+ extern const bool& b; + constexpr bool fun1() { return b; } + constexpr bool fun2(const bool& b) { return b; } +} From 284dd5ba84ade5891ab9d68f00e335b37c0f1a1f Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 17 Jul 2025 21:18:05 +0100 Subject: [PATCH 234/813] [SelectionDAG] Fix misplaced commas in operand bundle errors (#149331) --- .../lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 12 +++++------- llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 74c14ede24755..01e53123ea7e1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -845,16 +846,13 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, static void failForInvalidBundles(const CallBase &I, StringRef Name, ArrayRef AllowedBundles) { if (I.hasOperandBundlesOtherThan(AllowedBundles)) { + ListSeparator LS; std::string Error; + raw_string_ostream OS(Error); for (unsigned i = 0, e = I.getNumOperandBundles(); i != e; ++i) { OperandBundleUse U = I.getOperandBundleAt(i); - bool First = true; - if (is_contained(AllowedBundles, U.getTagID())) - continue; - if (!First) - Error += ", "; - First = false; - Error += U.getTagName(); + if (!is_contained(AllowedBundles, U.getTagID())) + OS << LS << U.getTagName(); } reportFatalUsageError( Twine("cannot lower ", Name) diff --git a/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll b/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll index ac4963f1f79cc..17065a4a61c2c 100644 --- 
a/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll +++ b/llvm/test/CodeGen/X86/invalid-operand-bundle-call.ll @@ -1,10 +1,10 @@ ; RUN: not llc -mtriple=x86_64-unknown-linux-gnu < %s 2>&1 | FileCheck %s -; CHECK: LLVM ERROR: cannot lower calls with arbitrary operand bundles: foo +; CHECK: LLVM ERROR: cannot lower calls with arbitrary operand bundles: foo, bar, baz declare void @g() define void @f(i32 %arg) { - call void @g() [ "foo"(i32 %arg) ] + call void @g() [ "foo"(i32 %arg), "bar"(i32 %arg), "baz"(i32 %arg) ] ret void } From b0c6148584854af3d7ed2425034c3b5252f6b769 Mon Sep 17 00:00:00 2001 From: Peter Rong Date: Thu, 17 Jul 2025 13:19:26 -0700 Subject: [PATCH 235/813] [DWARFLinker] Use different addresses to distinguish invalid DW_AT_LLVM_stmt_sequence offset (#149376) It'd be helpful (especially when `llvm-dwarfdump ... | grep `) to separate two different invalid reasons for debugging. --- llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index 222dc88098102..559d808a72f98 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -43,6 +43,12 @@ namespace llvm { using namespace dwarf_linker; using namespace dwarf_linker::classic; +enum InvalidStmtSeqOffset { + MaxStmtSeqOffset = UINT64_MAX, + OrigOffsetMissing = MaxStmtSeqOffset - 1, + NewOffsetMissing = MaxStmtSeqOffset - 2, +}; + /// Hold the input and output of the debug info size in bytes. struct DebugInfoSize { uint64_t Input; @@ -2315,7 +2321,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { // Some sequences are discarded by the DWARFLinker if they are invalid // (empty). 
if (OrigRowIter == SeqOffToOrigRow.end()) { - StmtSeq.set(UINT64_MAX); + StmtSeq.set(OrigOffsetMissing); continue; } size_t OrigRowIndex = OrigRowIter->second; @@ -2325,7 +2331,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { if (NewRowIter == OrigRowToNewRow.end()) { // If the original row index is not found in the map, update the // stmt_sequence attribute to the 'invalid offset' magic value. - StmtSeq.set(UINT64_MAX); + StmtSeq.set(NewOffsetMissing); continue; } From 1e7ec351c40af981e299743a955d71c7d70753a8 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 17 Jul 2025 21:24:27 +0100 Subject: [PATCH 236/813] [lldb] Adjust default target.max-children-depth (#149282) Deeply nested structs can be noisy, so Apple's LLDB fork sets the default to `4`: https://github.com/swiftlang/llvm-project/blob/9c93adbb283005ab416fd155b75fd43e6a8288ca/lldb/source/Target/TargetProperties.td#L134-L136 Thought it would be useful to upstream this. Though happy to pick a different default or keep it as-is. 
--- lldb/source/Target/TargetProperties.td | 2 +- .../Settings/TestChildDepthTruncation.test | 84 +++++++++++++++++++ .../NativePDB/Inputs/class_layout.lldbinit | 1 + 3 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 lldb/test/Shell/Settings/TestChildDepthTruncation.test diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td index 656503bb8d228..e6cd48a9d3dad 100644 --- a/lldb/source/Target/TargetProperties.td +++ b/lldb/source/Target/TargetProperties.td @@ -99,7 +99,7 @@ let Definition = "target" in { DefaultUnsignedValue<24>, Desc<"Maximum number of children to expand in any level of depth.">; def MaxChildrenDepth: Property<"max-children-depth", "UInt64">, - DefaultUnsignedValue<0xFFFFFFFF>, + DefaultUnsignedValue<4>, Desc<"Maximum depth to expand children.">; def MaxSummaryLength: Property<"max-string-summary-length", "UInt64">, DefaultUnsignedValue<1024>, diff --git a/lldb/test/Shell/Settings/TestChildDepthTruncation.test b/lldb/test/Shell/Settings/TestChildDepthTruncation.test new file mode 100644 index 0000000000000..e0e6cda516655 --- /dev/null +++ b/lldb/test/Shell/Settings/TestChildDepthTruncation.test @@ -0,0 +1,84 @@ +# Test that we warn the user about truncated output +# when target.max-children-depth wasn't explicitly set. 
+ +# RUN: split-file %s %t +# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=DWIM +# +# RUN: %lldb -x -b -s %t/expr-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=EXPR +# +# RUN: %lldb -x -b -s %t/frame-var-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=VAR +# +# RUN: %lldb -x -b -s %t/with-setting-commands.input %t.out -o exit 2>&1 \ +# RUN: | FileCheck %s --check-prefix=SETTING + +#--- main.cpp + +struct L1 { + int w; + struct L2 { + int x; + struct L3 { + int y; + struct L4 { + int z; + struct L5 { + int a; + } l5; + } l4; + } l3; + } l2; +}; + +int main() { + L1 nested; + __builtin_debugtrap(); +} + +#--- dwim-commands.input + +run +dwim-print nested +frame variable nested + +DWIM: (lldb) dwim-print nested +DWIM: *** Some of the displayed variables have a greater depth of members +DWIM-SAME: use the --depth option to dwim-print +DWIM: (lldb) frame variable nested +DWIM-NOT: *** Some of the displayed variables have a greater depth of members + +#--- expr-commands.input + +run +expression nested +frame variable nested + +EXPR: (lldb) expression nested +EXPR: *** Some of the displayed variables have a greater depth of members +EXPR-SAME: use the --depth option to expression +EXPR: (lldb) frame variable nested +EXPR-NOT: *** Some of the displayed variables have a greater depth of members + +#--- frame-var-commands.input + +run +frame variable nested +frame variable nested + +VAR: (lldb) frame variable nested +VAR: *** Some of the displayed variables have a greater depth of members +VAR-SAME: use the --depth option to frame variable +VAR: (lldb) frame variable nested +VAR-NOT: *** Some of the displayed variables have a greater depth of members + +#--- with-setting-commands.input + +run +settings set target.max-children-depth 1 +frame variable nested + +SETTING: (lldb) frame variable nested +SETTING-NOT: 
*** Some of the displayed variables have a greater depth of members diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/class_layout.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/class_layout.lldbinit index bbce1e88626e5..301488d5810b3 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/class_layout.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/class_layout.lldbinit @@ -1,3 +1,4 @@ +settings set target.max-children-depth 10 expr a expr b.c expr b.u.c From 867ff3001e4e1e68b3f26c5ead281ea2208c4c48 Mon Sep 17 00:00:00 2001 From: Tobias Hieta Date: Thu, 17 Jul 2025 22:33:27 +0200 Subject: [PATCH 237/813] Use Parallel xz for test-suite sources. (#149389) --- llvm/utils/release/export.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/release/export.sh b/llvm/utils/release/export.sh index 66bef82586a34..0ac392cbed7be 100755 --- a/llvm/utils/release/export.sh +++ b/llvm/utils/release/export.sh @@ -123,7 +123,7 @@ export_sources() { tar -C test-suite-$release$rc.src --strip-components=1 -xzf - fi echo "Creating tarball for test-suite ..." 
- tar --sort=name --owner=0 --group=0 \ + XZ_OPT="-T0" tar --sort=name --owner=0 --group=0 \ --pax-option=exthdr.name=%d/PaxHeaders/%f,delete=atime,delete=ctime \ -cJf test-suite-$release$rc.src.tar.xz test-suite-$release$rc.src fi From aea2d5396187cf19447cc80716fd483dd4b634dc Mon Sep 17 00:00:00 2001 From: Jianhui Li Date: Thu, 17 Jul 2025 13:33:39 -0700 Subject: [PATCH 238/813] [MLIR][XeGPU] make offsets optional for create_nd_tdesc (#148335) --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 49 +++++-- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 121 +++++++++++++++++- .../VectorToXeGPU/load-to-xegpu.mlir | 2 +- .../VectorToXeGPU/store-to-xegpu.mlir | 2 +- .../VectorToXeGPU/transfer-read-to-xegpu.mlir | 2 +- .../transfer-write-to-xegpu.mlir | 2 +- mlir/test/Dialect/XeGPU/invalid.mlir | 29 ++++- mlir/test/Dialect/XeGPU/ops.mlir | 45 ++++++- .../Dialect/XeGPU/subgroup-distribute.mlir | 8 +- 9 files changed, 233 insertions(+), 27 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index bd5ea9fd83781..81e25f7537cb0 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -110,23 +110,34 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface Variadic: $offsets, Variadic: $shape, Variadic: $strides, - DenseI64ArrayAttr: $const_offsets, + OptionalAttr: $const_offsets, OptionalAttr: $const_shape, OptionalAttr: $const_strides ); - let results = (outs XeGPU_TensorDesc: $TensorDesc); let assemblyFormat = [{ $source `` - custom($offsets, $const_offsets) - (`,` custom($shape, $const_shape)^ - `,` custom($strides, $const_strides))? + custom($offsets, $const_offsets) + (`,` `shape` `:` custom($shape, $const_shape)^ + `,` `strides``:` custom($strides, $const_strides))? 
attr-dict `:` type($source) `->` qualified(type($TensorDesc)) }]; + let results = (outs XeGPU_TensorDesc: $TensorDesc); + let hasVerifier = 1; let builders = [ + OpBuilder<(ins "Type": $tdesc, "TypedValue": $source)>, + + OpBuilder<(ins "Type": $tdesc, "TypedValue ": $source, + "llvm::ArrayRef": $shape, + "llvm::ArrayRef": $strides)>, + + OpBuilder<(ins "Type": $tdesc, "TypedValue ": $source, + "llvm::ArrayRef": $shape, + "llvm::ArrayRef": $strides)>, + OpBuilder<(ins "Type": $tdesc, "TypedValue": $source, "llvm::ArrayRef": $offsets)>, @@ -163,7 +174,17 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface } ArrayRef getStaticOffsets(){ - return getConstOffsets(); + auto attr = getConstOffsetsAttr(); + + if (attr) + return attr; + + int64_t rank = getMixedSizes().size(); + + setConstOffsets(llvm::SmallVector(rank, 0)); + + attr = getConstOffsetsAttr(); + return attr; } /// wrapper for matching with OffsetSizeAndStrideOpInterface @@ -172,10 +193,16 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface /// and `const_shape` will be used to represent the shape of /// source operand. They overide static shape from source memref type. ArrayRef getStaticSizes() { + /// To be compatible with OffsetSizeAndStrideOpInterface, which expects valid return value and perform checks + static llvm::SmallVector emptyShape; + auto attr = getConstShapeAttr(); - if (llvm::isa(getSourceType()) || attr) + if (attr) return attr; + if (llvm::isa(getSourceType())) + return emptyShape; + auto memrefType = llvm::dyn_cast(getSourceType()); assert(memrefType && "Incorrect use of getStaticSizes"); return memrefType.getShape(); @@ -187,9 +214,15 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface /// and `const_strides` will be used to represent the strides of /// source operand. They overide static strides from source memref type. 
ArrayRef getStaticStrides() { + /// To be compatible with OffsetSizeAndStrideOpInterface, which expects valid return value and perform checks + static llvm::SmallVector emptyStrides; + auto attr = getConstStridesAttr(); - if (llvm::isa(getSourceType()) || attr) + if (attr) return attr; + + if (llvm::isa(getSourceType())) + return emptyStrides; auto memrefType = llvm::dyn_cast(getSourceType()); assert(memrefType && "Incorrect use of getStaticStrides"); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index ef7cd1424e7a4..78cbf884a1911 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/IR/Builders.h" #include "mlir/IR/TypeUtilities.h" +#include "mlir/Interfaces/ViewLikeInterface.h" #include "llvm/Support/Debug.h" @@ -112,6 +113,68 @@ isValidGatherScatterParams(Type maskTy, VectorType valueTy, //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// + +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, TypedValue source) { + [[maybe_unused]] auto ty = source.getType(); + assert(ty.hasStaticShape() && "expecting a memref with static shape"); + + build(builder, state, tdesc, source, ValueRange({}) /* dynamic offsets */, + ValueRange({}) /* empty dynamic shape */, + ValueRange({}) /* empty dynamic strides */, + DenseI64ArrayAttr({}) /* const offsets */, + DenseI64ArrayAttr({}) /* empty const shape*/, + DenseI64ArrayAttr({}) /* empty const strides*/); +} + +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, TypedValue source, + llvm::ArrayRef shape, + llvm::ArrayRef strides) { + assert(shape.size() && strides.size() && shape.size() == strides.size() && + "Shape and strides must be present and of equal size for ui64 " + 
"initialization."); + + llvm::SmallVector staticShape; + llvm::SmallVector staticStrides; + llvm::SmallVector dynamicShape; + llvm::SmallVector dynamicStrides; + + dispatchIndexOpFoldResults(shape, dynamicShape, staticShape); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides); + + auto staticShapeAttr = builder.getDenseI64ArrayAttr(staticShape); + auto staticStridesAttr = builder.getDenseI64ArrayAttr(staticStrides); + + build(builder, state, tdesc, source, ValueRange({}), dynamicShape, + dynamicStrides, builder.getDenseI64ArrayAttr({}), staticShapeAttr, + staticStridesAttr); +} + +void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, + Type tdesc, TypedValue source, + llvm::ArrayRef shape, + llvm::ArrayRef strides) { + assert(shape.size() && strides.size() && shape.size() == strides.size() && + "Shape and strides must be present and of equal size for ui64 " + "initialization."); + + llvm::SmallVector staticShape; + llvm::SmallVector staticStrides; + llvm::SmallVector dynamicShape; + llvm::SmallVector dynamicStrides; + + dispatchIndexOpFoldResults(shape, dynamicShape, staticShape); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides); + + auto staticShapeAttr = builder.getDenseI64ArrayAttr(staticShape); + auto staticStridesAttr = builder.getDenseI64ArrayAttr(staticStrides); + + build(builder, state, tdesc, source, ValueRange({}), dynamicShape, + dynamicStrides, builder.getDenseI64ArrayAttr({}), staticShapeAttr, + staticStridesAttr); +} + void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, Type tdesc, TypedValue source, llvm::ArrayRef offsets) { @@ -125,8 +188,8 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, build(builder, state, tdesc, source, dynamicOffsets /* dynamic offsets */, ValueRange({}) /* empty dynamic shape */, ValueRange({}) /* empty dynamic strides */, - staticOffsets /* const offsets */, {} /* empty const shape*/, - {} /* empty const strides*/); + 
builder.getDenseI64ArrayAttr(staticOffsets) /* const offsets */, + {} /* empty const shape*/, {} /* empty const strides*/); } void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, @@ -197,6 +260,13 @@ LogicalResult CreateNdDescOp::verify() { invalidElemTy |= memrefTy.getElementType() != getElementType(); } + if (llvm::isa(getSourceType())) { + // strides and shape must present for integer source. + if (getMixedStrides().empty() || getMixedSizes().empty()) + return emitOpError("Expecting strides and shape to be present for " + "integer source."); + } + // mismatches among shape, strides, and offsets are // already handeled by OffsetSizeAndStrideOpInterface. // So they are not check here. @@ -221,6 +291,53 @@ LogicalResult CreateNdDescOp::verify() { return success(); } +ParseResult parseOptionalDynamicIndexList( + OpAsmParser &parser, + SmallVectorImpl &values, + DenseI64ArrayAttr &integers, SmallVectorImpl *valueTypes = nullptr, + AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) { + + SmallVector integerVals; + auto parseIntegerOrValue = [&]() { + OpAsmParser::UnresolvedOperand operand; + auto res = parser.parseOptionalOperand(operand); + + if (res.has_value() && succeeded(res.value())) { + values.push_back(operand); + integerVals.push_back(ShapedType::kDynamic); + if (valueTypes && parser.parseColonType(valueTypes->emplace_back())) + return failure(); + } else { + int64_t integer; + if (failed(parser.parseInteger(integer))) + return failure(); + integerVals.push_back(integer); + } + return success(); + }; + + // If the optional values are given there must be left bracket + if (parser.parseOptionalLSquare().succeeded()) { + if (parser.parseCommaSeparatedList(parseIntegerOrValue) || + parser.parseRSquare()) + return parser.emitError(parser.getNameLoc()) + << "expected a list of SSA values or integers"; + integers = parser.getBuilder().getDenseI64ArrayAttr(integerVals); + return success(); + } + + return success(); +} + +void 
printOptionalDynamicIndexList( + OpAsmPrinter &printer, Operation *op, OperandRange values, + ArrayRef integers, TypeRange valueTypes = TypeRange(), + AsmParser::Delimiter delimiter = AsmParser::Delimiter::Square) { + + return printDynamicIndexList(printer, op, values, integers, + /*scalableFlags=*/{}, valueTypes, delimiter); +} + //===----------------------------------------------------------------------===// // XeGPU_PrefetchNdOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir index 4af7061a4f8a3..58719e75b1bde 100644 --- a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir @@ -54,7 +54,7 @@ func.func @load_dynamic_source(%source: memref, // CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] // CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] +// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, // CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] diff --git a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir index d68a02b54e967..0d3da815529e3 100644 --- a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir @@ -56,7 +56,7 @@ func.func @store_dynamic_source(%vec: vector<8x16xf32>, // CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] // CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], 
%[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] +// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, // CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir index c2f760b29afc4..05b41a8233e8c 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir @@ -96,7 +96,7 @@ func.func @load_dynamic_source(%source: memref, // CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] // CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] +// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32 // CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir index 8de6c2283b37c..2bfee03892d10 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir @@ -60,7 +60,7 @@ func.func @store_dynamic_source(%vec: vector<8x16xf32>, // CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] // CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], [%[[DIM_0_STRIDE]], 
%[[DIM_2]], 1] +// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32 // CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 83a98ab0622b7..eb564d55bfd51 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s -split-input-file -verify-diagnostics // ----- -func.func @create_nd_tdesc_vc_1(%src: memref<24xf32>) { +func.func @create_nd_tdesc_1(%src: memref<24xf32>) { // expected-error@+1 {{Expecting the TensorDesc rank is not greater than the ranks of shape, strides, offsets or the memref source}} %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32> return @@ -9,47 +9,62 @@ func.func @create_nd_tdesc_vc_1(%src: memref<24xf32>) { // ----- -func.func @create_nd_tdesc_vc_2(%src: memref<24x32xf32>) { +func.func @create_nd_tdesc_2(%src: memref<24x32xf32>) { // expected-error@+1 {{TensorDesc should have the same element type with the source if it is a memref}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16> return } // ----- -func.func @create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) { +func.func @create_nd_tdesc_3(%src: memref<2x24x32xf32, 3>) { // expected-error@+1 {{SLM is only supported for 1D block tensor}} %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> return } // ----- -func.func @create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) { +func.func @create_nd_tdesc_4(%src: memref<2x24x32xf32, 3>) { // expected-error@+1 {{Memory space mismatch}} %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32> return } // ----- -func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func 
@create_nd_tdesc_5(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- -func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_6(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } // ----- -func.func @create_nd_tdesc_subgroup_1(%src: memref<128x128xf32>) { +func.func @create_nd_tdesc_7(%src: memref<128x128xf32>) { // expected-error@+1 {{cannot distribute [128, 128] using #xegpu.layout}} %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<128x128xf32> -> !xegpu.tensor_desc<128x128xf32, #xegpu.layout> return } +// ----- +func.func @create_nd_tdesc_8(%src: ui64) { + // expected-error@+1 {{'xegpu.create_nd_tdesc' op Expecting strides and shape to be present for integer source}} + %1 = xegpu.create_nd_tdesc %src : ui64-> !xegpu.tensor_desc<128x128xf32> + return +} + +// ----- +func.func @create_nd_tdesc_9(%src: ui64) { + // expected-error@+1 {{expected mixed offsets rank to match mixed sizes rank}} + %1 = xegpu.create_nd_tdesc %src[0, 0] : ui64-> !xegpu.tensor_desc<128x128xf32> + return +} + + // ----- func.func @prefetch_nd_vc_1(%src: memref<24x32xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 3bfe1fa81aa6e..695437354cd7c 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -17,8 +17,8 @@ gpu.func @create_nd_tdesc_1(%src: memref<24x32xf32>) { gpu.func @create_nd_tdesc_2(%src: ui64, %w : index, %h : index, %x : index, %y : index) { //CHECK: %[[C:.*]] = arith.constant 1 : index %c1 = arith.constant 1 : index - // CHECK: 
%[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], [%[[arg2]], %[[arg1]]], [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32> - %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[arg3]], %[[arg4]]], shape : [%[[arg2]], %[[arg1]]], strides : [%[[arg1]], %[[C]]] : ui64 -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[%x, %y], shape:[%h, %w], strides: [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> gpu.return } @@ -62,6 +62,47 @@ gpu.func @create_nd_tdesc_7(%src: memref<8x24x32x48x64xf32>) { } +// CHECK: gpu.func @test_create_nd_tdesc_7(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index, %[[arg5:.*]]: memref<24x32xf32>) +gpu.func @test_create_nd_tdesc_7(%src: ui64, %w : index, %h : index, %x : index, %y : index, %src2: memref<24x32xf32>) { + //CHECK: %[[C:.*]] = arith.constant 1 : index + %c1 = arith.constant 1 : index + + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg5]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %3 = xegpu.create_nd_tdesc %src2 : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + + gpu.return +} + +// CHECK: gpu.func @test_create_nd_tdesc_8(%[[arg0:.*]]: ui64, %[[arg1:.*]]: index, %[[arg2:.*]]: index, %[[arg3:.*]]: index, %[[arg4:.*]]: index) +gpu.func @test_create_nd_tdesc_8(%src: ui64, %w : index, %h : index, %x : index, %y : index) { + + %c1 = arith.constant 1 : index + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0], shape : [%arg2, %arg1], strides : [%arg1, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> + %2 = xegpu.create_nd_tdesc %src, shape : [%h, %w], strides : [%w, %c1] : ui64 -> !xegpu.tensor_desc<8x16xf32> + + gpu.return +} + +// CHECK-LABEL: func @test_create_nd_tdesc_9({{.*}}) + +gpu.func @test_create_nd_tdesc_9(%src: memref, %w : index, %h : index, %x : index, %y : index) { + + %c1 = arith.constant 1 : 
index + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[%arg3, %arg4], shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref -> !xegpu.tensor_desc<8x16xf16> + %1 = xegpu.create_nd_tdesc %src[%x, %y], shape:[%h, %w], strides:[%w, %c1] : memref -> !xegpu.tensor_desc<8x16xf16> + + gpu.return +} + +// CHECK-LABEL: func @test_create_nd_tdesc_10({{.*}}) +gpu.func @test_create_nd_tdesc_10(%src: memref, %w : index, %h : index, %x : index, %y : index) { + %c1 = arith.constant 1 : index + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0], shape : [%arg2, %arg1], strides : [%arg1, %c1] : memref -> !xegpu.tensor_desc<8x16xf16> + %2 = xegpu.create_nd_tdesc %src, shape:[%h, %w], strides:[%w, %c1] : memref -> !xegpu.tensor_desc<8x16xf16> + + gpu.return +} + // CHECK: gpu.func @prefetch_nd(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @prefetch_nd(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 3d91b2269bc4b..0bfbc4a35c03b 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -150,16 +150,16 @@ gpu.module @test { // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: ui64, %[[ARG1:[0-9a-zA-Z]+]]: ui64, %[[ARG2:[0-9a-zA-Z]+]]: index, // CHECK-SAME: %[[ARG3:[0-9a-zA-Z]+]]: index, %[[ARG4:[0-9a-zA-Z]+]]: index, // CHECK-SAME: %[[ARG5:[0-9a-zA-Z]+]]: index, %[[ARG6:[0-9a-zA-Z]+]]: index, %[[ARG7:[0-9a-zA-Z]+]]: index) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][{{.*}}], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> // CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] : !xegpu.tensor_desc<16x16xf16> -> 
vector<16xf16> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], [%[[ARG2]], %[[ARG3]]], [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][{{.*}}], shape : [%[[ARG2]], %[[ARG3]]], strides : [%[[ARG4]], %[[ARG5]]] : ui64 -> !xegpu.tensor_desc<16x16xf16> // CHECK: xegpu.store_nd %[[T1]], %[[T2]] : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> gpu.module @test { gpu.func @create_nd_tdesc_non_memref(%arg0: ui64, %arg1: ui64, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index) { %c0 = arith.constant 0 : index - %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %0 = xegpu.create_nd_tdesc %arg0[%c0, %c0], shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> %1 = xegpu.load_nd %0 {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], [%arg2, %arg3], [%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %2 = xegpu.create_nd_tdesc %arg1[%c0, %c0], shape:[%arg2, %arg3], strides:[%arg4, %arg5] : ui64 -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> xegpu.store_nd %1, %2 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> gpu.return } From 61a45d20cfe7f93ec1b73dc2dd776f493af2a7cc Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Thu, 17 Jul 2025 13:40:51 -0700 Subject: [PATCH 239/813] [IR2Vec][NFC] Add helper methods for numeric ID mapping in Vocabulary (#149212) Add helper methods to IR2Vec's Vocabulary class for numeric ID mapping and vocabulary size calculation. These APIs will be useful in triplet generation for `llvm-ir2vec` tool (See #149214). 
(Tracking issue - #141817) --- llvm/include/llvm/Analysis/IR2Vec.h | 9 ++++ llvm/lib/Analysis/IR2Vec.cpp | 20 +++++++- llvm/unittests/Analysis/IR2VecTest.cpp | 63 ++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index 3d7edf08c8807..d87457cac7642 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -170,6 +170,10 @@ class Vocabulary { unsigned getDimension() const; size_t size() const; + static size_t expectedSize() { + return MaxOpcodes + MaxTypeIDs + MaxOperandKinds; + } + /// Helper function to get vocabulary key for a given Opcode static StringRef getVocabKeyForOpcode(unsigned Opcode); @@ -182,6 +186,11 @@ class Vocabulary { /// Helper function to classify an operand into OperandKind static OperandKind getOperandKind(const Value *Op); + /// Helpers to return the IDs of a given Opcode, TypeID, or OperandKind + static unsigned getNumericID(unsigned Opcode); + static unsigned getNumericID(Type::TypeID TypeID); + static unsigned getNumericID(const Value *Op); + /// Accessors to get the embedding for a given entity. 
const ir2vec::Embedding &operator[](unsigned Opcode) const; const ir2vec::Embedding &operator[](Type::TypeID TypeId) const; diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 898bf5b202feb..95f30fd3f4275 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -215,7 +215,7 @@ Vocabulary::Vocabulary(VocabVector &&Vocab) : Vocab(std::move(Vocab)), Valid(true) {} bool Vocabulary::isValid() const { - return Vocab.size() == (MaxOpcodes + MaxTypeIDs + MaxOperandKinds) && Valid; + return Vocab.size() == Vocabulary::expectedSize() && Valid; } size_t Vocabulary::size() const { @@ -324,8 +324,24 @@ Vocabulary::OperandKind Vocabulary::getOperandKind(const Value *Op) { return OperandKind::VariableID; } +unsigned Vocabulary::getNumericID(unsigned Opcode) { + assert(Opcode >= 1 && Opcode <= MaxOpcodes && "Invalid opcode"); + return Opcode - 1; // Convert to zero-based index +} + +unsigned Vocabulary::getNumericID(Type::TypeID TypeID) { + assert(static_cast(TypeID) < MaxTypeIDs && "Invalid type ID"); + return MaxOpcodes + static_cast(TypeID); +} + +unsigned Vocabulary::getNumericID(const Value *Op) { + unsigned Index = static_cast(getOperandKind(Op)); + assert(Index < MaxOperandKinds && "Invalid OperandKind"); + return MaxOpcodes + MaxTypeIDs + Index; +} + StringRef Vocabulary::getStringKey(unsigned Pos) { - assert(Pos < MaxOpcodes + MaxTypeIDs + MaxOperandKinds && + assert(Pos < Vocabulary::expectedSize() && "Position out of bounds in vocabulary"); // Opcode if (Pos < MaxOpcodes) diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp index cb6d633306a81..7c9a5464bfe1d 100644 --- a/llvm/unittests/Analysis/IR2VecTest.cpp +++ b/llvm/unittests/Analysis/IR2VecTest.cpp @@ -396,6 +396,69 @@ TEST(IR2VecVocabularyTest, DummyVocabTest) { } } +TEST(IR2VecVocabularyTest, NumericIDMap) { + // Test getNumericID for opcodes + EXPECT_EQ(Vocabulary::getNumericID(1u), 0u); + 
EXPECT_EQ(Vocabulary::getNumericID(13u), 12u); + EXPECT_EQ(Vocabulary::getNumericID(MaxOpcodes), MaxOpcodes - 1); + + // Test getNumericID for Type IDs + EXPECT_EQ(Vocabulary::getNumericID(Type::VoidTyID), + MaxOpcodes + static_cast(Type::VoidTyID)); + EXPECT_EQ(Vocabulary::getNumericID(Type::HalfTyID), + MaxOpcodes + static_cast(Type::HalfTyID)); + EXPECT_EQ(Vocabulary::getNumericID(Type::FloatTyID), + MaxOpcodes + static_cast(Type::FloatTyID)); + EXPECT_EQ(Vocabulary::getNumericID(Type::IntegerTyID), + MaxOpcodes + static_cast(Type::IntegerTyID)); + EXPECT_EQ(Vocabulary::getNumericID(Type::PointerTyID), + MaxOpcodes + static_cast(Type::PointerTyID)); + + // Test getNumericID for Value operands + LLVMContext Ctx; + Module M("TestM", Ctx); + FunctionType *FTy = + FunctionType::get(Type::getVoidTy(Ctx), {Type::getInt32Ty(Ctx)}, false); + Function *F = Function::Create(FTy, Function::ExternalLinkage, "testFunc", M); + + // Test Function operand + EXPECT_EQ(Vocabulary::getNumericID(F), + MaxOpcodes + MaxTypeIDs + 0u); // Function = 0 + + // Test Constant operand + Constant *C = ConstantInt::get(Type::getInt32Ty(Ctx), 42); + EXPECT_EQ(Vocabulary::getNumericID(C), + MaxOpcodes + MaxTypeIDs + 2u); // Constant = 2 + + // Test Pointer operand + BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F); + AllocaInst *PtrVal = new AllocaInst(Type::getInt32Ty(Ctx), 0, "ptr", BB); + EXPECT_EQ(Vocabulary::getNumericID(PtrVal), + MaxOpcodes + MaxTypeIDs + 1u); // Pointer = 1 + + // Test Variable operand (function argument) + Argument *Arg = F->getArg(0); + EXPECT_EQ(Vocabulary::getNumericID(Arg), + MaxOpcodes + MaxTypeIDs + 3u); // Variable = 3 +} + +#if GTEST_HAS_DEATH_TEST +#ifndef NDEBUG +TEST(IR2VecVocabularyTest, NumericIDMapInvalidInputs) { + // Test invalid opcode IDs + EXPECT_DEATH(Vocabulary::getNumericID(0u), "Invalid opcode"); + EXPECT_DEATH(Vocabulary::getNumericID(MaxOpcodes + 1), "Invalid opcode"); + + // Test invalid type IDs + 
EXPECT_DEATH(Vocabulary::getNumericID(static_cast(MaxTypeIDs)), + "Invalid type ID"); + EXPECT_DEATH( + Vocabulary::getNumericID(static_cast(MaxTypeIDs + 10)), + "Invalid type ID"); +} +#endif // NDEBUG +#endif // GTEST_HAS_DEATH_TEST + TEST(IR2VecVocabularyTest, StringKeyGeneration) { EXPECT_EQ(Vocabulary::getStringKey(0), "Ret"); EXPECT_EQ(Vocabulary::getStringKey(12), "Add"); From 202f30ede1bc4ceb2480db3c93440d999da9f24b Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Thu, 17 Jul 2025 13:43:53 -0700 Subject: [PATCH 240/813] [IR2Vec][llvm-ir2vec] Add support for reading from stdin (#149213) Add support for reading LLVM IR from stdin in the llvm-ir2vec tool. This allows usage of the tool in pipelines where LLVM IR is generated or transformed on-the-fly just like the other llvm tools. Useful in upcoming PRs. (Tracking issue - #141817) --- llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp index 4e88282e85c14..e1e5fad13f413 100644 --- a/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp +++ b/llvm/tools/llvm-ir2vec/llvm-ir2vec.cpp @@ -48,10 +48,10 @@ namespace ir2vec { static cl::OptionCategory IR2VecToolCategory("IR2Vec Tool Options"); -static cl::opt InputFilename(cl::Positional, - cl::desc(""), - cl::Required, - cl::cat(IR2VecToolCategory)); +static cl::opt + InputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), cl::cat(IR2VecToolCategory)); static cl::opt OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename"), @@ -287,7 +287,7 @@ int main(int argc, char **argv) { if (Mode == TripletMode && Level.getNumOccurrences() > 0) errs() << "Warning: --level option is ignored in triplet mode\n"; - // Parse the input LLVM IR file + // Parse the input LLVM IR file or stdin SMDiagnostic Err; LLVMContext Context; std::unique_ptr M = parseIRFile(InputFilename, 
Err, Context); From 64c7e7efebcdd5bccae4a44d414ef686357fc509 Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:03:21 -0700 Subject: [PATCH 241/813] Add tools/llvm-ir2vec to pr-subscribes-mlgo (#149405) --- .github/new-prs-labeler.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index b05e9c6c56ed0..d4cf869b023a1 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -717,6 +717,7 @@ mlgo: - llvm/lib/Analysis/IR2Vec.cpp - llvm/lib/Analysis/models/** - llvm/test/Analysis/IR2Vec/** + - llvm/tools/llvm-ir2vec/** tools:llvm-exegesis: - llvm/tools/llvm-exegesis/** From 7e220630d28741b30bb4423e34e76654c7d9d1f7 Mon Sep 17 00:00:00 2001 From: Jian Cai Date: Thu, 17 Jul 2025 14:13:28 -0700 Subject: [PATCH 242/813] [mlir][docs] Rename OpTrait to Trait in ODS doc (#148276) This makes the doc consistent with the code base. --- mlir/docs/DefiningDialects/Operations.md | 8 ++++---- mlir/include/mlir/IR/OpBase.td | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/docs/DefiningDialects/Operations.md b/mlir/docs/DefiningDialects/Operations.md index 2225329ff830b..f988bebea1223 100644 --- a/mlir/docs/DefiningDialects/Operations.md +++ b/mlir/docs/DefiningDialects/Operations.md @@ -89,7 +89,7 @@ their semantics via a special [TableGen backend][TableGenBackend]: help of the following constructs. * The `Dialect` class: Operations belonging to one logical group are placed in the same dialect. The `Dialect` class contains dialect-level information. -* The `OpTrait` class hierarchy: They are used to specify special properties +* The `Trait` class hierarchy: They are used to specify special properties and constraints of the operation, including whether the operation has side effect or whether its output has the same shape as the input. 
* The `ins`/`outs` marker: These are two special markers builtin to the @@ -436,7 +436,7 @@ various traits in the `mlir::OpTrait` namespace. Both operation traits, [interfaces](../Interfaces.md/#utilizing-the-ods-framework), and constraints involving multiple operands/attributes/results are provided as the third template parameter to the `Op` class. They should be deriving from -the `OpTrait` class. See [Constraints](#constraints) for more information. +the `Trait` class. See [Constraints](#constraints) for more information. ### Builder methods @@ -1355,7 +1355,7 @@ results. These constraints should be specified as the `Op` class template parameter as described in [Operation traits and constraints](#operation-traits-and-constraints). -Multi-entity constraints are modeled as `PredOpTrait` (a subclass of `OpTrait`) +Multi-entity constraints are modeled as `PredOpTrait` (a subclass of `Trait`) in [`OpBase.td`][OpBase].A bunch of constraint primitives are provided to help specification. See [`OpBase.td`][OpBase] for the complete list. @@ -1366,7 +1366,7 @@ commutative or not, whether is a terminator, etc. These constraints should be specified as the `Op` class template parameter as described in [Operation traits and constraints](#operation-traits-and-constraints). -Traits are modeled as `NativeOpTrait` (a subclass of `OpTrait`) in +Traits are modeled as `NativeTrait` (a subclass of `Trait`) in [`OpBase.td`][OpBase]. They are backed and will be translated into the corresponding C++ `mlir::OpTrait` classes. 
diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td index 43ef28624fb19..9e5fb5659a22b 100644 --- a/mlir/include/mlir/IR/OpBase.td +++ b/mlir/include/mlir/IR/OpBase.td @@ -22,7 +22,7 @@ include "mlir/IR/Utils.td" include "mlir/IR/AttrTypeBase.td" //===----------------------------------------------------------------------===// -// OpTrait definitions +// *OpTrait definitions //===----------------------------------------------------------------------===// // A trait that describes the structure of operation will be marked with From 8de61eb01c9752f0488ed8b52d01fe3d0873ff6c Mon Sep 17 00:00:00 2001 From: Tomohiro Kashiwada Date: Fri, 18 Jul 2025 06:16:08 +0900 Subject: [PATCH 243/813] [Support/BLAKE3] quick fix for Cygwin build (#148635) BLAKE3 1.8.2 ( imported in d2ad63a193216d008c8161879a59c5f42e0125cc ) fails to build for the Cygwin target. see: https://github.com/BLAKE3-team/BLAKE3/issues/494 As a temporary workaround, add `&& !defined(__CYGWIN__)` to BLAKE3 locally. 
resolves https://github.com/llvm/llvm-project/issues/148365 --- llvm/lib/Support/BLAKE3/blake3_dispatch.c | 2 +- llvm/lib/Support/BLAKE3/blake3_impl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/BLAKE3/blake3_dispatch.c b/llvm/lib/Support/BLAKE3/blake3_dispatch.c index d00580fe35195..19918aa708b2f 100644 --- a/llvm/lib/Support/BLAKE3/blake3_dispatch.c +++ b/llvm/lib/Support/BLAKE3/blake3_dispatch.c @@ -236,7 +236,7 @@ void blake3_xof_many(const uint32_t cv[8], #if defined(IS_X86) const enum cpu_feature features = get_cpu_features(); MAYBE_UNUSED(features); -#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512) +#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(BLAKE3_NO_AVX512) if (features & AVX512VL) { blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks); return; diff --git a/llvm/lib/Support/BLAKE3/blake3_impl.h b/llvm/lib/Support/BLAKE3/blake3_impl.h index deed079e468a5..dd71e729f208f 100644 --- a/llvm/lib/Support/BLAKE3/blake3_impl.h +++ b/llvm/lib/Support/BLAKE3/blake3_impl.h @@ -324,7 +324,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out); -#if !defined(_WIN32) +#if !defined(_WIN32) && !defined(__CYGWIN__) LLVM_LIBRARY_VISIBILITY void blake3_xof_many_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], From 422a250b0b8fce3e7ff20c400b5ab2837a7baeae Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 17 Jul 2025 14:18:52 -0700 Subject: [PATCH 244/813] [AMDGPU] add tests for Change FLAT SADDR to VADDR form in moveToVALU. NFC. 
(#149392) --- llvm/lib/Target/AMDGPU/FLATInstructions.td | 1 + .../AMDGPU/move-load-addr-to-valu-flat.mir | 357 ++++++++++++++++++ 2 files changed, 358 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu-flat.mir diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 74632c71f0f95..ff57a12561ca1 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -200,6 +200,7 @@ class VFLAT_Real op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : let Inst{95-72} = !if(ps.has_offset, offset, ?); } +// TODO: Rename to FlatSaddrTable, it now handles both global and flat GVS addressing mode. class GlobalSaddrTable { bit IsSaddr = is_saddr; string SaddrOp = Name; diff --git a/llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu-flat.mir b/llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu-flat.mir new file mode 100644 index 0000000000000..95ccf6c0a4a33 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/move-load-addr-to-valu-flat.mir @@ -0,0 +1,357 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=si-fix-sgpr-copies -o - %s | FileCheck --check-prefix=GCN %s + +--- +name: flat_load_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: flat_load_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %7, %bb.1 + ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[PHI]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 
+ ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:sreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %4:vgpr_32 = FLAT_LOAD_DWORD_SADDR %1, %3, 0, 0, implicit $exec, implicit $flat_scr + %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc + S_CMP_LG_U64 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... 
+ +--- +name: flat_load_saddr_to_valu_non_zero_vaddr +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: flat_load_saddr_to_valu_non_zero_vaddr + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %7, %bb.1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI]].sub0, implicit $exec + ; GCN-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI]].sub1, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GCN-NEXT: [[FLAT_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD_SADDR [[REG_SEQUENCE]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE1]], 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 
+ bb.0: + liveins: $vgpr0_vgpr1 + %0:sreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1 + %3:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %4:vgpr_32 = FLAT_LOAD_DWORD_SADDR %1, %3, 0, 0, implicit $exec, implicit $flat_scr + %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc + S_CMP_LG_U64 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... + + +--- +name: flat_load_saddr_to_valu_undef_vaddr +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: flat_load_saddr_to_valu_undef_vaddr + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %7, %bb.1 + ; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI]].sub0, implicit $exec + ; GCN-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI]].sub1, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1 + ; GCN-NEXT: [[FLAT_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD_SADDR [[REG_SEQUENCE]], undef %4:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 
[[REG_SEQUENCE1]], 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:sreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1 + %4:vgpr_32 = FLAT_LOAD_DWORD_SADDR %1, undef %3:vgpr_32, 0, 0, implicit $exec, implicit $flat_scr + %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc + S_CMP_LG_U64 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... + +--- +name: flat_store_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: flat_store_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %7, %bb.1 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: FLAT_STORE_DWORD [[PHI]], [[DEF]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]], 
implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:sreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %4:vgpr_32 = IMPLICIT_DEF + FLAT_STORE_DWORD_SADDR %3, %4, %1, 0, 0, implicit $exec, implicit $flat_scr + %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc + S_CMP_LG_U64 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... + +--- +name: flat_atomic_noret_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: flat_atomic_noret_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %6, %bb.1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: FLAT_ATOMIC_ADD [[PHI]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]], 
implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:sreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + FLAT_ATOMIC_ADD_SADDR %3, %3, %1, 0, 0, implicit $exec, implicit $flat_scr + %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc + S_CMP_LG_U64 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... + +--- +name: flat_atomic_rtn_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: flat_atomic_rtn_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vreg_64_align2 = COPY $vgpr0_vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vreg_64_align2 = PHI [[COPY]], %bb.0, %7, %bb.1 + ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: [[FLAT_ATOMIC_ADD_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_ADD_RTN [[PHI]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub0 + ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[PHI]].sub1 + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY1]], 1, implicit $exec + ; GCN-NEXT: [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY2]], 0, implicit $exec + ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[V_AND_B32_e64_]], %subreg.sub0, [[V_AND_B32_e64_1]], %subreg.sub1 + ; GCN-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], 0, implicit $exec + ; GCN-NEXT: [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY 
[[REG_SEQUENCE]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U64_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0_vgpr1 + %0:sreg_64 = COPY $vgpr0_vgpr1 + + bb.1: + %1:sreg_64_xexec_xnull = PHI %0, %bb.0, %2, %bb.1 + %3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %4:vgpr_32 = FLAT_ATOMIC_ADD_SADDR_RTN %3, %3, %1, 0, 0, implicit $exec, implicit $flat_scr + %2:sreg_64 = S_AND_B64 %1, 1, implicit-def $scc + S_CMP_LG_U64 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... + +--- +name: scratch_load_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: scratch_load_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %6, %bb.1 + ; GCN-NEXT: [[SCRATCH_LOAD_DWORD:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD [[PHI]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI]], 1, implicit $exec + ; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 [[V_AND_B32_e64_]], 0, implicit $exec + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U32_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0 + %0:sgpr_32 = COPY $vgpr0 + + bb.1: + %1:sgpr_32 = PHI %0, %bb.0, %2, %bb.1 + %4:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %1, 0, 0, implicit $exec, implicit $flat_scr + %2:sgpr_32 = S_AND_B32 %1, 1, implicit-def 
$scc + S_CMP_LG_U32 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... + +--- +name: scratch_store_saddr_to_valu +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: scratch_store_saddr_to_valu + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %6, %bb.1 + ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GCN-NEXT: SCRATCH_STORE_DWORD [[DEF]], [[PHI]], 0, 0, implicit $exec, implicit $flat_scr + ; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[PHI]], 1, implicit $exec + ; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U32_e64 [[V_AND_B32_e64_]], 0, implicit $exec + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_AND_B32_e64_]], implicit $exec + ; GCN-NEXT: $vcc_lo = S_AND_B32 $exec_lo, [[V_CMP_NE_U32_e64_]], implicit-def $scc + ; GCN-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit $vcc_lo + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0 + %0:sgpr_32 = COPY $vgpr0 + + bb.1: + %1:sgpr_32 = PHI %0, %bb.0, %2, %bb.1 + %4:vgpr_32 = IMPLICIT_DEF + SCRATCH_STORE_DWORD_SADDR %4, %1, 0, 0, implicit $exec, implicit $flat_scr + %2:sgpr_32 = S_AND_B32 %1, 1, implicit-def $scc + S_CMP_LG_U32 %2, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit $scc + + bb.2: + S_ENDPGM 0 +... 
From be3d614cc13f016b16634e18e10caed508d183d2 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 17 Jul 2025 23:23:04 +0200 Subject: [PATCH 245/813] [libc++] Fix hash_multi{map,set}::insert (#149290) --- libcxx/include/ext/hash_map | 4 +-- libcxx/include/ext/hash_set | 4 +-- .../gnu/hash_multimap/insert.pass.cpp | 35 +++++++++++++++++++ .../gnu/hash_multiset/insert.pass.cpp | 35 +++++++++++++++++++ 4 files changed, 74 insertions(+), 4 deletions(-) create mode 100644 libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp create mode 100644 libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp diff --git a/libcxx/include/ext/hash_map b/libcxx/include/ext/hash_map index d6b92204f4376..46815eaffa8bd 100644 --- a/libcxx/include/ext/hash_map +++ b/libcxx/include/ext/hash_map @@ -744,7 +744,7 @@ public: _LIBCPP_HIDE_FROM_ABI const_iterator begin() const { return __table_.begin(); } _LIBCPP_HIDE_FROM_ABI const_iterator end() const { return __table_.end(); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_unique(__x); } + _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_multi(__x); } _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator, const value_type& __x) { return insert(__x); } template _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last); @@ -831,7 +831,7 @@ template template inline void hash_multimap<_Key, _Tp, _Hash, _Pred, _Alloc>::insert(_InputIterator __first, _InputIterator __last) { for (; __first != __last; ++__first) - __table_.__emplace_unique(*__first); + __table_.__emplace_multi(*__first); } template diff --git a/libcxx/include/ext/hash_set b/libcxx/include/ext/hash_set index 7fd5df24ed3a8..62a7a0dbcffb9 100644 --- a/libcxx/include/ext/hash_set +++ b/libcxx/include/ext/hash_set @@ -458,7 +458,7 @@ public: _LIBCPP_HIDE_FROM_ABI const_iterator begin() const { return __table_.begin(); } _LIBCPP_HIDE_FROM_ABI const_iterator end() const { 
return __table_.end(); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_unique(__x); } + _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_multi(__x); } _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator, const value_type& __x) { return insert(__x); } template _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last); @@ -543,7 +543,7 @@ template template inline void hash_multiset<_Value, _Hash, _Pred, _Alloc>::insert(_InputIterator __first, _InputIterator __last) { for (; __first != __last; ++__first) - __table_.__emplace_unique(*__first); + __table_.__emplace_multi(*__first); } template diff --git a/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp b/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp new file mode 100644 index 0000000000000..ea80359f1fea2 --- /dev/null +++ b/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated + +// hash_multimap::insert + +#include +#include + +int main(int, char**) { + __gnu_cxx::hash_multimap map; + + map.insert(std::make_pair(1, 1)); + map.insert(std::make_pair(1, 1)); + + assert(map.size() == 2); + assert(map.equal_range(1).first == map.begin()); + assert(map.equal_range(1).second == map.end()); + + std::pair arr[] = {std::make_pair(1, 1), std::make_pair(1, 1)}; + + map.insert(arr, arr + 2); + + assert(map.size() == 4); + assert(map.equal_range(1).first == map.begin()); + assert(map.equal_range(1).second == map.end()); + + return 0; +} diff --git a/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp b/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp new file mode 100644 index 0000000000000..1a60cac158a40 --- /dev/null +++ b/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated + +// hash_multimap::insert + +#include +#include + +int main(int, char**) { + __gnu_cxx::hash_multiset map; + + map.insert(1); + map.insert(1); + + assert(map.size() == 2); + assert(map.equal_range(1).first == map.begin()); + assert(map.equal_range(1).second == map.end()); + + int arr[] = {1, 1}; + + map.insert(arr, arr + 2); + + assert(map.size() == 4); + assert(map.equal_range(1).first == map.begin()); + assert(map.equal_range(1).second == map.end()); + + return 0; +} From 70046cd2b5f4302146d7ea79497b84748f606c77 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Thu, 17 Jul 2025 14:26:09 -0700 Subject: [PATCH 246/813] AMDGPU: Remove the dot4 test in insert-delay-alu-wmma-xdl.mir, NFC (#149375) This is irrelevant, and caused a failure in downstream. Fixes: SWDEV-544025 --- .../AMDGPU/insert-delay-alu-wmma-xdl.mir | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir index 7c3170d8d1e9f..0abf34797a5e7 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-wmma-xdl.mir @@ -65,20 +65,3 @@ body: | $vgpr12 = V_EXP_F32_e32 $vgpr12, implicit $exec, implicit $mode $vgpr13 = V_ADD_U32_e32 $vgpr13, $vgpr8, implicit $exec ... 
- ---- -name: dot_xdl_dep_2 -tracksRegLiveness: true -body: | - bb.0: - ; CHECK-LABEL: {{^}}dot_xdl_dep_2: - ; CHECK: %bb.0: - ; CHECK-NEXT: v_dot4_i32_iu8 v0, s2, s3, v0 neg_lo:[1,1,0] - ; CHECK-NEXT: v_dot4_i32_iu8 v1, s2, s3, v2 neg_lo:[1,1,0] - ; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) - ; CHECK-NEXT: v_add_nc_u32_e32 v2, v0, v0 - liveins: $vgpr0, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 - $vgpr0 = V_DOT4_I32_IU8 9, $sgpr2, 9, $sgpr3, 8, $vgpr0, 0, 0, 0, implicit $exec - $vgpr1 = V_DOT4_I32_IU8 9, $sgpr2, 9, $sgpr3, 8, $vgpr2, 0, 0, 0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec -... From b8264293a714347a77f150b109cfdde8665eeadc Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 17 Jul 2025 22:36:25 +0100 Subject: [PATCH 247/813] [lldb][test] TestChildDepthTruncation: don't force DWARF Fixes test on Windows. Same reason as https://github.com/llvm/llvm-project/pull/149322 --- lldb/test/Shell/Settings/TestChildDepthTruncation.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/Shell/Settings/TestChildDepthTruncation.test b/lldb/test/Shell/Settings/TestChildDepthTruncation.test index e0e6cda516655..12f5661600ae7 100644 --- a/lldb/test/Shell/Settings/TestChildDepthTruncation.test +++ b/lldb/test/Shell/Settings/TestChildDepthTruncation.test @@ -2,7 +2,7 @@ # when target.max-children-depth wasn't explicitly set. 
# RUN: split-file %s %t -# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out +# RUN: %clang_host -g %t/main.cpp -o %t.out # RUN: %lldb -x -b -s %t/dwim-commands.input %t.out -o exit 2>&1 \ # RUN: | FileCheck %s --check-prefix=DWIM # From 6b29ee9d9a8dc6eaf1f47b4d66b4c569e00a112f Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Thu, 17 Jul 2025 23:42:25 +0200 Subject: [PATCH 248/813] [mlir][amdgpu] Properly handle mismatching memref ranks in `amdgpu.gather_to_lds` (#149407) This op doesn't have any rank or indices restrictions on src/dst memrefs, but was using `SameVariadicOperandSize` which was causing issues. Also fix some other issues while we at it. --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 16 ++++++++-------- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 4 ++++ mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir | 7 +++++-- mlir/test/Dialect/AMDGPU/invalid.mlir | 8 ++++++++ mlir/test/Dialect/AMDGPU/ops.mlir | 11 +++++++++++ 5 files changed, 36 insertions(+), 10 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index eadb5d9326798..80959ffbaf426 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -127,7 +127,7 @@ def AMDGPU_ScaledExtPackedOp let summary = "Extend a vector of packed floating point values"; let description = [{ - Extend and scale two packed floats in `source[index]` to two floats and + Extend and scale two packed floats in `source[index]` to two floats and return them. This rather unusual signature arises from the fact that AMD GPUs cannot @@ -861,7 +861,7 @@ def AMDGPU_WMMAOp : } def AMDGPU_GatherToLDSOp : - AMDGPU_Op<"gather_to_lds", [SameVariadicOperandSize]>, + AMDGPU_Op<"gather_to_lds", [AttrSizedOperandSegments]>, Arguments<(ins Arg:$src, Variadic:$srcIndices, @@ -966,13 +966,13 @@ def AMDGPU_ScaledMFMAOp : order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on). 
This wrapper takes inspiration from `amdgpu.mfma`, but has some key differences: - - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and - fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile - size. - - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp` + - `amdgpu.scaled_mfma` operates on fp4 (f4E2M1FN), fp6 (f6E2M3FN and f6E3M2FN) and + fp8 (f8E4M3FN and f8E5M2) types using either M=N=16, K=128 or M=N=32, K=64 as their tile + size. + - `amdgpu.scaled_mfma` does not support broadcasting. So, `cbsz`, `abid`, and `blgp` are omitted from this wrapper. - - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for - double-precision operations on gfx94x and so are not included here. + - The `negateA`, `negateB`, and `negateC` flags in `amdgpu.mfma` are only supported for + double-precision operations on gfx94x and so are not included here. }]; let assemblyFormat = [{ `(` $scalesA `[` $scalesIdxA `]` `*` $sourceA `)` `*` `(` $scalesB `[` $scalesIdxB `]` `*` $sourceB `)` `+` $destC diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index acaf6a2f8792a..88c2eb3326d96 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -134,6 +134,8 @@ static bool hasGlobalMemorySpace(Attribute memorySpace) { } static bool hasWorkgroupMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; if (auto intMemorySpace = dyn_cast(memorySpace)) return intMemorySpace.getInt() == 3; if (auto gpuMemorySpace = dyn_cast(memorySpace)) @@ -142,6 +144,8 @@ static bool hasWorkgroupMemorySpace(Attribute memorySpace) { } static bool hasFatRawBufferMemorySpace(Attribute memorySpace) { + if (!memorySpace) + return false; if (auto intMemorySpace = dyn_cast(memorySpace)) return intMemorySpace.getInt() == 7; if (auto gpuMemorySpace = dyn_cast(memorySpace)) 
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir index 77103fa5c25f1..e48c94195ea56 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/load_lds.mlir @@ -127,12 +127,15 @@ func.func @global_load_to_rocdl_dynamic_indices(%global : memref<512xi32, #gpu_g // CHECK: %[[GLOBAL_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[ALLOC:.*]] = memref.alloc() // CHECK: %[[LDS_DESC:.*]] = builtin.unrealized_conversion_cast %[[ALLOC]] + // CHECK: %[[C0:.*]] = arith.constant 0 : index + // CHECK: %[[C0_I64:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64 // CHECK: %[[GLOBAL_BASE:.*]] = llvm.extractvalue %[[GLOBAL_DESC]][1] // CHECK: %[[GLOBAL_PTR:.*]] = llvm.getelementptr %[[GLOBAL_BASE]][%[[SRCIDX_CAST]]] // CHECK: %[[LDS_BASE:.*]] = llvm.extractvalue %[[LDS_DESC]][1] // CHECK: %[[C64:.*]] = llvm.mlir.constant(64 : index) : i64 // CHECK: %[[DSTIDX:.*]] = llvm.mul %[[DSTIDX_CAST]], %[[C64]] : i64 - // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX]]] + // CHECK: %[[DSTIDX1:.*]] = llvm.add %[[DSTIDX]], %[[C0_I64]] : i64 + // CHECK: %[[LDS_PTR:.*]] = llvm.getelementptr %[[LDS_BASE]][%[[DSTIDX1]]] // CHECK: rocdl.load.to.lds %[[GLOBAL_PTR]], %[[LDS_PTR]], 4 %alloc = memref.alloc() : memref<4x64xi32, #gpu_lds_addrspace> %c0 = arith.constant 0 : index @@ -151,7 +154,7 @@ func.func @fat_buffer_load_to_rocdl_f32(%global : memref<128x72xf32, #amdgpu_fat // CHECK: %[[BUFFER_DESC:.*]] = builtin.unrealized_conversion_cast %[[ARG0]] // CHECK: %[[C0:.*]] = arith.constant 0 : index - // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %c0 : index to i64 + // CHECK: %[[IC0:.*]] = builtin.unrealized_conversion_cast %[[C0]] : index to i64 // CHECK: %[[C12:.*]] = arith.constant 12 : index // CHECK: %[[IC12:.*]] = builtin.unrealized_conversion_cast %[[C12]] // CHECK: %[[C32:.*]] = arith.constant 32 : index diff --git 
a/mlir/test/Dialect/AMDGPU/invalid.mlir b/mlir/test/Dialect/AMDGPU/invalid.mlir index 6d55583f8bc7c..0d2fd245af9e2 100644 --- a/mlir/test/Dialect/AMDGPU/invalid.mlir +++ b/mlir/test/Dialect/AMDGPU/invalid.mlir @@ -222,3 +222,11 @@ func.func @transpose_load_vector_size_i8(%idx1 : index, %idx2 : index, %mem : me %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xi6, 3> -> vector<8xi6> func.return %0 : vector<8xi6> } + +// ----- + +func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 : memref<32xf16>) { + // expected-error@+1 {{'amdgpu.gather_to_lds' op destination memory address space must be Workgroup}} + amdgpu.gather_to_lds %mem1[%idx1], %mem2[%idx1] : vector<2xf16>, memref<32xf16>, memref<32xf16> + func.return +} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 51f3bbd9ae45c..5559ac8f1a5c3 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -493,3 +493,14 @@ func.func @transpose_load(%idx1 : index, %idx2 : index, %mem : memref<128x32xf16 %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xf16, 3> -> vector<4xf16> func.return %0 : vector<4xf16> } + +// CHECK-LABEL: func @gather_to_lds +func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %mem2 : memref<32x32xf16>, %smem1 : memref<32xf16, #gpu.address_space>, %smem2 : memref<32x32xf16, #gpu.address_space>) { + // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}] + // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}}[%{{.*}}] + // CHECK: amdgpu.gather_to_lds %{{.*}}[%{{.*}}], %{{.*}}[%{{.*}}, %{{.*}}] + amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32x32xf16>, memref<32x32xf16, #gpu.address_space> + amdgpu.gather_to_lds %mem2[%idx1, %idx2], %smem1[%idx1] : vector<2xf16>, memref<32x32xf16>, memref<32xf16, #gpu.address_space> + amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : 
vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space> + func.return +} From 25619c406ee2590f7b18364a5c88c7492bba3508 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 17 Jul 2025 14:45:26 -0700 Subject: [PATCH 249/813] [AMDGPU] Remove unused VGLOBAL_Real_AllAddr_gfx12. NFC. (#149398) --- llvm/lib/Target/AMDGPU/FLATInstructions.td | 7 ------- 1 file changed, 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index ff57a12561ca1..c84962b52befd 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -2947,13 +2947,6 @@ multiclass VFLAT_Real_AllAddr_gfx12 op, defm _SADDR : VFLAT_Real_gfx12; } -multiclass VGLOBAL_Real_AllAddr_gfx12 op, - string name = get_FLAT_ps.Mnemonic, - string alias = name> : - VFLAT_Real_Base_gfx12 { - defm _SADDR : VFLAT_Real_gfx12; -} - multiclass VGLOBAL_Real_AllAddr_gfx1200 op> { let AssemblerPredicate = isGFX12Not12_50 in { defm "" : VFLAT_Real_gfx12; From 5d78332e8aa8c2542aae80fd580c8137a060cb6b Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Thu, 17 Jul 2025 14:46:24 -0700 Subject: [PATCH 250/813] Add llvm-ir2vec.rst to pr-subscribes-mlgo (#149412) --- .github/new-prs-labeler.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index d4cf869b023a1..7905f762bf8e8 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -718,6 +718,7 @@ mlgo: - llvm/lib/Analysis/models/** - llvm/test/Analysis/IR2Vec/** - llvm/tools/llvm-ir2vec/** + - llvm/docs/CommandGuide/llvm-ir2vec.rst tools:llvm-exegesis: - llvm/tools/llvm-exegesis/** From 689e95817e1671b0ed6c7f2031fbcf2f81632978 Mon Sep 17 00:00:00 2001 From: "Deric C." 
Date: Thu, 17 Jul 2025 14:46:45 -0700 Subject: [PATCH 251/813] [DirectX] Add a GEP to scalar load/store on globals and remove incorrect assertion (#149191) Fixes #149180 This PR removes an assertion that triggered on valid IR. It has been replaced with an if statement that returns early if the conditions are not correct. This PR also adds GEPs to scalar loads and stores from/to global variables. --- llvm/lib/Target/DirectX/DXILLegalizePass.cpp | 36 +++++++++++-------- .../legalize-load-store-array-alloca.ll | 18 ++++++++++ .../DirectX/llc-vector-load-scalarize.ll | 9 +++-- llvm/test/CodeGen/DirectX/scalar-store.ll | 6 ++-- 4 files changed, 50 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp index c9ff7137fdac1..c73648f21e8d7 100644 --- a/llvm/lib/Target/DirectX/DXILLegalizePass.cpp +++ b/llvm/lib/Target/DirectX/DXILLegalizePass.cpp @@ -563,7 +563,7 @@ legalizeGetHighLowi64Bytes(Instruction &I, } static void -legalizeLoadStoreOnArrayAllocas(Instruction &I, +legalizeScalarLoadStoreOnArrays(Instruction &I, SmallVectorImpl &ToRemove, DenseMap &) { @@ -581,23 +581,31 @@ legalizeLoadStoreOnArrayAllocas(Instruction &I, } else return; - assert(LoadStoreTy->isSingleValueType() && - "Expected load/store type to be a single-valued type"); + // If the load/store is not of a single-value type (i.e., scalar or vector) + // then we do not modify it. It shouldn't be a vector either because the + // dxil-data-scalarization pass is expected to run before this, but it's not + // incorrect to apply this transformation to vector load/stores. 
+ if (!LoadStoreTy->isSingleValueType()) + return; - auto *AllocaPtrOp = dyn_cast(PtrOp); - if (!AllocaPtrOp) + Type *ArrayTy; + if (auto *GlobalVarPtrOp = dyn_cast(PtrOp)) + ArrayTy = GlobalVarPtrOp->getValueType(); + else if (auto *AllocaPtrOp = dyn_cast(PtrOp)) + ArrayTy = AllocaPtrOp->getAllocatedType(); + else return; - Type *Ty = AllocaPtrOp->getAllocatedType(); - if (!isa(Ty)) + if (!isa(ArrayTy)) return; - assert(!isa(Ty->getArrayElementType()) && - "Expected allocated type of AllocaInst to be a flat ArrayType"); - IRBuilder<> Builder(&I); - Value *Zero = Builder.getInt32(0); - Value *GEP = Builder.CreateGEP(Ty, AllocaPtrOp, {Zero, Zero}, "", - GEPNoWrapFlags::all()); + assert(ArrayTy->getArrayElementType() == LoadStoreTy && + "Expected array element type to be the same as to the scalar load or " + "store type"); + + Value *Zero = ConstantInt::get(Type::getInt32Ty(I.getContext()), 0); + Value *GEP = GetElementPtrInst::Create( + ArrayTy, PtrOp, {Zero, Zero}, GEPNoWrapFlags::all(), "", I.getIterator()); I.setOperand(PtrOpIndex, GEP); } @@ -651,7 +659,7 @@ class DXILLegalizationPipeline { // downcastI64toI32InsertExtractElements needs to handle. 
LegalizationPipeline[Stage2].push_back( downcastI64toI32InsertExtractElements); - LegalizationPipeline[Stage2].push_back(legalizeLoadStoreOnArrayAllocas); + LegalizationPipeline[Stage2].push_back(legalizeScalarLoadStoreOnArrays); } }; diff --git a/llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll b/llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll index b25b3de901d91..c6789ac7886d5 100644 --- a/llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll +++ b/llvm/test/CodeGen/DirectX/legalize-load-store-array-alloca.ll @@ -21,3 +21,21 @@ define void @store() { store i32 0, ptr %a, align 4 ret void } + +@g = local_unnamed_addr addrspace(3) global [4 x i32] zeroinitializer, align 4 +define void @load_whole_global () { +; CHECK-LABEL: define void @load_whole_global +; CHECK-NEXT: load [4 x i32], ptr addrspace(3) @g, align 4 +; CHECK-NEXT: ret void + %l = load [4 x i32], ptr addrspace(3) @g, align 4 + ret void +} + +define void @load_global_index0 () { +; CHECK-LABEL: define void @load_global_index0 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @g, i32 0, i32 0 +; CHECK-NEXT: load i32, ptr addrspace(3) [[GEP]], align 4 +; CHECK-NEXT: ret void + %l = load i32, ptr addrspace(3) @g, align 4 + ret void +} diff --git a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll index 27a892591a867..0c91c53227763 100644 --- a/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll +++ b/llvm/test/CodeGen/DirectX/llc-vector-load-scalarize.ll @@ -24,7 +24,8 @@ define <4 x i32> @load_array_vec_test() #0 { ; CHECK-LABEL: define <4 x i32> @load_array_vec_test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) @arrayofVecData.scalarized.1dim, align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr 
addrspace(3) [[GEP]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), align 4 ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(3) getelementptr ([8 x i32], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 3), align 4 @@ -52,7 +53,8 @@ define <4 x i32> @load_array_vec_test() #0 { define <4 x i32> @load_vec_test() #0 { ; CHECK-LABEL: define <4 x i32> @load_vec_test( ; CHECK-SAME: ) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) @vecData.scalarized, align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 1), align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 3), align 4 @@ -203,7 +205,8 @@ define <4 x i32> @load_static_array_of_vec_from_i8_gep_test(i32 %index) #0 { define <4 x i32> @multid_load_test() #0 { ; CHECK-LABEL: define <4 x i32> @multid_load_test( ; CHECK-SAME: ) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[GEP]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(3) getelementptr 
([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 1), align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) getelementptr ([36 x i32], ptr addrspace(3) @groupshared2dArrayofVectors.scalarized.1dim, i32 0, i32 3), align 4 diff --git a/llvm/test/CodeGen/DirectX/scalar-store.ll b/llvm/test/CodeGen/DirectX/scalar-store.ll index a124c665ad15e..4394235ffe4bd 100644 --- a/llvm/test/CodeGen/DirectX/scalar-store.ll +++ b/llvm/test/CodeGen/DirectX/scalar-store.ll @@ -14,7 +14,8 @@ ; CHECK-LABEL: store_array_vec_test define void @store_array_vec_test () local_unnamed_addr #0 { -; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(3) @arrayofVecData.scalarized.1dim, align 16 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 0 +; CHECK-NEXT: store float 1.000000e+00, ptr addrspace(3) [[GEP]], align 16 ; CHECK-NEXT: store float 2.000000e+00, ptr addrspace(3) getelementptr ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 1), align 4 ; CHECK-NEXT: store float 3.000000e+00, ptr addrspace(3) getelementptr ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 2), align 8 ; CHECK-NEXT: store float 2.000000e+00, ptr addrspace(3) getelementptr inbounds ([6 x float], ptr addrspace(3) @arrayofVecData.scalarized.1dim, i32 0, i32 3), align 16 @@ -30,7 +31,8 @@ define void @store_array_vec_test () local_unnamed_addr #0 { ; CHECK-LABEL: store_vec_test define void @store_vec_test(<4 x i32> %inputVec) #0 { ; CHECK-NEXT: [[INPUTVEC_I01:%.*]] = extractelement <4 x i32> %inputVec, i32 0 -; CHECK-NEXT: store i32 [[INPUTVEC_I01]], ptr addrspace(3) @vecData.scalarized, align 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x i32], ptr addrspace(3) 
@vecData.scalarized, i32 0, i32 0 +; CHECK-NEXT: store i32 [[INPUTVEC_I01]], ptr addrspace(3) [[GEP]], align 4 ; CHECK-NEXT: [[INPUTVEC_I12:%.*]] = extractelement <4 x i32> %inputVec, i32 1 ; CHECK-NEXT: store i32 [[INPUTVEC_I12]], ptr addrspace(3) getelementptr ([4 x i32], ptr addrspace(3) @vecData.scalarized, i32 0, i32 1), align 4 ; CHECK-NEXT: [[INPUTVEC_I23:%.*]] = extractelement <4 x i32> %inputVec, i32 2 From fae8df2b82692ec8f69ba578847713f0da6e1ddc Mon Sep 17 00:00:00 2001 From: "Deric C." Date: Thu, 17 Jul 2025 14:51:53 -0700 Subject: [PATCH 252/813] [DirectX] Fix GEP flattening with 0-indexed GEPs on global variables (#149211) Fixes #149179 The issue is that `Builder.CreateGEP` does not return a GEP Instruction or GEP ContantExpr when the pointer operand is a global variable and all indices are constant zeroes. This PR ensures that a GEP instruction is created if `Builder.CreateGEP` did not return a GEP. --- llvm/lib/Target/DirectX/DXILFlattenArrays.cpp | 10 +++++++++ llvm/test/CodeGen/DirectX/flatten-array.ll | 22 +++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp index ce43645d005b0..f0e2e786dfaf4 100644 --- a/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp +++ b/llvm/lib/Target/DirectX/DXILFlattenArrays.cpp @@ -343,6 +343,16 @@ bool DXILFlattenArraysVisitor::visitGetElementPtrInst(GetElementPtrInst &GEP) { Info.RootFlattenedArrayType, Info.RootPointerOperand, {ZeroIndex, FlattenedIndex}, GEP.getName(), GEP.getNoWrapFlags()); + // If the pointer operand is a global variable and all indices are 0, + // IRBuilder::CreateGEP will return the global variable instead of creating + // a GEP instruction or GEP ConstantExpr. In this case we have to create and + // insert our own GEP instruction. 
+ if (!isa(NewGEP)) + NewGEP = GetElementPtrInst::Create( + Info.RootFlattenedArrayType, Info.RootPointerOperand, + {ZeroIndex, FlattenedIndex}, GEP.getNoWrapFlags(), GEP.getName(), + Builder.GetInsertPoint()); + // Replace the current GEP with the new GEP. Store GEPInfo into the map // for later use in case this GEP was not the end of the chain GEPChainInfoMap.insert({cast(NewGEP), std::move(Info)}); diff --git a/llvm/test/CodeGen/DirectX/flatten-array.ll b/llvm/test/CodeGen/DirectX/flatten-array.ll index 1376a1db25975..a2e105537ab88 100644 --- a/llvm/test/CodeGen/DirectX/flatten-array.ll +++ b/llvm/test/CodeGen/DirectX/flatten-array.ll @@ -218,6 +218,28 @@ define void @two_index_gep_const() { ret void } +define void @zero_index_global() { + ; CHECK-LABEL: define void @zero_index_global( + ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw [4 x float], ptr addrspace(3) @g.1dim, i32 0, i32 0 + ; CHECK-NEXT: load float, ptr addrspace(3) [[GEP]], align 4 + ; CHECK-NEXT: ret void + %1 = getelementptr inbounds nuw [2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 0, i32 0 + %2 = load float, ptr addrspace(3) %1, align 4 + ret void +} + +; Note: A ConstantExpr GEP with all 0 indices is equivalent to the pointer +; operand of the GEP. Therefore the visitLoadInst will not see the pointer operand +; as a ConstantExpr GEP and will not create a GEP instruction to be visited. +; The later dxil-legalize pass will insert a GEP in this instance. 
+define void @zero_index_global_const() { + ; CHECK-LABEL: define void @zero_index_global_const( + ; CHECK-NEXT: load float, ptr addrspace(3) @g.1dim, align 4 + ; CHECK-NEXT: ret void + %1 = load float, ptr addrspace(3) getelementptr inbounds nuw ([2 x [2 x float]], ptr addrspace(3) @g, i32 0, i32 0, i32 0), align 4 + ret void +} + define void @gep_4d_index_test() { ; CHECK-LABEL: gep_4d_index_test ; CHECK: [[a:%.*]] = alloca [16 x i32], align 4 From 72a2d8220ade3f9ac96f5424f803c774499cf54c Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 17 Jul 2025 15:05:20 -0700 Subject: [PATCH 253/813] [libc] Convert dlfcn.h to pure YAML (#149362) Remove the unnecessary .h.def file and move all the macro definitions directly into dlfcn.yaml. --- libc/include/CMakeLists.txt | 1 - libc/include/dlfcn.h.def | 17 ---------- libc/include/dlfcn.yaml | 33 +++++++++++++++----- libc/include/llvm-libc-macros/dlfcn-macros.h | 23 -------------- 4 files changed, 25 insertions(+), 49 deletions(-) delete mode 100644 libc/include/dlfcn.h.def delete mode 100644 libc/include/llvm-libc-macros/dlfcn-macros.h diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 984b960acb2d7..73213826ad607 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -69,7 +69,6 @@ add_header_macro( ../libc/include/dlfcn.yaml dlfcn.h DEPENDS - .llvm-libc-macros.dlfcn_macros .llvm_libc_common_h ) diff --git a/libc/include/dlfcn.h.def b/libc/include/dlfcn.h.def deleted file mode 100644 index 31395871c6b97..0000000000000 --- a/libc/include/dlfcn.h.def +++ /dev/null @@ -1,17 +0,0 @@ -//===-- C standard library header dlfcn.h ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_DLFCN_H -#define LLVM_LIBC_DLFCN_H - -#include "__llvm-libc-common.h" -#include "llvm-libc-macros/dlfcn-macros.h" - -%%public_api() - -#endif // LLVM_LIBC_DLFCN_H diff --git a/libc/include/dlfcn.yaml b/libc/include/dlfcn.yaml index 78bbeff4e60d9..28be34dbd95bd 100644 --- a/libc/include/dlfcn.yaml +++ b/libc/include/dlfcn.yaml @@ -1,17 +1,34 @@ header: dlfcn.h -header_template: dlfcn.h.def +standards: + - posix macros: + # Note that macro values are quoted to keep the integer literals as + # written. Without the quotes, YAML will normalize them to minimal + # decimal, which is less readable for humans seeing the generated header. - macro_name: RTLD_LAZY - macro_header: dlfcn-macros.h + macro_value: "0x00001" - macro_name: RTLD_NOW - macro_header: dlfcn-macros.h + macro_value: "0x00002" - macro_name: RTLD_GLOBAL - macro_header: dlfcn-macros.h + macro_value: "0x00100" - macro_name: RTLD_LOCAL - macro_header: dlfcn-macros.h -types: [] -enums: [] -objects: [] + macro_value: "0" + - macro_name: RTLD_BINDING_MASK + standards: + - gnu + macro_value: "0x00003" + - macro_name: RTLD_NOLOAD + standards: + - gnu + macro_value: "0x00004" + - macro_name: RTLD_DEEPBIND + standards: + - gnu + macro_value: "0x00008" + - macro_name: RTLD_NODELETE + standards: + - gnu + macro_value: "0x01000" functions: - name: dlclose standards: diff --git a/libc/include/llvm-libc-macros/dlfcn-macros.h b/libc/include/llvm-libc-macros/dlfcn-macros.h deleted file mode 100644 index dcd202b9ab435..0000000000000 --- a/libc/include/llvm-libc-macros/dlfcn-macros.h +++ /dev/null @@ -1,23 +0,0 @@ -//===-- Definition of macros from dlfcn.h ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_MACROS_DLFCN_MACROS_H -#define LLVM_LIBC_MACROS_DLFCN_MACROS_H - -#define RTLD_LAZY 0x00001 -#define RTLD_NOW 0x00002 -#define RTLD_GLOBAL 0x00100 -#define RTLD_LOCAL 0 - -// Non-standard stuff here -#define RTLD_BINDING_MASK 0x3 -#define RTLD_NOLOAD 0x00004 -#define RTLD_DEEPBIND 0x00008 -#define RTLD_NODELETE 0x01000 - -#endif // LLVM_LIBC_MACROS_DLFCN_MACROS_H From fc3781853ba1b456429a908e5604589c68878ab8 Mon Sep 17 00:00:00 2001 From: Charitha Saumya <136391709+charithaintc@users.noreply.github.com> Date: Thu, 17 Jul 2025 15:13:20 -0700 Subject: [PATCH 254/813] [mlir][xegpu] Minor fixes in XeGPU subgroup distribution. (#147846) This PR addresses the following issues. 1. Add the missing attributes when creating a new GPU funcOp in `MoveFuncBodyToWarpExecuteOnLane0` pattern. 2. Bug fix in LoadNd distribution to make sure LoadOp is the last op in warpOp region before it is distributed (needed for preserving the memory op ordering during distribution). 3. Add utility for removing OpOperand or OpResult layout attributes. --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 11 ++ .../Transforms/XeGPUSubgroupDistribute.cpp | 140 +++++++++--------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 25 ++++ .../Dialect/XeGPU/subgroup-distribute.mlir | 8 +- 4 files changed, 110 insertions(+), 74 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 6fea10185402a..488f358ff3802 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -76,6 +76,17 @@ LayoutAttr getLayoutAttr(const Value value); /// it will check the operand itself and its defining op. LayoutAttr getLayoutAttr(const OpOperand &opr); +/// Removes the LayoutAttr for a given OpOperand or OpResult if it exists. 
+template || + std::is_same_v>> +void removeLayoutAttr(const T &operandOrResult); + +/// Removes the LayoutAttr for each OpOperand and OpResult of the given +/// operation if they exist. If the operation contains regions, it is also +/// applied recursively to the contained operations +void removeLayoutAttrs(Operation *op); + /// Sets the LayoutAttr for a given OpOperand or OpResult by attaching /// it to the owner's dictionary attributes template -removeTemporaryLayoutAttributes(ArrayRef attrs) { - SmallVector newAttrs; - for (NamedAttribute attr : attrs) { - if (!isa(attr.getValue())) - newAttrs.push_back(attr); - } - return newAttrs; -} - /// Helper function to check if the layout is packed. Layout is packed if it is /// 2D and lane_data[0] != 1 (data packed from col dimension). static bool hasPackedLayout(xegpu::LayoutAttr layout) { @@ -197,9 +184,17 @@ struct MoveFuncBodyToWarpExecuteOnLane0 return isa(op); })) return failure(); - // Create a new function with the same signature. + // Create a new function with the same signature and same attributes. + SmallVector workgroupAttributionsTypes = + llvm::map_to_vector(gpuFuncOp.getWorkgroupAttributions(), + [](BlockArgument arg) { return arg.getType(); }); + SmallVector privateAttributionsTypes = + llvm::map_to_vector(gpuFuncOp.getPrivateAttributions(), + [](BlockArgument arg) { return arg.getType(); }); auto newGpuFunc = rewriter.create( - gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType()); + gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType(), + workgroupAttributionsTypes, privateAttributionsTypes); + newGpuFunc->setAttrs(gpuFuncOp->getAttrs()); // Create a WarpExecuteOnLane0Op with same arguments and results as the // original gpuFuncOp. 
rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front()); @@ -265,13 +260,13 @@ struct MoveFuncBodyToWarpExecuteOnLane0 /// ``` struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { OpOperand *operand = - getWarpResult(subgroupOp, llvm::IsaPred); + getWarpResult(warpOp, llvm::IsaPred); if (!operand) return rewriter.notifyMatchFailure( - subgroupOp, "warp result is not a xegpu::CreateNdDesc op"); + warpOp, "warp result is not a xegpu::CreateNdDesc op"); auto descOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); @@ -288,9 +283,9 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { newYieldValues.push_back(operand); newYieldTypes.push_back(operand.getType()); } - rewriter.setInsertionPoint(subgroupOp); + rewriter.setInsertionPoint(warpOp); gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, /* new yieled values = */ newYieldValues, + rewriter, warpOp, /* new yieled values = */ newYieldValues, /* new yielded types = */ newYieldTypes, newRetIndices); SmallVector newDescOperands; @@ -347,10 +342,10 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern { /// ``` struct StoreNdDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { auto yield = cast( - subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator()); + warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); Operation *lastNode = yield->getPrevNode(); 
auto storeOp = dyn_cast_or_null(lastNode); if (!storeOp) @@ -372,7 +367,7 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, + rewriter, warpOp, /* new yielded values = */ ValueRange{storeOp.getValue(), storeOp.getTensorDesc()}, /* new yielded types = */ @@ -403,9 +398,9 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]), distributedTensorDescTy, rewriter)); - rewriter.create( - newWarpOp.getLoc(), TypeRange{}, newStoreOperands, - removeTemporaryLayoutAttributes(storeOp->getAttrs())); + auto newStoreOp = rewriter.create( + newWarpOp.getLoc(), TypeRange{}, newStoreOperands, storeOp->getAttrs()); + xegpu::removeLayoutAttrs(newStoreOp); rewriter.eraseOp(storeOp); return success(); } @@ -449,21 +444,22 @@ struct StoreNdDistribution final : public gpu::WarpDistributionPattern { /// ``` struct LoadNdDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { - OpOperand *operand = - getWarpResult(subgroupOp, llvm::IsaPred); + OpOperand *operand = getWarpResult(warpOp, [&](Operation *op) { + if (!isa(op)) + return false; + // Make sure the same load op is the last operation in the warp op body. + // This ensure that load op is not sinked earlier violating any barrier + // synchronizations. + auto yield = cast( + warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); + return yield->getPrevNode() == op; + }); + if (!operand) return rewriter.notifyMatchFailure( - subgroupOp, "warp result is not a xegpu::LoadNd op"); - // Make sure the load op is the last operation in the warp op body. 
This - // ensure that load op is not sinked earlier violating any barrier - // synchronizations. - auto yield = cast( - subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator()); - Operation *lastNode = yield->getPrevNode(); - if (!dyn_cast_or_null(lastNode)) - return failure(); + warpOp, "warp result is not a xegpu::LoadNd op"); auto loadOp = operand->get().getDefiningOp(); xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType(); @@ -474,11 +470,11 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { unsigned operandIdx = operand->getOperandNumber(); VectorType distributedTypeByWarpOp = - cast(subgroupOp.getResult(operandIdx).getType()); + cast(warpOp.getResult(operandIdx).getType()); SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, + rewriter, warpOp, /* new yielded values = */ loadOp.getTensorDesc(), /* new yielded types = */ tensorDescTy, newRetIndices); @@ -498,7 +494,8 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(), resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]), distributedTensorDescTy, rewriter), - removeTemporaryLayoutAttributes(loadOp->getAttrs())); + loadOp->getAttrs()); + xegpu::removeLayoutAttrs(newLoadOp); // Set the packed attribute if the layout requires it. 
newLoadOp.setPacked(hasPackedLayout(layout)); Value distributedVal = newWarpOp.getResult(operandIdx); @@ -548,12 +545,11 @@ struct LoadNdDistribution final : public gpu::WarpDistributionPattern { /// ``` struct DpasDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { - OpOperand *operand = - getWarpResult(subgroupOp, llvm::IsaPred); + OpOperand *operand = getWarpResult(warpOp, llvm::IsaPred); if (!operand) - return rewriter.notifyMatchFailure(subgroupOp, + return rewriter.notifyMatchFailure(warpOp, "warp result is not a xegpu::Dpas op"); auto dpasOp = operand->get().getDefiningOp(); @@ -599,7 +595,7 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { // Create a new warp op without the dpas. SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices); + rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices); FailureOr expectedDistLhsTyOrFailure = xegpu::getDistributedVectorType(dpasOp.getLhsType(), layoutA); @@ -630,14 +626,16 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { resolveDistributedTy(newWarpOp.getResult(newRetIndices[i]), newDpasOperandExpectedTypes[i], rewriter)); } - Value newDpasOp = rewriter.create( - newWarpOp->getLoc(), distributedResultTy, newDpasOperands, - removeTemporaryLayoutAttributes(dpasOp->getAttrs())); + auto newDpasOp = + rewriter.create(newWarpOp->getLoc(), distributedResultTy, + newDpasOperands, dpasOp->getAttrs()); + xegpu::removeLayoutAttrs(newDpasOp); Value distributedVal = newWarpOp.getResult(operandIdx); // Resolve the output type. 
- newDpasOp = resolveDistributedTy( - newDpasOp, distResultTypeByWarpOpOrFailure.value(), rewriter); - rewriter.replaceAllUsesWith(distributedVal, newDpasOp); + Value typeResolved = + resolveDistributedTy(newDpasOp.getResult(), + distResultTypeByWarpOpOrFailure.value(), rewriter); + rewriter.replaceAllUsesWith(distributedVal, typeResolved); return success(); } }; @@ -678,13 +676,13 @@ struct DpasDistribution final : public gpu::WarpDistributionPattern { /// ``` struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { OpOperand *operand = - getWarpResult(subgroupOp, llvm::IsaPred); + getWarpResult(warpOp, llvm::IsaPred); if (!operand) return rewriter.notifyMatchFailure( - subgroupOp, "warp result is not a xegpu::UpdateNdOffset op"); + warpOp, "warp result is not a xegpu::UpdateNdOffset op"); auto updateOp = operand->get().getDefiningOp(); unsigned operandIdx = operand->getOperandNumber(); // new update op does not have layout attribute. @@ -703,7 +701,7 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { } SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices); + rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices); rewriter.setInsertionPointAfter(newWarpOp); SmallVector newUpdateOperands; for (size_t i : newRetIndices) { @@ -717,14 +715,15 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { } } // Create a new update op outside the warp op. 
- Value newUpdateOp = rewriter.create( + auto newUpdateOp = rewriter.create( newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands, - removeTemporaryLayoutAttributes(updateOp->getAttrs())); + updateOp->getAttrs()); + xegpu::removeLayoutAttrs(newUpdateOp); Value distributedVal = newWarpOp.getResult(operandIdx); // Resolve the distributed type with the original type. - newUpdateOp = - resolveDistributedTy(newUpdateOp, distributedVal.getType(), rewriter); - rewriter.replaceAllUsesWith(distributedVal, newUpdateOp); + Value typeResolved = resolveDistributedTy( + newUpdateOp.getResult(), distributedVal.getType(), rewriter); + rewriter.replaceAllUsesWith(distributedVal, typeResolved); return success(); } }; @@ -758,10 +757,10 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern { /// ``` struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { auto yield = cast( - subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator()); + warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); Operation *lastNode = yield->getPrevNode(); auto prefetchOp = dyn_cast_or_null(lastNode); if (!prefetchOp) @@ -775,7 +774,7 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { SmallVector newYieldTypes = {prefetchOp.getTensorDescType()}; SmallVector newRetIndices; gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( - rewriter, subgroupOp, newYieldValues, newYieldTypes, newRetIndices); + rewriter, warpOp, newYieldValues, newYieldTypes, newRetIndices); // Create a new prefetch op outside the warp op with updated tensor // descriptor type. Source tensor descriptor require type resolution. 
xegpu::TensorDescType newTensorDescTy = @@ -783,9 +782,10 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { rewriter.setInsertionPointAfter(newWarpOp); SmallVector newPrefetchOperands = {resolveDistributedTy( newWarpOp.getResult(newRetIndices[0]), newTensorDescTy, rewriter)}; - rewriter.create( - newWarpOp.getLoc(), TypeRange{}, newPrefetchOperands, - removeTemporaryLayoutAttributes(prefetchOp->getAttrs())); + rewriter.create(newWarpOp.getLoc(), TypeRange{}, + newPrefetchOperands, + prefetchOp->getAttrs()); + xegpu::removeLayoutAttrs(prefetchOp); rewriter.eraseOp(prefetchOp); return success(); } @@ -795,17 +795,17 @@ struct PrefetchNdDistribution final : public gpu::WarpDistributionPattern { /// region. This will simply move the barrier op outside of the warp op. struct GpuBarrierDistribution final : public gpu::WarpDistributionPattern { using gpu::WarpDistributionPattern::WarpDistributionPattern; - LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp, + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, PatternRewriter &rewriter) const override { auto yield = cast( - subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator()); + warpOp.getBodyRegion().getBlocks().begin()->getTerminator()); Operation *lastNode = yield->getPrevNode(); // The last node must be a gpu::BarrierOp. auto barrierOp = dyn_cast_or_null(lastNode); if (!barrierOp) return failure(); // Move the barrier op outside of the warp op. 
- rewriter.setInsertionPointAfter(subgroupOp); + rewriter.setInsertionPointAfter(warpOp); rewriter.create( barrierOp.getLoc(), barrierOp->getResultTypes(), barrierOp->getOperands(), barrierOp->getAttrs()); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 6b85a66a8bd36..370d149ee55af 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -184,6 +184,31 @@ void xegpu::setLayoutAttrs(Operation *op, }); } +template +void xegpu::removeLayoutAttr(const T &operandOrResult) { + Operation *owner = operandOrResult.getOwner(); + std::string name = xegpu::getLayoutName(operandOrResult); + if (owner->hasAttrOfType(name)) + owner->removeAttr(name); +} + +// Explicit instantiation for OpResult +template void +xegpu::removeLayoutAttr(const mlir::OpResult &result); + +// Explicit instantiation for OpOperand +template void +xegpu::removeLayoutAttr(const mlir::OpOperand &operand); + +void xegpu::removeLayoutAttrs(Operation *op) { + op->walk([&](Operation *nestOp) { + for (OpOperand &opr : nestOp->getOpOperands()) + removeLayoutAttr(opr); + for (OpResult result : nestOp->getOpResults()) + removeLayoutAttr(result); + }); +} + SmallVector xegpu::extractVectorsWithShapeFromValue(OpBuilder &builder, Location loc, Value value, ArrayRef shape) { diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 0bfbc4a35c03b..e78ae4a17710b 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -95,10 +95,10 @@ gpu.module @test { // ----- // CHECK-LABEL: gpu.func @load_dpas_store // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = 
xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> // CHECK-DAG: %[[T5:.*]] = xegpu.create_nd_tdesc %[[ARG2]][%{{.*}}] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> // CHECK: xegpu.store_nd %[[T4]], %[[T5]] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> @@ -120,10 +120,10 @@ gpu.module @test { // ----- // CHECK-LABEL: gpu.func @load_dpas_postop_store // CHECK: (%[[ARG0:[0-9a-zA-Z]+]]: memref<8x16xf16>, %[[ARG1:[0-9a-zA-Z]+]]: memref<16x16xf16>, %[[ARG2:[0-9a-zA-Z]+]]: memref<8x16xf32>) { -// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> -// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%{{.*}}] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> // CHECK: %[[T3:.*]] = xegpu.load_nd %[[T2]] : !xegpu.tensor_desc<8x16xf16> -> vector<8xf16> +// CHECK: %[[T0:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%{{.*}}] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> +// CHECK: %[[T1:.*]] = xegpu.load_nd %[[T0]] <{packed}> : !xegpu.tensor_desc<16x16xf16> -> vector<16xf16> // CHECK-DAG: %[[T4:.*]] = xegpu.dpas %[[T3]], %[[T1]] : vector<8xf16>, vector<16xf16> -> vector<8xf32> // CHECK: %[[T5:.*]] = vector.shape_cast %[[T4]] : vector<8xf32> to vector<8x1xf32> // CHECK: %[[T6:.*]] = math.exp %[[T5]] {{{.*}}} : vector<8x1xf32> From 
2a7328dacae39e87ca4cc7548b9abcdba60b946b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 15:23:55 -0700 Subject: [PATCH 255/813] [flang] Migrate away from ArrayRef(std::nullopt_t) (#149337) ArrayRef(std::nullopt_t) has been deprecated. This patch replaces std::nullopt with {}. A subsequence patch will address those places where we need to replace std::nullopt with mlir::TypeRange{} or mlir::ValueRange{}. --- flang/lib/Lower/Bridge.cpp | 24 ++++++++-------- flang/lib/Lower/ConvertCall.cpp | 5 ++-- flang/lib/Lower/ConvertConstant.cpp | 4 +-- flang/lib/Lower/ConvertExpr.cpp | 7 ++--- flang/lib/Lower/ConvertExprToHLFIR.cpp | 2 +- flang/lib/Lower/ConvertVariable.cpp | 11 ++++---- flang/lib/Lower/HostAssociations.cpp | 6 ++-- flang/lib/Lower/IO.cpp | 2 +- flang/lib/Optimizer/Builder/HLFIRTools.cpp | 2 +- .../Optimizer/Builder/LowLevelIntrinsics.cpp | 13 ++++----- flang/lib/Optimizer/Builder/MutableBox.cpp | 28 +++++++++---------- flang/lib/Optimizer/Dialect/FIROps.cpp | 2 +- .../HLFIR/Transforms/BufferizeHLFIR.cpp | 2 +- .../Optimizer/Builder/CharacterTest.cpp | 2 +- .../Optimizer/Builder/ComplexTest.cpp | 2 +- .../Optimizer/Builder/FIRBuilderTest.cpp | 5 ++-- .../Optimizer/Builder/HLFIRToolsTest.cpp | 2 +- .../Builder/Runtime/RuntimeCallTestBase.h | 5 ++-- .../Optimizer/FortranVariableTest.cpp | 5 ++-- 19 files changed, 59 insertions(+), 70 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 33c1f1e7a3c3a..4241d12601242 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -810,11 +810,11 @@ class FirConverter : public Fortran::lower::AbstractConverter { fir::ExtendedValue read = fir::factory::genMutableBoxRead( *builder, loc, box, /*mayBePolymorphic=*/false); if (auto read_arr_box = read.getBoxOf()) { - fir::factory::genInlinedAllocation( - *builder, loc, *new_box, read_arr_box->getLBounds(), - read_arr_box->getExtents(), - /*lenParams=*/std::nullopt, name, - /*mustBeHeap=*/true); + 
fir::factory::genInlinedAllocation(*builder, loc, *new_box, + read_arr_box->getLBounds(), + read_arr_box->getExtents(), + /*lenParams=*/{}, name, + /*mustBeHeap=*/true); } else if (auto read_char_arr_box = read.getBoxOf()) { fir::factory::genInlinedAllocation( @@ -825,8 +825,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { } else if (auto read_char_box = read.getBoxOf()) { fir::factory::genInlinedAllocation(*builder, loc, *new_box, - /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, + /*lbounds=*/{}, + /*extents=*/{}, read_char_box->getLen(), name, /*mustBeHeap=*/true); } else { @@ -4590,8 +4590,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { // the static type of the LHS. if (Fortran::evaluate::UnwrapExpr( assign.rhs)) - return fir::factory::createUnallocatedBox(*builder, loc, lhsBoxType, - std::nullopt); + return fir::factory::createUnallocatedBox(*builder, loc, lhsBoxType, {}); hlfir::Entity rhs = Fortran::lower::convertExprToHLFIR( loc, *this, assign.rhs, localSymbols, rhsContext); // Create pointer descriptor value from the RHS. 
@@ -5199,7 +5198,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { "LEN parameters"); lhsRealloc = fir::factory::genReallocIfNeeded( *builder, loc, *lhsMutableBox, - /*shape=*/std::nullopt, lengthParams); + /*shape=*/{}, lengthParams); return lhsRealloc->newValue; } return genExprAddr(assign.lhs, stmtCtx); @@ -5271,7 +5270,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { if (lhsIsWholeAllocatable) { assert(lhsRealloc.has_value()); fir::factory::finalizeRealloc(*builder, loc, *lhsMutableBox, - /*lbounds=*/std::nullopt, + /*lbounds=*/{}, /*takeLboundsIfRealloc=*/false, *lhsRealloc); } @@ -6059,8 +6058,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::func::FuncOp func = fir::FirOpBuilder::createFunction( mlir::UnknownLoc::get(context), getModuleOp(), fir::NameUniquer::doGenerated("Sham"), - mlir::FunctionType::get(context, std::nullopt, std::nullopt), - symbolTable); + mlir::FunctionType::get(context, {}, {}), symbolTable); func.addEntryBlock(); CHECK(!builder && "Expected builder to be uninitialized"); builder = new fir::FirOpBuilder(func, bridge.getKindMap(), symbolTable); diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp index 6ed15df0de754..071513303da25 100644 --- a/flang/lib/Lower/ConvertCall.cpp +++ b/flang/lib/Lower/ConvertCall.cpp @@ -496,8 +496,7 @@ Fortran::lower::genCallOpAndResult( auto *context = builder.getContext(); if (mlir::isa(snd) && mlir::isa(fst.getType())) { - auto funcTy = - mlir::FunctionType::get(context, std::nullopt, std::nullopt); + auto funcTy = mlir::FunctionType::get(context, {}, {}); auto boxProcTy = builder.getBoxProcType(funcTy); if (mlir::Value host = argumentHostAssocs(converter, fst)) { cast = builder.create( @@ -1714,7 +1713,7 @@ void prepareUserCallArguments( /*nonDeferredParams=*/mlir::ValueRange{}, /*mutableProperties=*/{}); fir::factory::associateMutableBox(builder, loc, ptrBox, actualExv, - /*lbounds=*/std::nullopt); + 
/*lbounds=*/{}); caller.placeInput(arg, irBox); continue; } diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp index 1051d50ce8a9a..1850b67898126 100644 --- a/flang/lib/Lower/ConvertConstant.cpp +++ b/flang/lib/Lower/ConvertConstant.cpp @@ -374,8 +374,8 @@ static mlir::Value genStructureComponentInit( "allocatable component value that is not NULL"); } else { // Handle NULL() initialization - mlir::Value componentValue{fir::factory::createUnallocatedBox( - builder, loc, componentTy, std::nullopt)}; + mlir::Value componentValue{ + fir::factory::createUnallocatedBox(builder, loc, componentTy, {})}; componentValue = builder.createConvert(loc, componentTy, componentValue); return builder.create( diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index f3430bfa3021e..0a1cd67789772 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -596,7 +596,7 @@ absentBoxToUnallocatedBox(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type boxType = box.getType(); assert(mlir::isa(boxType) && "argument must be a fir.box"); mlir::Value emptyBox = - fir::factory::createUnallocatedBox(builder, loc, boxType, std::nullopt); + fir::factory::createUnallocatedBox(builder, loc, boxType, {}); auto safeToReadBox = builder.create(loc, isPresent, box, emptyBox); return fir::substBase(exv, safeToReadBox); @@ -2663,8 +2663,7 @@ class ScalarExprLowering { /*nonDeferredParams=*/mlir::ValueRange{}, /*mutableProperties=*/{}); Fortran::lower::associateMutableBox(converter, loc, pointer, *expr, - /*lbounds=*/std::nullopt, - stmtCtx); + /*lbounds=*/{}, stmtCtx); caller.placeInput(arg, irBox); continue; } @@ -6186,7 +6185,7 @@ class ArrayExprLowering { mlir::FunctionType memcpyType() { auto ptrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); llvm::SmallVector args = {ptrTy, ptrTy, builder.getI64Type()}; - return mlir::FunctionType::get(builder.getContext(), args, std::nullopt); + return 
mlir::FunctionType::get(builder.getContext(), args, {}); } /// Create a call to the LLVM memcpy intrinsic. diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp index 9689f920840fb..7de433d6a201a 100644 --- a/flang/lib/Lower/ConvertExprToHLFIR.cpp +++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp @@ -1945,7 +1945,7 @@ class HlfirBuilder { fir::emitFatalError(loc, "pointer component designator could not be " "lowered to mutable box"); Fortran::lower::associateMutableBox(converter, loc, *toBox, expr, - /*lbounds=*/std::nullopt, stmtCtx); + /*lbounds=*/{}, stmtCtx); continue; } diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 6c4516686f9d0..6cda742874ccf 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -236,9 +236,8 @@ mlir::Value Fortran::lower::genInitialDataTarget( fir::FirOpBuilder &builder = converter.getFirOpBuilder(); if (Fortran::evaluate::UnwrapExpr( initialTarget)) - return fir::factory::createUnallocatedBox( - builder, loc, boxType, - /*nonDeferredParams=*/std::nullopt); + return fir::factory::createUnallocatedBox(builder, loc, boxType, + /*nonDeferredParams=*/{}); // Pointer initial data target, and NULL(mold). for (const auto &sym : Fortran::evaluate::CollectSymbols(initialTarget)) { // Derived type component symbols should not be instantiated as objects @@ -354,8 +353,8 @@ static mlir::Value genComponentDefaultInit( // From a standard point of view, pointer without initialization do not // need to be disassociated, but for sanity and simplicity, do it in // global constructor since this has no runtime cost. - componentValue = fir::factory::createUnallocatedBox( - builder, loc, componentTy, std::nullopt); + componentValue = + fir::factory::createUnallocatedBox(builder, loc, componentTy, {}); } else if (Fortran::lower::hasDefaultInitialization(component)) { // Component type has default initialization. 
componentValue = genDefaultInitializerValue(converter, loc, component, @@ -554,7 +553,7 @@ fir::GlobalOp Fortran::lower::defineGlobal( createGlobalInitialization(builder, global, [&](fir::FirOpBuilder &b) { mlir::Value box = fir::factory::createUnallocatedBox( b, loc, symTy, - /*nonDeferredParams=*/std::nullopt, + /*nonDeferredParams=*/{}, /*typeSourceBox=*/{}, getAllocatorIdxFromDataAttr(dataAttr)); b.create(loc, box); }); diff --git a/flang/lib/Lower/HostAssociations.cpp b/flang/lib/Lower/HostAssociations.cpp index 6a44be65a6cde..95ea74b791b47 100644 --- a/flang/lib/Lower/HostAssociations.cpp +++ b/flang/lib/Lower/HostAssociations.cpp @@ -410,15 +410,15 @@ class CapturedArrays : public CapturedSymbols { .genThen([&]() { fir::factory::associateMutableBox(builder, loc, boxInTuple, args.hostValue, - /*lbounds=*/std::nullopt); + /*lbounds=*/{}); }) .genElse([&]() { fir::factory::disassociateMutableBox(builder, loc, boxInTuple); }) .end(); } else { - fir::factory::associateMutableBox( - builder, loc, boxInTuple, args.hostValue, /*lbounds=*/std::nullopt); + fir::factory::associateMutableBox(builder, loc, boxInTuple, + args.hostValue, /*lbounds=*/{}); } } diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp index 69d72d9d63b68..53bf61922392d 100644 --- a/flang/lib/Lower/IO.cpp +++ b/flang/lib/Lower/IO.cpp @@ -528,7 +528,7 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter, descAddr = builder.createTemporary(loc, boxType); fir::MutableBoxValue box = fir::MutableBoxValue(descAddr, {}, {}); fir::factory::associateMutableBox(builder, loc, box, exv, - /*lbounds=*/std::nullopt); + /*lbounds=*/{}); } descAddr = builder.createConvert(loc, descRefTy, descAddr); list = builder.create(loc, listTy, list, descAddr, diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index e59a6bf2bf224..fb6f0dbf719fb 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ 
-1403,7 +1403,7 @@ hlfir::Entity hlfir::createStackTempFromMold(mlir::Location loc, builder.createTemporary(loc, sequenceType, tmpName, extents, lenParams); } else { alloc = builder.createTemporary(loc, mold.getFortranElementType(), tmpName, - /*shape=*/std::nullopt, lenParams); + /*shape=*/{}, lenParams); } auto declareOp = builder.create(loc, alloc, tmpName, shape, lenParams, diff --git a/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp b/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp index 64d70d70829fb..3fb7fab099965 100644 --- a/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp +++ b/flang/lib/Optimizer/Builder/LowLevelIntrinsics.cpp @@ -31,8 +31,7 @@ mlir::func::FuncOp fir::factory::getRealloc(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getLlvmGetRounding(fir::FirOpBuilder &builder) { auto int32Ty = builder.getIntegerType(32); - auto funcTy = - mlir::FunctionType::get(builder.getContext(), std::nullopt, {int32Ty}); + auto funcTy = mlir::FunctionType::get(builder.getContext(), {}, {int32Ty}); return builder.createFunction(builder.getUnknownLoc(), "llvm.get.rounding", funcTy); } @@ -40,8 +39,7 @@ fir::factory::getLlvmGetRounding(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getLlvmSetRounding(fir::FirOpBuilder &builder) { auto int32Ty = builder.getIntegerType(32); - auto funcTy = - mlir::FunctionType::get(builder.getContext(), {int32Ty}, std::nullopt); + auto funcTy = mlir::FunctionType::get(builder.getContext(), {int32Ty}, {}); return builder.createFunction(builder.getUnknownLoc(), "llvm.set.rounding", funcTy); } @@ -49,8 +47,8 @@ fir::factory::getLlvmSetRounding(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getLlvmInitTrampoline(fir::FirOpBuilder &builder) { auto ptrTy = builder.getRefType(builder.getIntegerType(8)); - auto funcTy = mlir::FunctionType::get(builder.getContext(), - {ptrTy, ptrTy, ptrTy}, std::nullopt); + auto funcTy = + mlir::FunctionType::get(builder.getContext(), {ptrTy, ptrTy, 
ptrTy}, {}); return builder.createFunction(builder.getUnknownLoc(), "llvm.init.trampoline", funcTy); } @@ -90,8 +88,7 @@ mlir::func::FuncOp fir::factory::getFeenableexcept(fir::FirOpBuilder &builder) { mlir::func::FuncOp fir::factory::getFegetexcept(fir::FirOpBuilder &builder) { auto int32Ty = builder.getIntegerType(32); - auto funcTy = - mlir::FunctionType::get(builder.getContext(), std::nullopt, {int32Ty}); + auto funcTy = mlir::FunctionType::get(builder.getContext(), {}, {int32Ty}); return builder.createFunction(builder.getUnknownLoc(), "fegetexcept", funcTy); } diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp index d944a4c98473e..93abedc43936d 100644 --- a/flang/lib/Optimizer/Builder/MutableBox.cpp +++ b/flang/lib/Optimizer/Builder/MutableBox.cpp @@ -521,23 +521,23 @@ void fir::factory::associateMutableBox(fir::FirOpBuilder &builder, mlir::Value sourceBox; if (auto *polyBox = source.getBoxOf()) sourceBox = polyBox->getSourceBox(); - writer.updateMutableBox(p.getAddr(), /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, - /*lengths=*/std::nullopt, sourceBox); + writer.updateMutableBox(p.getAddr(), /*lbounds=*/{}, + /*extents=*/{}, + /*lengths=*/{}, sourceBox); }, [&](const fir::UnboxedValue &addr) { - writer.updateMutableBox(addr, /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, - /*lengths=*/std::nullopt); + writer.updateMutableBox(addr, /*lbounds=*/{}, + /*extents=*/{}, + /*lengths=*/{}); }, [&](const fir::CharBoxValue &ch) { - writer.updateMutableBox(ch.getAddr(), /*lbounds=*/std::nullopt, - /*extents=*/std::nullopt, {ch.getLen()}); + writer.updateMutableBox(ch.getAddr(), /*lbounds=*/{}, + /*extents=*/{}, {ch.getLen()}); }, [&](const fir::ArrayBoxValue &arr) { writer.updateMutableBox(arr.getAddr(), lbounds.empty() ? 
arr.getLBounds() : lbounds, - arr.getExtents(), /*lengths=*/std::nullopt); + arr.getExtents(), /*lengths=*/{}); }, [&](const fir::CharArrayBoxValue &arr) { writer.updateMutableBox(arr.getAddr(), @@ -634,11 +634,11 @@ void fir::factory::associateMutableBoxWithRemap( source.match( [&](const fir::PolymorphicValue &p) { writer.updateMutableBox(cast(p.getAddr()), lbounds, extents, - /*lengths=*/std::nullopt); + /*lengths=*/{}); }, [&](const fir::UnboxedValue &addr) { writer.updateMutableBox(cast(addr), lbounds, extents, - /*lengths=*/std::nullopt); + /*lengths=*/{}); }, [&](const fir::CharBoxValue &ch) { writer.updateMutableBox(cast(ch.getAddr()), lbounds, extents, @@ -646,7 +646,7 @@ void fir::factory::associateMutableBoxWithRemap( }, [&](const fir::ArrayBoxValue &arr) { writer.updateMutableBox(cast(arr.getAddr()), lbounds, extents, - /*lengths=*/std::nullopt); + /*lengths=*/{}); }, [&](const fir::CharArrayBoxValue &arr) { writer.updateMutableBox(cast(arr.getAddr()), lbounds, extents, @@ -755,8 +755,8 @@ static mlir::Value allocateAndInitNewStorage(fir::FirOpBuilder &builder, // there is no way to know here if a derived type needs it or not. But the // information is available at compile time and could be reflected here // somehow. 
- mlir::Value irBox = createNewFirBox(builder, loc, box, newStorage, - std::nullopt, extents, lengths); + mlir::Value irBox = + createNewFirBox(builder, loc, box, newStorage, {}, extents, lengths); fir::runtime::genDerivedTypeInitialize(builder, loc, irBox); } return newStorage; diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index b6bf2753b80ce..cf20d84cbbcdb 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -4448,7 +4448,7 @@ llvm::LogicalResult fir::UnboxProcOp::verify() { void fir::IfOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value cond, bool withElseRegion) { - build(builder, result, std::nullopt, cond, withElseRegion); + build(builder, result, {}, cond, withElseRegion); } void fir::IfOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, diff --git a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp index 58f2b57712974..00ca6731c035b 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/BufferizeHLFIR.cpp @@ -296,7 +296,7 @@ struct SetLengthOpConversion llvm::StringRef tmpName{".tmp"}; llvm::SmallVector lenParams{adaptor.getLength()}; auto alloca = builder.createTemporary(loc, charType, tmpName, - /*shape=*/std::nullopt, lenParams); + /*shape=*/{}, lenParams); auto declareOp = builder.create( loc, alloca, tmpName, /*shape=*/mlir::Value{}, lenParams, /*dummy_scope=*/nullptr, fir::FortranVariableFlagsAttr{}); diff --git a/flang/unittests/Optimizer/Builder/CharacterTest.cpp b/flang/unittests/Optimizer/Builder/CharacterTest.cpp index 6d912b81d9541..d8d2da40ba9a6 100644 --- a/flang/unittests/Optimizer/Builder/CharacterTest.cpp +++ b/flang/unittests/Optimizer/Builder/CharacterTest.cpp @@ -29,7 +29,7 @@ struct CharacterTest : public testing::Test { moduleOp = builder.create(loc); 
builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/Builder/ComplexTest.cpp b/flang/unittests/Optimizer/Builder/ComplexTest.cpp index 689af4642b0b6..d5f00c9b61108 100644 --- a/flang/unittests/Optimizer/Builder/ComplexTest.cpp +++ b/flang/unittests/Optimizer/Builder/ComplexTest.cpp @@ -25,7 +25,7 @@ struct ComplexTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp index 3e2af24c47b96..e4c21f6b65a36 100644 --- a/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp +++ b/flang/unittests/Optimizer/Builder/FIRBuilderTest.cpp @@ -29,7 +29,7 @@ struct FIRBuilderTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); @@ -176,8 +176,7 @@ TEST_F(FIRBuilderTest, getNamedFunction) { auto func2 = builder.getNamedFunction("func2"); EXPECT_EQ(nullptr, func2); auto loc = builder.getUnknownLoc(); - func2 = builder.createFunction( - loc, "func2", builder.getFunctionType(std::nullopt, std::nullopt)); + func2 = builder.createFunction(loc, "func2", 
builder.getFunctionType({}, {})); auto func2query = builder.getNamedFunction("func2"); EXPECT_EQ(func2, func2query); } diff --git a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp index 29700d2d3dbff..a0785198b078d 100644 --- a/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp +++ b/flang/unittests/Optimizer/Builder/HLFIRToolsTest.cpp @@ -28,7 +28,7 @@ struct HLFIRToolsTest : public testing::Test { moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); mlir::func::FuncOp func = builder.create( - loc, "func1", builder.getFunctionType(std::nullopt, std::nullopt)); + loc, "func1", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h b/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h index 40abf567400b3..4ecec92f42dc2 100644 --- a/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h +++ b/flang/unittests/Optimizer/Builder/Runtime/RuntimeCallTestBase.h @@ -26,9 +26,8 @@ struct RuntimeCallTest : public testing::Test { // Set the insertion point in the function entry block. 
moduleOp = builder.create(loc); builder.setInsertionPointToStart(moduleOp->getBody()); - mlir::func::FuncOp func = - builder.create(loc, "runtime_unit_tests_func", - builder.getFunctionType(std::nullopt, std::nullopt)); + mlir::func::FuncOp func = builder.create( + loc, "runtime_unit_tests_func", builder.getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder.setInsertionPointToStart(entryBlock); diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp index 30c23b63b4d56..98270adaa7c73 100644 --- a/flang/unittests/Optimizer/FortranVariableTest.cpp +++ b/flang/unittests/Optimizer/FortranVariableTest.cpp @@ -21,9 +21,8 @@ struct FortranVariableTest : public testing::Test { // Set the insertion point in the function entry block. moduleOp = builder->create(loc); builder->setInsertionPointToStart(moduleOp->getBody()); - mlir::func::FuncOp func = - builder->create(loc, "fortran_variable_tests", - builder->getFunctionType(std::nullopt, std::nullopt)); + mlir::func::FuncOp func = builder->create( + loc, "fortran_variable_tests", builder->getFunctionType({}, {})); auto *entryBlock = func.addEntryBlock(); builder->setInsertionPointToStart(entryBlock); } From f48e2bbe9844540a7164eb62ab8bf5f2cd56743d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 15:24:02 -0700 Subject: [PATCH 256/813] [AST] Remove an unnecessary cast (NFC) (#149338) getFinallyStmt() already returns ObjCAtFinallyStmt *. 
--- clang/lib/AST/StmtPrinter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index be02bdde38a3d..6ba5ec89964a9 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -610,7 +610,7 @@ void StmtPrinter::VisitObjCAtTryStmt(ObjCAtTryStmt *Node) { } } - if (auto *FS = static_cast(Node->getFinallyStmt())) { + if (ObjCAtFinallyStmt *FS = Node->getFinallyStmt()) { Indent() << "@finally"; if (auto *CS = dyn_cast(FS->getFinallyBody())) { PrintRawCompoundStmt(CS); From be6893af87e7fb0b09ab9bb5360997f28150fd34 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 15:24:10 -0700 Subject: [PATCH 257/813] [CodeGen] Remove an unnecessary cast (NFC) (#149339) getExceptionMode() already returns LangOptions::FPExceptionModeKind. --- clang/lib/CodeGen/CodeGenFunction.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 776a646ceb32f..0fda31c8e5fa1 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -161,8 +161,7 @@ void CodeGenFunction::CGFPOptionsRAII::ConstructorHelper(FPOptions FPFeatures) { llvm::RoundingMode NewRoundingBehavior = FPFeatures.getRoundingMode(); CGF.Builder.setDefaultConstrainedRounding(NewRoundingBehavior); auto NewExceptionBehavior = - ToConstrainedExceptMD(static_cast( - FPFeatures.getExceptionMode())); + ToConstrainedExceptMD(FPFeatures.getExceptionMode()); CGF.Builder.setDefaultConstrainedExcept(NewExceptionBehavior); CGF.SetFastMathFlags(FPFeatures); From 2d7ff097f22660311a01c25e6ff001192dd385c4 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 15:24:18 -0700 Subject: [PATCH 258/813] [Sema] Remove unnecessary casts (NFC) (#149340) getArrayIndex(), getArrayRangeStart(), and getArrayRangeEnd() already return Expr *. 
--- clang/lib/Sema/SemaInit.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 95746b35f71ef..1c6f292454ed6 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -3572,7 +3572,7 @@ ExprResult Sema::ActOnDesignatedInitializer(Designation &Desig, Designators.push_back(ASTDesignator::CreateFieldDesignator( D.getFieldDecl(), D.getDotLoc(), D.getFieldLoc())); } else if (D.isArrayDesignator()) { - Expr *Index = static_cast(D.getArrayIndex()); + Expr *Index = D.getArrayIndex(); llvm::APSInt IndexValue; if (!Index->isTypeDependent() && !Index->isValueDependent()) Index = CheckArrayDesignatorExpr(*this, Index, IndexValue).get(); @@ -3584,8 +3584,8 @@ ExprResult Sema::ActOnDesignatedInitializer(Designation &Desig, InitExpressions.push_back(Index); } } else if (D.isArrayRangeDesignator()) { - Expr *StartIndex = static_cast(D.getArrayRangeStart()); - Expr *EndIndex = static_cast(D.getArrayRangeEnd()); + Expr *StartIndex = D.getArrayRangeStart(); + Expr *EndIndex = D.getArrayRangeEnd(); llvm::APSInt StartValue; llvm::APSInt EndValue; bool StartDependent = StartIndex->isTypeDependent() || From 2da59287aa22a81bb07da07bd1ff8759d08a3368 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 15:24:25 -0700 Subject: [PATCH 259/813] [Target] Remove unnecessary casts (NFC) (#149342) getFunction().getParent() already returns Module *. 
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index dfe6f65d240e6..27212fda7638c 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9308,7 +9308,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); case Intrinsic::amdgcn_reloc_constant: { - Module *M = const_cast(MF.getFunction().getParent()); + Module *M = MF.getFunction().getParent(); const MDNode *Metadata = cast(Op.getOperand(1))->getMD(); auto SymbolName = cast(Metadata->getOperand(0))->getString(); auto *RelocSymbol = cast( diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 65d1c4e2d6515..8d139883ef913 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3545,8 +3545,7 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, auto AFI = DAG.getMachineFunction().getInfo(); auto T = const_cast(CP->getType()); auto C = const_cast(CP->getConstVal()); - auto M = const_cast(DAG.getMachineFunction(). - getFunction().getParent()); + auto M = DAG.getMachineFunction().getFunction().getParent(); auto GV = new GlobalVariable( *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C, Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + From 100d8f7cc72328d7f0dfabb1128b35865fd653aa Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 17 Jul 2025 15:27:43 -0700 Subject: [PATCH 260/813] [clang][docs] Fix example in SanitizerSpecialCaseList.rst (#149244) As-ie example suppresses buffer overflow in malloc, and leave memory leak in place. It can be confusing. Fixes #62421. 
--- clang/docs/SanitizerSpecialCaseList.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/docs/SanitizerSpecialCaseList.rst b/clang/docs/SanitizerSpecialCaseList.rst index 2c50778d0f491..194f2fc5a7825 100644 --- a/clang/docs/SanitizerSpecialCaseList.rst +++ b/clang/docs/SanitizerSpecialCaseList.rst @@ -39,6 +39,7 @@ Example void bad_foo() { int *a = (int*)malloc(40); a[10] = 1; + free(a); } int main() { bad_foo(); } $ cat ignorelist.txt From 8f4deff5d51ac190e056a6738018fc8aa3114151 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Thu, 17 Jul 2025 23:50:39 +0100 Subject: [PATCH 261/813] [libcxx][fstream][NFC] Make __failed helper lambda a member function (#149390) This patch makes the `__failed` lambda a member function on `fstream`. This fixes two LLDB expression evaluation test failures that got introduced with https://github.com/llvm/llvm-project/pull/147389: ``` 16:22:51 ******************** 16:22:51 Unresolved Tests (2): 16:22:51 lldb-api :: commands/expression/import-std-module/list-dbg-info-content/TestDbgInfoContentListFromStdModule.py 16:22:51 lldb-api :: commands/expression/import-std-module/list/TestListFromStdModule.py ``` The expression evaluator is asserting in the Clang parser: ``` Assertion failed: (capture_size() == Class->capture_size() && "Wrong number of captures"), function LambdaExpr, file ExprCXX.cpp, line 1277. PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace. ``` Ideally we'd figure out why LLDB is falling over on this lambda. But to unblock CI for now, make this a member function. In the long run we should figure out the LLDB bug here so libc++ doesn't need to care about whether it uses lambdas like this or not. 
--- libcxx/include/fstream | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/libcxx/include/fstream b/libcxx/include/fstream index dc5c47304f014..6d3f20fff688f 100644 --- a/libcxx/include/fstream +++ b/libcxx/include/fstream @@ -401,6 +401,14 @@ private: } } } + + _LIBCPP_HIDE_FROM_ABI typename traits_type::int_type __overflow_failed() { + if (this->pptr() == this->epptr() + 1) { + this->pbump(-1); // lose the character we overflowed above -- we don't really have a + // choice since we couldn't commit the contents of the put area + } + return traits_type::eof(); + } }; template @@ -821,14 +829,6 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits> template typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits>::overflow(int_type __c) { - auto __failed = [this]() { - if (this->pptr() == this->epptr() + 1) { - this->pbump(-1); // lose the character we overflowed above -- we don't really have a - // choice since we couldn't commit the contents of the put area - } - return traits_type::eof(); - }; - if (__file_ == nullptr) return traits_type::eof(); __write_mode(); @@ -850,7 +850,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits> if (__always_noconv_) { size_t __n = static_cast(this->pptr() - this->pbase()); if (std::fwrite(this->pbase(), sizeof(char_type), __n, __file_) != __n) { - return __failed(); + return __overflow_failed(); } } else { if (!__cv_) @@ -864,14 +864,14 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits> do { codecvt_base::result __r = __cv_->out(__st_, __b, __p, __end, __extbuf_, __extbuf_ + __ebs_, __extbuf_end); if (__end == __b) { - return __failed(); + return __overflow_failed(); } // No conversion needed: output characters directly to the file, done. 
if (__r == codecvt_base::noconv) { size_t __n = static_cast(__p - __b); if (std::fwrite(__b, 1, __n, __file_) != __n) { - return __failed(); + return __overflow_failed(); } break; @@ -879,7 +879,7 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits> } else if (__r == codecvt_base::ok) { size_t __n = static_cast(__extbuf_end - __extbuf_); if (std::fwrite(__extbuf_, 1, __n, __file_) != __n) { - return __failed(); + return __overflow_failed(); } break; @@ -888,13 +888,13 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits> } else if (__r == codecvt_base::partial) { size_t __n = static_cast(__extbuf_end - __extbuf_); if (std::fwrite(__extbuf_, 1, __n, __file_) != __n) { - return __failed(); + return __overflow_failed(); } __b = const_cast(__end); continue; } else { - return __failed(); + return __overflow_failed(); } } while (true); } From 6ff471883f7e716fe2a993f4d393c65f003994b4 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Thu, 17 Jul 2025 16:06:06 -0700 Subject: [PATCH 262/813] [mlir][linalg] Improve linalg.pack consumer fusion. (#148993) If a dimension is not tiled, it is always valid to fuse the pack op, even if it has padding semantics. Because it always generates a full slice along the dimension. If a dimension is tiled and it does not need extra padding, the fusion is valid. The revision also formats corresponding tests for consistency. 
--------- Signed-off-by: hanhanW --- .../Linalg/Transforms/TilingInterfaceImpl.cpp | 53 +- .../tile-and-fuse-consumer.mlir | 451 +++++++++++++----- 2 files changed, 376 insertions(+), 128 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp index 513cecef29b61..5a10883a6043c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp @@ -20,6 +20,7 @@ #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/Interfaces/TilingInterface.h" #include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "llvm/Support/Debug.h" @@ -887,26 +888,55 @@ struct PackOpTiling ArrayRef offsets(allOffsets[0]); ArrayRef sizes(allSizes[0]); - auto packOp = cast(op); - // It is not trivial to infer dest tile from source tile if `packOp` has - // padding semantic. - if (packOp.getPaddingValue()) - return failure(); - Location loc = packOp.getLoc(); - SmallVector outerDimOffsets, outerDimSizes; DenseMap dimAndTileMapping = packOp.getDimAndTileMapping(); for (auto dim : llvm::seq(packOp.getSourceRank())) { if (dimAndTileMapping.count(dim)) { - FailureOr cstSize = + FailureOr cstTileSize = ValueBoundsConstraintSet::computeConstantBound( presburger::BoundType::UB, sizes[dim], /*stopCondition=*/nullptr, /*closedUB=*/true); std::optional cstInnerSize = getConstantIntValue(dimAndTileMapping[dim]); + + // If a dimension is not tiled, it is always valid to fuse the pack op, + // even if the op has padding semantics. Because it always generates a + // full slice along the dimension. + // TODO: It could be untiled if the `srcDimSize` is dynamic. It is a + // hard check to determine if a dimension is tiled or not. 
+ int64_t srcDimSize = packOp.getSourceType().getDimSize(dim); + int64_t destDimSize = packOp.getDestType().getDimSize(dim); + bool isTiled = failed(cstTileSize) || + ShapedType::isDynamic(srcDimSize) || + cstTileSize.value() != srcDimSize; + if (!isTiled) { + outerDimOffsets.push_back(offsets[dim]); + if (ShapedType::isStatic(destDimSize)) { + outerDimSizes.push_back(b.getIndexAttr(destDimSize)); + } else { + outerDimSizes.push_back( + b.createOrFold(loc, packOp.getDest(), dim)); + } + continue; + } + + // If the dimension needs padding, it is not supported because there are + // iterations that only write padding values to the whole tile. The + // consumer fusion is driven by the source, so it is not possible to map + // an empty slice to the tile. + bool needExtraPadding = + ShapedType::isDynamic(destDimSize) || !cstInnerSize || + destDimSize * cstInnerSize.value() != srcDimSize; + // Prioritize the case that the op already says that it does not need + // padding. + if (!packOp.getPaddingValue()) + needExtraPadding = false; + if (needExtraPadding) + return failure(); + // Currently fusing `packOp` as consumer only expects perfect tiling // scenario because even if without padding semantic, the `packOp` may // also yield incomplete tiles. E.g. tensor<30xf32> -> tensor<5x6xf32>, @@ -921,9 +951,9 @@ struct PackOpTiling // another word, we can only support tiling with consumer if the tile // size for the producer is a multiple of the inner tile size for the // packed dimensions at this moment. 
- if (failed(cstSize) || !cstInnerSize || *cstSize % *cstInnerSize != 0) { + if ((failed(cstTileSize) || !cstInnerSize || + *cstTileSize % *cstInnerSize != 0)) return failure(); - } using AV = affine::AffineValueExpr; affine::AffineBuilder ab(b, loc); @@ -988,7 +1018,8 @@ struct PackOpTiling loc, packOp.getDest(), outputOffsets, outputSizes, strides); tiledOperands.push_back(outSlice); - assert(!packOp.getPaddingValue() && "Expect no padding semantic"); + if (auto val = packOp.getPaddingValue()) + tiledOperands.push_back(val); for (auto tile : packOp.getInnerTiles()) tiledOperands.push_back(tile); diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index d09373bdb3f14..7b0a8494a8acb 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -193,33 +193,33 @@ module attributes {transform.with_named_sequence} { #map = affine_map<(d0, d1) -> (d0, d1)> module { - func.func @fuse_tileable_consumer_scf_forall_multi_yielding_consumer(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x64xf32>, %arg3: tensor<64x32xf32>) -> (tensor<64x64xf32>, tensor<2048xf32>) { - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %0:2 = scf.forall (%arg4, %arg5) in (2, 2) shared_outs(%arg6 = %arg3, %arg7 = %arg2) -> (tensor<64x32xf32>, tensor<64x64xf32>) { - %extracted_slice = tensor.extract_slice %arg6[%arg4, %arg5] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> - %extracted_slice_0 = tensor.extract_slice %arg7[%arg4, %arg5] [32, 32] [1, 1] : tensor<64x64xf32> to tensor<32x32xf32> - %6 = linalg.matmul ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) -> tensor<32x32xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %6 into %arg7[%arg4, %arg5] [32, 32] [1, 1] : 
tensor<32x32xf32> into tensor<64x64xf32> - tensor.parallel_insert_slice %extracted_slice_0 into %arg6[%arg4, %arg5] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> - } + func.func @fuse_tileable_consumer_scf_forall_multi_yielding_consumer(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x64xf32>, %arg3: tensor<64x32xf32>) -> (tensor<64x64xf32>, tensor<2048xf32>) { + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %0:2 = scf.forall (%arg4, %arg5) in (2, 2) shared_outs(%arg6 = %arg3, %arg7 = %arg2) -> (tensor<64x32xf32>, tensor<64x64xf32>) { + %extracted_slice = tensor.extract_slice %arg6[%arg4, %arg5] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> + %extracted_slice_0 = tensor.extract_slice %arg7[%arg4, %arg5] [32, 32] [1, 1] : tensor<64x64xf32> to tensor<32x32xf32> + %6 = linalg.matmul ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) -> tensor<32x32xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %6 into %arg7[%arg4, %arg5] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x64xf32> + tensor.parallel_insert_slice %extracted_slice_0 into %arg6[%arg4, %arg5] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> } - %1 = tensor.empty() : tensor<64x64xf32> - %2 = tensor.empty() : tensor<64x64xf32> - %3 = tensor.empty() : tensor<64x64xf32> - %4:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%0#1, %1 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%2, %3 : tensor<64x64xf32>, tensor<64x64xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32, %out_1: f32): - %6 = arith.mulf %in, %in_0 : f32 - %7 = arith.subf %out, %6 : f32 - %8 = arith.addf %out_1, %in : f32 - linalg.yield %7, %8 : f32, f32 - } -> (tensor<64x64xf32>, tensor<64x64xf32>) - %5 = tensor.empty() : tensor<2048xf32> - %unpack = linalg.unpack %0#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles 
= [32] into %5 : tensor<64x32xf32> -> tensor<2048xf32> - return %4#1, %unpack : tensor<64x64xf32>, tensor<2048xf32> } + %1 = tensor.empty() : tensor<64x64xf32> + %2 = tensor.empty() : tensor<64x64xf32> + %3 = tensor.empty() : tensor<64x64xf32> + %4:2 = linalg.generic {indexing_maps = [#map, #map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%0#1, %1 : tensor<64x64xf32>, tensor<64x64xf32>) outs(%2, %3 : tensor<64x64xf32>, tensor<64x64xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32, %out_1: f32): + %6 = arith.mulf %in, %in_0 : f32 + %7 = arith.subf %out, %6 : f32 + %8 = arith.addf %out_1, %in : f32 + linalg.yield %7, %8 : f32, f32 + } -> (tensor<64x64xf32>, tensor<64x64xf32>) + %5 = tensor.empty() : tensor<2048xf32> + %unpack = linalg.unpack %0#0 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %5 : tensor<64x32xf32> -> tensor<2048xf32> + return %4#1, %unpack : tensor<64x64xf32>, tensor<2048xf32> + } } module attributes {transform.with_named_sequence} { @@ -269,38 +269,38 @@ module attributes {transform.with_named_sequence} { #map = affine_map<(d0, d1) -> (d0, d1)> module { - func.func @fuse_unpack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<2048xf32> { - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %1 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 32) step (32, 32) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { - %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> - %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { - ^bb0(%in: f32, %in_16: f32, %out: f32): - %13 = arith.mulf %in, %in_16 : f32 - %14 = arith.addf %out, %13 : f32 - linalg.yield %14 : f32 - } -> tensor<32x32xf32> - 
scf.forall.in_parallel { - tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> - } - } - %output = tensor.empty() : tensor<2048xf32> - %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2048xf32> - return %unpack : tensor<2048xf32> + func.func @fuse_unpack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<2048xf32> { + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %1 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 32) step (32, 32) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { + %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { + ^bb0(%in: f32, %in_16: f32, %out: f32): + %13 = arith.mulf %in, %in_16 : f32 + %14 = arith.addf %out, %13 : f32 + linalg.yield %14 : f32 + } -> tensor<32x32xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> + } } + %output = tensor.empty() : tensor<2048xf32> + %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2048xf32> + return %unpack : tensor<2048xf32> + } } - + module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { - %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %loop = transform.structured.match ops{["scf.forall"]} in %arg1 - : (!transform.any_op) -> 
!transform.any_op - %a, %b = transform.test.fuse_consumer %slice_op in (%loop) - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.yield - } + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %loop = transform.structured.match ops{["scf.forall"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %a, %b = transform.test.fuse_consumer %slice_op in (%loop) + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } } // CHECK-DAG: #[[UNPACK_RESULT_OFFSET_MAP:.*]] = affine_map<(d0) -> (d0 * 32)> // CHECK-DAG: #[[UNPACK_RESULT_SIZE_MAP:.*]] = affine_map<(d0) -> (1024, d0 * -32 + 2048)> @@ -332,38 +332,38 @@ module attributes {transform.with_named_sequence} { #map = affine_map<(d0, d1) -> (d0, d1)> module { - func.func @fuse_unaligned_unpack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<2047xf32> { - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %1 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 32) step (32, 32) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { - %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> - %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { - ^bb0(%in: f32, %in_16: f32, %out: f32): - %13 = arith.mulf %in, %in_16 : f32 - %14 = arith.addf %out, %13 : f32 - linalg.yield %14 : f32 - } -> tensor<32x32xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into 
tensor<64x32xf32> - } - } - %output = tensor.empty() : tensor<2047xf32> - %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2047xf32> - return %unpack : tensor<2047xf32> + func.func @fuse_unaligned_unpack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<2047xf32> { + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %1 = scf.forall (%arg3, %arg4) = (0, 0) to (64, 32) step (32, 32) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { + %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { + ^bb0(%in: f32, %in_16: f32, %out: f32): + %13 = arith.mulf %in, %in_16 : f32 + %14 = arith.addf %out, %13 : f32 + linalg.yield %14 : f32 + } -> tensor<32x32xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> + } } + %output = tensor.empty() : tensor<2047xf32> + %unpack = linalg.unpack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %output : tensor<64x32xf32> -> tensor<2047xf32> + return %unpack : tensor<2047xf32> + } } - + module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { - %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %loop = transform.structured.match ops{["scf.forall"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %a, %b = transform.test.fuse_consumer %slice_op in (%loop) - : (!transform.any_op, 
!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.yield - } + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %loop = transform.structured.match ops{["scf.forall"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %a, %b = transform.test.fuse_consumer %slice_op in (%loop) + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } } // CHECK-DAG: #[[UNPACK_RESULT_OFFSET_MAP:.*]] = affine_map<(d0) -> (d0 * 32)> // CHECK-DAG: #[[UNPACK_RESULT_SIZE_MAP:.*]] = affine_map<(d0) -> (1024, d0 * -32 + 2047)> @@ -395,46 +395,46 @@ module attributes {transform.with_named_sequence} { #map = affine_map<(d0, d1) -> (d0, d1)> module { - func.func @fuse_pack_consumer_into_scf_forall(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<4x32x16xf32> { - %c4 = arith.constant 4 : index - %c64 = arith.constant 64 : index - %c0 = arith.constant 0 : index - %1 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { - %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> - %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { - ^bb0(%in: f32, %in_16: f32, %out: f32): - %13 = arith.mulf %in, %in_16 : f32 - %14 = arith.addf %out, %13 : f32 - linalg.yield %14 : f32 - } -> tensor<32x32xf32> - scf.forall.in_parallel { - tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> - } - } - %output = tensor.empty() : tensor<4x32x16xf32> - %pack = linalg.pack %1 inner_dims_pos = [0] inner_tiles = [16] into %output 
: tensor<64x32xf32> -> tensor<4x32x16xf32> - return %pack : tensor<4x32x16xf32> + func.func @fuse_perfect_tiling_pack_consumer(%arg0: tensor<32x32xf32>, %arg1: tensor<32x32xf32>, %arg2: tensor<64x32xf32>) -> tensor<4x32x16xf32> { + %c4 = arith.constant 4 : index + %c64 = arith.constant 64 : index + %c0 = arith.constant 0 : index + %1 = scf.forall (%arg3, %arg4) in (2, 1) shared_outs(%arg5 = %arg2) -> (tensor<64x32xf32>) { + %extracted_slice = tensor.extract_slice %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<64x32xf32> to tensor<32x32xf32> + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg0, %arg1 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%extracted_slice : tensor<32x32xf32>) { + ^bb0(%in: f32, %in_16: f32, %out: f32): + %13 = arith.mulf %in, %in_16 : f32 + %14 = arith.addf %out, %13 : f32 + linalg.yield %14 : f32 + } -> tensor<32x32xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %3 into %arg5[%arg3, %arg4] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<64x32xf32> + } } + %output = tensor.empty() : tensor<4x32x16xf32> + %pack = linalg.pack %1 inner_dims_pos = [0] inner_tiles = [16] into %output : tensor<64x32xf32> -> tensor<4x32x16xf32> + return %pack : tensor<4x32x16xf32> + } } - + module attributes {transform.with_named_sequence} { - transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { - %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %loop = transform.structured.match ops{["scf.forall"]} in %arg1 - : (!transform.any_op) -> !transform.any_op - %a, %b = transform.test.fuse_consumer %slice_op in (%loop) - : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.yield - } + transform.named_sequence @__transform_main(%arg1 : !transform.any_op {transform.readonly}) { + %slice_op = transform.structured.match 
ops{["tensor.parallel_insert_slice"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %loop = transform.structured.match ops{["scf.forall"]} in %arg1 + : (!transform.any_op) -> !transform.any_op + %a, %b = transform.test.fuse_consumer %slice_op in (%loop) + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } } // CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> -// CHECK: func.func @fuse_pack_consumer_into_scf_forall( +// CHECK: func.func @fuse_perfect_tiling_pack_consumer( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<32x32xf32> // CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<32x32xf32> // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<64x32xf32>) // CHECK: %[[OUT_INIT:.*]] = tensor.empty() : tensor<4x32x16xf32> -// CHECK: %[[FINAL_RESULT:.*]]:2 = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) in (2, 2) +// CHECK: %[[FINAL_RESULT:.*]]:2 = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) in (2, 1) // CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG2]], %[[PACK_OUT_ARG:.*]] = %[[OUT_INIT]]) // CHECK-SAME: { // CHECK: %[[GENERIC_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][%[[IV1]], %[[IV2]]] [32, 32] [1, 1] @@ -451,6 +451,223 @@ module attributes {transform.with_named_sequence} { // ----- +// It is valid to fuse the pack op in perfect tiling scenario when the dimension +// is dynamic and padding is not needed. 
+ +func.func @fuse_pack_consumer_with_no_pad_dynamic_dim(%arg0: tensor<64x?xf32>, %arg1: tensor<64x?xf32>, %1: tensor<64x?x16xf32>) -> tensor<64x?x16xf32> { + %c1 = arith.constant 1 : index + %d1 = tensor.dim %arg0, %c1 : tensor<64x?xf32> + %0 = scf.forall (%arg2) = (0) to (%d1) step (16) shared_outs(%arg3 = %arg1) -> (tensor<64x?xf32>) { + %src = tensor.extract_slice %arg0[0, %arg2] [64, 16] [1, 1] : tensor<64x?xf32> to tensor<64x16xf32> + %dest = tensor.extract_slice %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x?xf32> to tensor<64x16xf32> + %2 = linalg.exp ins(%src : tensor<64x16xf32>) outs(%dest : tensor<64x16xf32>) -> tensor<64x16xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %2 into %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x?xf32> + } + } + %pack = linalg.pack %0 inner_dims_pos = [1] inner_tiles = [16] into %1 : tensor<64x?xf32> -> tensor<64x?x16xf32> + return %pack : tensor<64x?x16xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK: func.func @fuse_pack_consumer_with_no_pad_dynamic_dim( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] +// CHECK: %{{.*}}:2 = scf.forall (%[[IV:.*]]) = (0) to (%{{.+}}) step (16) +// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG1]], %[[PACK_OUT_ARG:.*]] = %[[ARG2]]) +// CHECK: %[[ELEM_SRC:.*]] = tensor.extract_slice 
%[[ARG0]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM_DEST:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM:.*]] = linalg.exp +// CHECK-SAME: ins(%[[ELEM_SRC]] +// CHECK-SAME: outs(%[[ELEM_DEST]] +// CHECK-DAG: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV]]) +// CHECK-DAG: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0] [64, 1, 16] [1, 1, 1] +// CHECK: %[[PACK:.*]] = linalg.pack %[[ELEM]] +// CHECK-SAME: inner_dims_pos = [1] inner_tiles = [16] +// CHECK-SAME: into %[[TILED_PACK_DEST]] +// CHECK: scf.forall.in_parallel { +// CHECK: tensor.parallel_insert_slice %[[ELEM]] into %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[PACK]] into %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0] [64, 1, 16] [1, 1, 1] + +// ----- + +// It is valid to fuse the pack op with padding semantics if the tiled +// dimensions do not need padding. + +func.func @fuse_pack_consumer_with_padding_semantics(%arg0: tensor<64x32xf32>, %arg1: tensor<64x32xf32>) -> tensor<22x2x3x16xf32> { + %0 = scf.forall (%arg2) = (0) to (32) step (16) shared_outs(%arg3 = %arg1) -> (tensor<64x32xf32>) { + %src = tensor.extract_slice %arg0[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %dest = tensor.extract_slice %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %2 = linalg.exp ins(%src : tensor<64x16xf32>) outs(%dest : tensor<64x16xf32>) -> tensor<64x16xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %2 into %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x32xf32> + } + } + %1 = tensor.empty() : tensor<22x2x3x16xf32> + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %0 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [3, 16] into %1 : tensor<64x32xf32> -> tensor<22x2x3x16xf32> + return %pack : tensor<22x2x3x16xf32> +} + +module attributes 
{transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK: func.func @fuse_pack_consumer_with_padding_semantics( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[OUT_INIT:.*]] = tensor.empty() : tensor<22x2x3x16xf32> +// CHECK-DAG: %[[PAD_VAL:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %{{.*}}:2 = scf.forall (%[[IV:.*]]) = (0) to (32) step (16) +// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG1]], %[[PACK_OUT_ARG:.*]] = %[[OUT_INIT]]) +// CHECK: %[[ELEM_SRC:.*]] = tensor.extract_slice %[[ARG0]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM_DEST:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM:.*]] = linalg.exp +// CHECK-SAME: ins(%[[ELEM_SRC]] +// CHECK-SAME: outs(%[[ELEM_DEST]] +// CHECK-DAG: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV]]) +// CHECK-DAG: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0, 0] [22, 1, 3, 16] [1, 1, 1, 1] +// CHECK: %[[TILED_PACK_OUT:.*]] = linalg.pack %[[ELEM]] +// CHECK-SAME: padding_value(%[[PAD_VAL]] : f32) +// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [3, 16] +// CHECK-SAME: into %[[TILED_PACK_DEST]] +// CHECK: scf.forall.in_parallel { +// CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: tensor.parallel_insert_slice 
%[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0, 0] [22, 1, 3, 16] [1, 1, 1, 1] + +// ----- + +// It is valid to fuse the pack if the dimension is not tiled even when it needs +// extra padding. + +func.func @fuse_pack_consumer_with_untiled_extra_padding(%arg0: tensor<64x32xf32>, %arg1: tensor<64x32xf32>) -> tensor<33x2x3x16xf32> { + %0 = scf.forall (%arg2) = (0) to (32) step (16) shared_outs(%arg3 = %arg1) -> (tensor<64x32xf32>) { + %src = tensor.extract_slice %arg0[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %dest = tensor.extract_slice %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %2 = linalg.exp ins(%src : tensor<64x16xf32>) outs(%dest : tensor<64x16xf32>) -> tensor<64x16xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %2 into %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x32xf32> + } + } + %1 = tensor.empty() : tensor<33x2x3x16xf32> + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %0 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [3, 16] into %1 : tensor<64x32xf32> -> tensor<33x2x3x16xf32> + return %pack : tensor<33x2x3x16xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK: func.func @fuse_pack_consumer_with_untiled_extra_padding( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK-DAG: 
%[[OUT_INIT:.*]] = tensor.empty() : tensor<33x2x3x16xf32> +// CHECK-DAG: %[[PAD_VAL:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %{{.*}}:2 = scf.forall (%[[IV:.*]]) = (0) to (32) step (16) +// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG1]], %[[PACK_OUT_ARG:.*]] = %[[OUT_INIT]]) +// CHECK: %[[ELEM_SRC:.*]] = tensor.extract_slice %[[ARG0]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM_DEST:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM:.*]] = linalg.exp +// CHECK-SAME: ins(%[[ELEM_SRC]] +// CHECK-SAME: outs(%[[ELEM_DEST]] +// CHECK-DAG: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV]]) +// CHECK-DAG: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0, 0] [33, 1, 3, 16] [1, 1, 1, 1] +// CHECK: %[[TILED_PACK_OUT:.*]] = linalg.pack %[[ELEM]] +// CHECK-SAME: padding_value(%[[PAD_VAL]] : f32) +// CHECK-SAME: inner_dims_pos = [0, 1] inner_tiles = [3, 16] +// CHECK-SAME: into %[[TILED_PACK_DEST]] +// CHECK: scf.forall.in_parallel { +// CHECK: tensor.parallel_insert_slice %[[GENERIC_OUT]] into %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[TILED_PACK_OUT]] into %[[PACK_OUT_ARG]][0, %[[PACK_RESULT_OFFSET]], 0, 0] [33, 1, 3, 16] [1, 1, 1, 1] + +// ----- + +// If the dimension is tiled and it needs extra padding, do not fuse the pack +// op. 
+ +func.func @nofuse_pack_consumer_with_extra_padding(%arg0: tensor<64x32xf32>, %arg1: tensor<64x32xf32>) -> tensor<23x32x3x16xf32> { + %0 = scf.forall (%arg2) = (0) to (32) step (16) shared_outs(%arg3 = %arg1) -> (tensor<64x32xf32>) { + %src = tensor.extract_slice %arg0[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %dest = tensor.extract_slice %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %2 = linalg.exp ins(%src : tensor<64x16xf32>) outs(%dest : tensor<64x16xf32>) -> tensor<64x16xf32> + scf.forall.in_parallel { + // expected-error @below {{failed to fuse consumer of slice}} + tensor.parallel_insert_slice %2 into %arg3[0, %arg2] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x32xf32> + } + } + %1 = tensor.empty() : tensor<23x32x3x16xf32> + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %0 padding_value(%cst : f32) inner_dims_pos = [0, 1] inner_tiles = [3, 16] into %1 : tensor<64x32xf32> -> tensor<23x32x3x16xf32> + return %pack : tensor<23x32x3x16xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + +// Imperfect tiling is not supported in pack op consumer fusion. 
+ +#map = affine_map<(d0) -> (d0 * 5)> +#map1 = affine_map<(d0) -> (d0)> +func.func @nofuse_pack_with_imperfect_tiling(%arg0: tensor<30xf32>) -> tensor<5x6xf32> { + %0 = tensor.empty() : tensor<30xf32> + %1 = scf.forall (%arg1) in (6) shared_outs(%arg2 = %0) -> (tensor<30xf32>) { + %3 = affine.apply #map(%arg1) + %extracted_slice = tensor.extract_slice %arg0[%3] [5] [1] : tensor<30xf32> to tensor<5xf32> + %extracted_slice_0 = tensor.extract_slice %arg2[%3] [5] [1] : tensor<30xf32> to tensor<5xf32> + %4 = linalg.generic {indexing_maps = [#map1, #map1], iterator_types = ["parallel"]} ins(%extracted_slice : tensor<5xf32>) outs(%extracted_slice_0 : tensor<5xf32>) { + ^bb0(%in: f32, %out: f32): + %5 = arith.addf %in, %in : f32 + linalg.yield %5 : f32 + } -> tensor<5xf32> + scf.forall.in_parallel { + // expected-error @below {{failed to fuse consumer of slice}} + tensor.parallel_insert_slice %4 into %arg2[%3] [5] [1] : tensor<5xf32> into tensor<30xf32> + } + } + %2 = tensor.empty() : tensor<5x6xf32> + %pack = linalg.pack %1 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [6] into %2 : tensor<30xf32> -> tensor<5x6xf32> + return %pack : tensor<5x6xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} + +// ----- + module { func.func @fuse_add_multiple_tilable_consumers(%arg0: tensor<256x256xf32>, %arg1: tensor<256x256xf32>, %arg2: tensor<256x256xf32>) -> (tensor<256x256xf32>, tensor<256x256xf32>) { %c0 = arith.constant 0 : index @@ -489,7 +706,7 @@ module 
attributes {transform.with_named_sequence} { // CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<256x256xf32> // CHECK: %[[dest0:.*]] = tensor.empty() : tensor<256x256xf32> // CHECK: %[[LOOP_RESULT:.*]]:3 = scf.for %[[IV1:.*]] = %[[C0]] -// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG:.*]] = %[[dest0]], %[[SECOND_OUT_ARG:.*]] = %[[dest0]], %[[THIRD_OUT_ARG:.*]] = %[[dest0]]) +// CHECK-SAME: iter_args(%[[FIRST_OUT_ARG:.*]] = %[[dest0]], %[[SECOND_OUT_ARG:.*]] = %[[dest0]], %[[THIRD_OUT_ARG:.*]] = %[[dest0]]) // CHECK-SAME: { // CHECK: %[[ADD_OUT_SLICE:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][%[[IV1]], 0] [64, 256] [1, 1] // CHECK: %[[ADD_INS0_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[IV1]], 0] [64, 256] [1, 1] @@ -645,7 +862,7 @@ func.func @multi_slice_fusion1(%arg0 : tensor, %arg1 : tensor, % scf.forall.in_parallel { tensor.parallel_insert_slice %generic#0 into %init0[%iv0] [%tilesize] [1] : tensor into tensor tensor.parallel_insert_slice %generic#1 into %init1[%iv0] [%tilesize] [1] : tensor into tensor - } + } } %empty = tensor.empty(%dim0) : tensor %result = linalg.generic { @@ -719,7 +936,7 @@ func.func @multi_slice_fusion2(%arg0 : tensor, %arg1 : tensor, % scf.forall.in_parallel { tensor.parallel_insert_slice %generic0 into %init0[%iv0] [%tilesize] [1] : tensor into tensor tensor.parallel_insert_slice %generic1 into %init1[%iv0] [%tilesize] [1] : tensor into tensor - } + } } %empty = tensor.empty(%dim0) : tensor %result = linalg.generic { From c15a50ad22efde55f2db46ca389735cacb92a2ad Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 17 Jul 2025 16:10:59 -0700 Subject: [PATCH 263/813] [AMDGPU] More flatGVS gfx1250 patterns (#149410) --- llvm/lib/Target/AMDGPU/FLATInstructions.td | 33 +- .../inst-select-load-atomic-flat.mir | 30 +- .../inst-select-load-atomic-global.mir | 24 +- .../inst-select-store-atomic-flat.mir | 22 +- llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll | 322 ++++-------------- 5 files changed, 133 insertions(+), 298 deletions(-) diff 
--git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c84962b52befd..c8a4e22ed1dae 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1724,6 +1724,7 @@ let OtherPredicates = [D16PreservesUnusedBits, HasFlatAddressSpace], True16Predi defm : FlatLoadPats ; defm : FlatLoadPats ; +defm : FlatLoadPats ; defm : FlatStorePats ; defm : FlatStorePats ; @@ -1735,7 +1736,7 @@ defm : FlatStorePats ; foreach vt = VReg_64.RegTypes in { defm : FlatStorePats ; -def : FlatLoadPat ; +defm : FlatLoadPats ; } defm : FlatStorePats ; @@ -1747,6 +1748,7 @@ defm : FlatStorePats ; defm : FlatStorePats ; defm : FlatStorePats ; +defm : FlatStorePats ; defm : FlatStorePats ; defm : FlatStorePats ; @@ -1792,6 +1794,9 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>; } // end foreach as +defm : FlatStorePats ; +defm : FlatStorePats ; + let SubtargetPredicate = isGFX12Plus in { defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >; @@ -1806,19 +1811,19 @@ defm : FlatStorePats ; let OtherPredicates = [D16PreservesUnusedBits] in { // TODO: Handle atomic loads -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; -def : FlatLoadPat_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; +defm : FlatLoadPats_D16 ; } } // End OtherPredicates = [HasFlatAddressSpace] @@ -1890,6 +1895,7 @@ defm : GlobalFLATStorePats ; // appropriate waits. 
defm : GlobalFLATLoadPats ; defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; @@ -1929,6 +1935,7 @@ defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>; defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir index cebdffc74847c..eba64b853ac05 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-flat.mir @@ -223,37 +223,37 @@ body: | ; GFX7-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>)) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX9-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>)) + ; GFX9-NEXT: 
$vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX10-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>)) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX11-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX11: liveins: $vgpr0_vgpr1 ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX11-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) - ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX11-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>)) + ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX12-LABEL: name: load_atomic_flat_v2s32_seq_cst ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} - ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p0) :: (load seq_cst (<2 x s32>)) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX12-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>)) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load seq_cst (<2 x s32>), align 8, addrspace 0) $vgpr0_vgpr1 = COPY %1 diff 
--git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir index eafc96dd32bdd..474f1308d8e24 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-atomic-global.mir @@ -252,30 +252,30 @@ body: | ; GFX7-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) - ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>), addrspace 1) + ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX7-FLAT-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX7-FLAT: liveins: $vgpr0_vgpr1 ; GFX7-FLAT-NEXT: {{ $}} - ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX7-FLAT-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) - ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX7-FLAT-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-FLAT-NEXT: [[FLAT_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = FLAT_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (load seq_cst (<2 x s32>), addrspace 1) + ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_LOAD_DWORDX2_]] ; ; GFX9-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) - ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x 
s32>) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load seq_cst (<2 x s32>), addrspace 1) + ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] ; ; GFX10-LABEL: name: load_atomic_global_v2s32_seq_cst ; GFX10: liveins: $vgpr0_vgpr1 ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p1) :: (load seq_cst (<2 x s32>), addrspace 1) - ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; GFX10-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX10-NEXT: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[COPY]], 0, 0, implicit $exec :: (load seq_cst (<2 x s32>), addrspace 1) + ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[GLOBAL_LOAD_DWORDX2_]] %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load seq_cst (<2 x s32>), align 8, addrspace 1) $vgpr0_vgpr1 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir index 2675295ea98ed..ae010a872a41d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-atomic-flat.mir @@ -22,6 +22,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2 ; GFX7-NEXT: FLAT_STORE_DWORD [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst (s32)) + ; ; GFX9-LABEL: name: atomic_store_flat_s32_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -51,6 +52,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s16>) = COPY $vgpr0 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr1_vgpr2 ; GFX7-NEXT: G_STORE [[COPY]](<2 x s16>), [[COPY1]](p0) :: (store seq_cst (<2 x s16>)) + ; ; GFX9-LABEL: 
name: atomic_store_flat_v2s16_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -80,6 +82,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr1_vgpr2 ; GFX7-NEXT: G_STORE [[COPY]](p3), [[COPY1]](p0) :: (store seq_cst (p3)) + ; ; GFX9-LABEL: name: atomic_store_flat_p3_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -109,6 +112,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p5) = COPY $vgpr0 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr1_vgpr2 ; GFX7-NEXT: G_STORE [[COPY]](p5), [[COPY1]](p0) :: (store seq_cst (p5)) + ; ; GFX9-LABEL: name: atomic_store_flat_p5_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -138,6 +142,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p6) = COPY $vgpr0 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr1_vgpr2 ; GFX7-NEXT: G_STORE [[COPY]](p6), [[COPY1]](p0) :: (store seq_cst (p6)) + ; ; GFX9-LABEL: name: atomic_store_flat_p6_seq_cst ; GFX9: liveins: $vgpr0, $vgpr1_vgpr2 ; GFX9-NEXT: {{ $}} @@ -167,6 +172,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst (s64)) + ; ; GFX9-LABEL: name: atomic_store_flat_s64_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -193,15 +199,16 @@ body: | ; GFX7-LABEL: name: atomic_store_flat_v2s32_seq_cst ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX7-NEXT: {{ $}} - ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 - ; GFX7-NEXT: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p0) :: (store seq_cst (<2 x s32>)) + ; GFX7-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX7-NEXT: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, 
implicit $exec, implicit $flat_scr :: (store seq_cst (<2 x s32>)) + ; ; GFX9-LABEL: name: atomic_store_flat_v2s32_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} - ; GFX9-NEXT: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p0) :: (store seq_cst (<2 x s32>)) + ; GFX9-NEXT: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX9-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr2_vgpr3 + ; GFX9-NEXT: FLAT_STORE_DWORDX2 [[COPY1]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (store seq_cst (<2 x s32>)) %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 %1:vgpr(p0) = COPY $vgpr2_vgpr3 G_STORE %0, %1 :: (store seq_cst (<2 x s32>), align 8, addrspace 0) @@ -225,6 +232,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 ; GFX7-NEXT: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p0) :: (store seq_cst (<4 x s16>)) + ; ; GFX9-LABEL: name: atomic_store_flat_v4s16_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -254,6 +262,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p0) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 ; GFX7-NEXT: G_STORE [[COPY]](p0), [[COPY1]](p0) :: (store seq_cst (p0)) + ; ; GFX9-LABEL: name: atomic_store_flat_p0_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} @@ -282,6 +291,7 @@ body: | ; GFX7-NEXT: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 ; GFX7-NEXT: [[COPY1:%[0-9]+]]:vgpr(p0) = COPY $vgpr2_vgpr3 ; GFX7-NEXT: G_STORE [[COPY]](p1), [[COPY1]](p0) :: (store seq_cst (p1)) + ; ; GFX9-LABEL: name: atomic_store_flat_p1_seq_cst ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 ; GFX9-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll index f0988a17b35f0..f54fbbaabe9f5 100644 --- 
a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll @@ -998,24 +998,11 @@ define amdgpu_ps <2 x half> @flat_load_saddr_p3_immneg128(ptr inreg %sbase, i32 } define amdgpu_ps <2 x float> @flat_load_saddr_f64(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_f64: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_f64: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_f64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %load = load double, ptr %gep0 @@ -1024,24 +1011,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_f64(ptr inreg %sbase, i32 %voffset } define amdgpu_ps <2 x float> @flat_load_saddr_f64_immneg128(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_f64_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 
-; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_f64_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_f64_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1051,24 +1025,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_f64_immneg128(ptr inreg %sbase, i3 } define amdgpu_ps <2 x float> @flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i64: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_i64: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: 
s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %load = load i64, ptr %gep0 @@ -1077,24 +1038,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset } define amdgpu_ps <2 x float> @flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_i64_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_i64_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_i64_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1104,24 +1052,11 @@ define amdgpu_ps <2 x float> 
@flat_load_saddr_i64_immneg128(ptr inreg %sbase, i3 } define amdgpu_ps <2 x float> @flat_load_saddr_v2f32(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_v2f32: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_v2f32: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_v2f32: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %load = load <2 x float>, ptr %gep0 @@ -1129,24 +1064,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2f32(ptr inreg %sbase, i32 %voffs } define amdgpu_ps <2 x float> @flat_load_saddr_v2f32_immneg128(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_v2f32_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: 
flat_load_saddr_v2f32_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_v2f32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1155,24 +1077,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2f32_immneg128(ptr inreg %sbase, } define amdgpu_ps <2 x float> @flat_load_saddr_v2i32(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_v2i32: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_v2i32: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_v2i32: +; 
GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %load = load <2 x i32>, ptr %gep0 @@ -1181,24 +1090,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2i32(ptr inreg %sbase, i32 %voffs } define amdgpu_ps <2 x float> @flat_load_saddr_v2i32_immneg128(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_v2i32_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_v2i32_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_v2i32_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1208,24 +1104,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2i32_immneg128(ptr inreg %sbase, } define amdgpu_ps <2 x float> @flat_load_saddr_v4i16(ptr inreg %sbase, i32 %voffset) { 
-; GFX1250-SDAG-LABEL: flat_load_saddr_v4i16: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_v4i16: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_v4i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %load = load <4 x i16>, ptr %gep0 @@ -1234,24 +1117,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4i16(ptr inreg %sbase, i32 %voffs } define amdgpu_ps <2 x float> @flat_load_saddr_v4i16_immneg128(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_v4i16_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_v4i16_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_v4i16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1261,24 +1131,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4i16_immneg128(ptr inreg %sbase, } define amdgpu_ps <2 x float> @flat_load_saddr_v4f16(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_v4f16: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_v4f16: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_v4f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to 
shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %load = load <4 x half>, ptr %gep0 @@ -1287,24 +1144,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4f16(ptr inreg %sbase, i32 %voffs } define amdgpu_ps <2 x float> @flat_load_saddr_v4f16_immneg128(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_v4f16_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_v4f16_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_v4f16_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 @@ -1314,24 +1158,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4f16_immneg128(ptr inreg %sbase, } define amdgpu_ps <2 x float> @flat_load_saddr_p1(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_p1: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_p1: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_p1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %load = load ptr, ptr %gep0 @@ -1341,24 +1172,11 @@ define amdgpu_ps <2 x float> @flat_load_saddr_p1(ptr inreg %sbase, i32 %voffset) } define amdgpu_ps <2 x float> @flat_load_saddr_p1_immneg128(ptr inreg %sbase, i32 %voffset) { -; GFX1250-SDAG-LABEL: flat_load_saddr_p1_immneg128: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] -; GFX1250-SDAG-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-SDAG-NEXT: ; return to shader part epilog -; -; GFX1250-GISEL-LABEL: flat_load_saddr_p1_immneg128: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0 -; GFX1250-GISEL-NEXT: 
v_add_co_ci_u32_e64 v1, null, 0, v3, vcc_lo -; GFX1250-GISEL-NEXT: flat_load_b64 v[0:1], v[0:1] offset:-128 -; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX1250-GISEL-NEXT: ; return to shader part epilog +; GFX1250-LABEL: flat_load_saddr_p1_immneg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_load_b64 v[0:1], v0, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 From 0b6df5485ef77e76fcb09a349b5e1c39d926de5f Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 17 Jul 2025 16:11:25 -0700 Subject: [PATCH 264/813] [AMDGPU] Reenable tanh real-true16 run line. NFC. (#149411) --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll index 91a2a0b651132..81db7354757d9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll @@ -1,10 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; xUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=SDAG-FAKE16 %s ; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GISEL-REAL16 %s ; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GISEL-FAKE16 %s -; FIXME: t16 doesn't work at the 
moment because the store of s16 under t16 mode fails to select. ; FIXME: GlobalISel does not work with bf16 declare float @llvm.amdgcn.tanh.f32(float) #0 From cf36f49c042f93e4e204ee434173f1c6a6ad4cac Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 18 Jul 2025 07:50:35 +0800 Subject: [PATCH 265/813] [libclc] Enable `clang fp reciprocal` in clc_native_divide/recip/rsqrt/tan (#149269) The pragma adds `arcp` flag to `fdiv` instruction in these functions. The flag can provide better performance. --- libclc/clc/lib/generic/math/clc_native_divide.inc | 1 + libclc/clc/lib/generic/math/clc_native_recip.inc | 1 + libclc/clc/lib/generic/math/clc_native_rsqrt.inc | 1 + libclc/clc/lib/generic/math/clc_native_tan.inc | 1 + 4 files changed, 4 insertions(+) diff --git a/libclc/clc/lib/generic/math/clc_native_divide.inc b/libclc/clc/lib/generic/math/clc_native_divide.inc index fdf1794812c5a..dac176fb986bd 100644 --- a/libclc/clc/lib/generic/math/clc_native_divide.inc +++ b/libclc/clc/lib/generic/math/clc_native_divide.inc @@ -8,5 +8,6 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_divide(__CLC_GENTYPE x, __CLC_GENTYPE y) { + _Pragma("clang fp reciprocal(on)"); return x / y; } diff --git a/libclc/clc/lib/generic/math/clc_native_recip.inc b/libclc/clc/lib/generic/math/clc_native_recip.inc index 57eb35a9522f8..e7246dc08a77c 100644 --- a/libclc/clc/lib/generic/math/clc_native_recip.inc +++ b/libclc/clc/lib/generic/math/clc_native_recip.inc @@ -7,5 +7,6 @@ //===----------------------------------------------------------------------===// _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_recip(__CLC_GENTYPE val) { + _Pragma("clang fp reciprocal(on)"); return 1.0f / val; } diff --git a/libclc/clc/lib/generic/math/clc_native_rsqrt.inc b/libclc/clc/lib/generic/math/clc_native_rsqrt.inc index 7a3b0b2af2721..2b2c4bdada9f9 100644 --- a/libclc/clc/lib/generic/math/clc_native_rsqrt.inc +++ b/libclc/clc/lib/generic/math/clc_native_rsqrt.inc @@ -7,5 +7,6 @@ 
//===----------------------------------------------------------------------===// _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_rsqrt(__CLC_GENTYPE val) { + _Pragma("clang fp reciprocal(on)"); return 1.0f / __clc_native_sqrt(val); } diff --git a/libclc/clc/lib/generic/math/clc_native_tan.inc b/libclc/clc/lib/generic/math/clc_native_tan.inc index f61a78968a754..f0c6c6d37d2b7 100644 --- a/libclc/clc/lib/generic/math/clc_native_tan.inc +++ b/libclc/clc/lib/generic/math/clc_native_tan.inc @@ -7,5 +7,6 @@ //===----------------------------------------------------------------------===// _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_native_tan(__CLC_GENTYPE val) { + _Pragma("clang fp reciprocal(on)"); return __clc_native_sin(val) / __clc_native_cos(val); } From 45477add8dfe9851605697bd908b49f0ec244625 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Jul 2025 08:53:32 +0900 Subject: [PATCH 266/813] RuntimeLibcalls: Add methods to recognize libcall names (#149001) Also replace the current static DenseMap of preserved symbol names in the Symtab hack with this. That was broken statefulness across compiles, so this at least fixes that. However this is still broken, llvm-as shouldn't really depend on the triple. 
--- llvm/include/llvm/ADT/StringTable.h | 9 ++++ llvm/include/llvm/IR/RuntimeLibcalls.h | 12 +++++ llvm/lib/IR/RuntimeLibcalls.cpp | 45 +++++++++++++++++++ llvm/lib/Object/IRSymtab.cpp | 45 +++++++++---------- .../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 13 +++++- 5 files changed, 100 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/ADT/StringTable.h b/llvm/include/llvm/ADT/StringTable.h index c089a070d4b57..575b3c929e40c 100644 --- a/llvm/include/llvm/ADT/StringTable.h +++ b/llvm/include/llvm/ADT/StringTable.h @@ -118,6 +118,13 @@ class StringTable { constexpr Iterator(const Iterator &RHS) = default; constexpr Iterator(Iterator &&RHS) = default; + Iterator &operator=(const Iterator &RHS) { + Table = RHS.Table; + O = RHS.O; + S = RHS.S; + return *this; + } + bool operator==(const Iterator &RHS) const { assert(Table == RHS.Table && "Compared iterators for unrelated tables!"); return O == RHS.O; @@ -132,6 +139,8 @@ class StringTable { O = O.value() + (*Table)[O].size() + 1; return *this; } + + Offset offset() const { return O; } }; constexpr Iterator begin() const { return Iterator(*this, 0); } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 8058c8a4c5510..89ad4e5bc6ca4 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -132,6 +132,10 @@ struct RuntimeLibcallsInfo { return ImplToLibcall[Impl]; } + /// Check if this is valid libcall for the current module, otherwise + /// RTLIB::Unsupported. + RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const; + private: static const RTLIB::LibcallImpl DefaultLibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1]; @@ -156,6 +160,14 @@ struct RuntimeLibcallsInfo { /// Map from a concrete LibcallImpl implementation to its RTLIB::Libcall kind. LLVM_ABI static const RTLIB::Libcall ImplToLibcall[RTLIB::NumLibcallImpls]; + /// Check if a function name is a recognized runtime call of any kind. 
This + /// does not consider if this call is available for any current compilation, + /// just that it is a known call somewhere. This returns the set of all + /// LibcallImpls which match the name; multiple implementations with the same + /// name may exist but differ in interpretation based on the target context. + LLVM_ABI static iterator_range::const_iterator> + getRecognizedLibcallImpls(StringRef FuncName); + static bool darwinHasSinCosStret(const Triple &TT) { if (!TT.isOSDarwin()) return false; diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index b1864897dafa6..5936ac7d0287f 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -135,6 +135,51 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, } } +RTLIB::LibcallImpl +RuntimeLibcallsInfo::getSupportedLibcallImpl(StringRef FuncName) const { + const ArrayRef RuntimeLibcallNameOffsets( + RuntimeLibcallNameOffsetTable); + + iterator_range::const_iterator> Range = + getRecognizedLibcallImpls(FuncName); + + for (auto I = Range.begin(); I != Range.end(); ++I) { + RTLIB::LibcallImpl Impl = + static_cast(I - RuntimeLibcallNameOffsets.begin()); + + // FIXME: This should not depend on looking up ImplToLibcall, only the list + // of libcalls for the module. 
+ RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]]; + if (Recognized != RTLIB::Unsupported) + return Recognized; + } + + return RTLIB::Unsupported; +} + +iterator_range::const_iterator> +RuntimeLibcallsInfo::getRecognizedLibcallImpls(StringRef FuncName) { + StringTable::Iterator It = lower_bound(RuntimeLibcallImplNameTable, FuncName); + if (It == RuntimeLibcallImplNameTable.end() || *It != FuncName) + return iterator_range(ArrayRef()); + + uint16_t IndexVal = It.offset().value(); + const ArrayRef TableRef(RuntimeLibcallNameOffsetTable); + + ArrayRef::const_iterator E = TableRef.end(); + ArrayRef::const_iterator EntriesBegin = + std::lower_bound(TableRef.begin(), E, IndexVal); + ArrayRef::const_iterator EntriesEnd = EntriesBegin; + + while (EntriesEnd != E && *EntriesEnd == IndexVal) + ++EntriesEnd; + + assert(EntriesBegin != E && + "libcall found in name table but not offset table"); + + return make_range(EntriesBegin, EntriesEnd); +} + bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { switch (TT.getOS()) { case Triple::MacOSX: diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp index 2579fa37935f0..79eeb08cddeef 100644 --- a/llvm/lib/Object/IRSymtab.cpp +++ b/llvm/lib/Object/IRSymtab.cpp @@ -54,6 +54,11 @@ static const char *PreservedSymbols[] = { "__stack_chk_guard", }; +static bool isPreservedGlobalVarName(StringRef Name) { + return StringRef(PreservedSymbols[0]) == Name || + StringRef(PreservedSymbols[1]) == Name; +} + namespace { const char *getExpectedProducerName() { @@ -81,12 +86,16 @@ struct Builder { // The StringTableBuilder does not create a copy of any strings added to it, // so this provides somewhere to store any strings that we create. 
Builder(SmallVector &Symtab, StringTableBuilder &StrtabBuilder, - BumpPtrAllocator &Alloc) - : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {} + BumpPtrAllocator &Alloc, const Triple &TT) + : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc), TT(TT), + Libcalls(TT) {} DenseMap ComdatMap; Mangler Mang; - Triple TT; + const Triple &TT; + + // FIXME: This shouldn't be here. + RTLIB::RuntimeLibcallsInfo Libcalls; std::vector Comdats; std::vector Mods; @@ -98,6 +107,10 @@ struct Builder { std::vector DependentLibraries; + bool isPreservedLibFuncName(StringRef Name) { + return Libcalls.getSupportedLibcallImpl(Name) != RTLIB::Unsupported; + } + void setStr(storage::Str &S, StringRef Value) { S.Offset = StrtabBuilder.add(Value); S.Size = Value.size(); @@ -213,18 +226,6 @@ Expected Builder::getComdatIndex(const Comdat *C, const Module *M) { return P.first->second; } -static DenseSet buildPreservedSymbolsSet(const Triple &TT) { - DenseSet PreservedSymbolSet(std::begin(PreservedSymbols), - std::end(PreservedSymbols)); - // FIXME: Do we need to pass in ABI fields from TargetOptions? 
- RTLIB::RuntimeLibcallsInfo Libcalls(TT); - for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) { - if (Impl != RTLIB::Unsupported) - PreservedSymbolSet.insert(Libcalls.getLibcallImplName(Impl)); - } - return PreservedSymbolSet; -} - Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, const SmallPtrSet &Used, ModuleSymbolTable::Symbol Msym) { @@ -278,13 +279,11 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, return Error::success(); } - setStr(Sym.IRName, GV->getName()); - - static const DenseSet PreservedSymbolsSet = - buildPreservedSymbolsSet(GV->getParent()->getTargetTriple()); - bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName()); + StringRef GVName = GV->getName(); + setStr(Sym.IRName, GVName); - if (Used.count(GV) || IsPreservedSymbol) + if (Used.count(GV) || isPreservedLibFuncName(GVName) || + isPreservedGlobalVarName(GVName)) Sym.Flags |= 1 << storage::Symbol::FB_used; if (GV->isThreadLocal()) Sym.Flags |= 1 << storage::Symbol::FB_tls; @@ -351,7 +350,6 @@ Error Builder::build(ArrayRef IRMods) { setStr(Hdr.Producer, kExpectedProducerName); setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple().str()); setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName()); - TT = IRMods[0]->getTargetTriple(); for (auto *M : IRMods) if (Error Err = addModule(M)) @@ -377,7 +375,8 @@ Error Builder::build(ArrayRef IRMods) { Error irsymtab::build(ArrayRef Mods, SmallVector &Symtab, StringTableBuilder &StrtabBuilder, BumpPtrAllocator &Alloc) { - return Builder(Symtab, StrtabBuilder, Alloc).build(Mods); + const Triple &TT = Mods[0]->getTargetTriple(); + return Builder(Symtab, StrtabBuilder, Alloc, TT).build(Mods); } // Upgrade a vector of bitcode modules created by an old version of LLVM by diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp index 652bea9dc7f65..7f90d6b4fdacc 100644 --- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp +++ 
b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp @@ -236,8 +236,19 @@ class RuntimeLibcallEmitter { for (RuntimeLibcall &LibCall : RuntimeLibcallDefList) Def2RuntimeLibcall[LibCall.getDef()] = &LibCall; - ArrayRef AllRuntimeLibcallImpls = + ArrayRef AllRuntimeLibcallImplsRaw = Records.getAllDerivedDefinitions("RuntimeLibcallImpl"); + + SmallVector AllRuntimeLibcallImpls( + AllRuntimeLibcallImplsRaw); + + // Sort by libcall impl name, not the enum name. This keeps the order + // suitable for using the name table for libcall recognition binary search. + llvm::sort(AllRuntimeLibcallImpls, [](const Record *A, const Record *B) { + return A->getValueAsString("LibCallFuncName") < + B->getValueAsString("LibCallFuncName"); + }); + RuntimeLibcallImplDefList.reserve(AllRuntimeLibcallImpls.size()); size_t LibCallImplEnumVal = 1; From 547a49f6b6fd96deacd0234b19e59612c9034c8a Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Thu, 17 Jul 2025 16:53:50 -0700 Subject: [PATCH 267/813] [LangRef] fix non-existant `icmp gte` -> `icmp sge` (#149420) --- llvm/docs/LangRef.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 2759e18301d58..371f356c80b0a 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -4867,7 +4867,7 @@ to be eliminated. This is because '``poison``' is stronger than '``undef``'. 
%D = undef %E = icmp slt %D, 4 - %F = icmp gte %D, 4 + %F = icmp sge %D, 4 Safe: %A = undef From a131ce960332db56cd1729c4829bf2694ddcb3ea Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Jul 2025 09:03:37 +0900 Subject: [PATCH 268/813] AMDGPU: Handle av imm pseudo in si-fix-sgpr-copies phi fold (#149263) --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 1 + .../CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir | 56 +++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 9b5a46395695d..44d9ef5a0792e 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -378,6 +378,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy, default: return false; case AMDGPU::V_MOV_B32_e32: + case AMDGPU::AV_MOV_B32_IMM_PSEUDO: SMovOp = AMDGPU::S_MOV_B32; break; case AMDGPU::V_MOV_B64_PSEUDO: diff --git a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir index c2c5340639a16..8145a1d7a2072 100644 --- a/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir +++ b/llvm/test/CodeGen/AMDGPU/phi-vgpr-input-moveimm.mir @@ -167,3 +167,59 @@ body: | %1:sreg_32 = COPY %0 S_BRANCH %bb.2 ... 
+ +--- + +name: phi_moveimm_av_pseudo_input +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: phi_moveimm_av_pseudo_input + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $sgpr0, $sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[COPY1]], implicit-def $scc + ; GCN-NEXT: S_BRANCH %bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: successors: %bb.3(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI %5, %bb.3, [[S_ADD_U32_]], %bb.1 + ; GCN-NEXT: S_BRANCH %bb.3 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.3: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH %bb.2 + bb.0: + successors: %bb.1 + liveins: $sgpr0, $sgpr1 + + %0:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + + %4:sreg_32 = COPY $sgpr0 + %5:sreg_32 = COPY $sgpr1 + + bb.1: + successors: %bb.2 + %2:sreg_32 = S_ADD_U32 %4, %5, implicit-def $scc + S_BRANCH %bb.2 + + bb.2: + successors: %bb.3 + %3:sreg_32 = PHI %1, %bb.3, %2, %bb.1 + S_BRANCH %bb.3 + + bb.3: + successors: %bb.2 + %1:sreg_32 = COPY %0 + S_BRANCH %bb.2 +... From 3abecfe9e35ba79926e59dedb85174400f677a2d Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 18 Jul 2025 08:05:07 +0800 Subject: [PATCH 269/813] [NFC][libclc] Delete clc/include/clc/relational/floatn.inc (#149252) llvm-diff shows no change to amdgcn--amdhsa.bc. 
--- .../include/clc/relational/binary_decl.inc | 10 +- .../clc/include/clc/relational/clc_isfinite.h | 2 +- .../include/clc/relational/clc_isgreater.h | 2 +- .../clc/relational/clc_isgreaterequal.h | 2 +- .../clc/include/clc/relational/clc_isless.h | 2 +- .../include/clc/relational/clc_islessequal.h | 2 +- .../clc/relational/clc_islessgreater.h | 2 +- .../clc/include/clc/relational/clc_isnormal.h | 2 +- .../include/clc/relational/clc_isnotequal.h | 2 +- .../include/clc/relational/clc_isordered.h | 2 +- .../include/clc/relational/clc_isunordered.h | 2 +- .../clc/include/clc/relational/clc_signbit.h | 2 +- libclc/clc/include/clc/relational/floatn.inc | 132 ------------------ .../clc/include/clc/relational/unary_decl.inc | 10 +- .../include/clc/opencl/relational/isfinite.h | 2 +- .../include/clc/opencl/relational/isgreater.h | 2 +- .../clc/opencl/relational/isgreaterequal.h | 2 +- .../include/clc/opencl/relational/isless.h | 2 +- .../clc/opencl/relational/islessequal.h | 2 +- .../clc/opencl/relational/islessgreater.h | 2 +- .../include/clc/opencl/relational/isnormal.h | 2 +- .../clc/opencl/relational/isnotequal.h | 2 +- .../include/clc/opencl/relational/isordered.h | 2 +- .../clc/opencl/relational/isunordered.h | 2 +- .../include/clc/opencl/relational/signbit.h | 2 +- .../lib/generic/relational/binary_def.inc | 10 +- .../opencl/lib/generic/relational/isequal.cl | 2 +- .../opencl/lib/generic/relational/isfinite.cl | 2 +- .../lib/generic/relational/isgreater.cl | 2 +- .../lib/generic/relational/isgreaterequal.cl | 2 +- libclc/opencl/lib/generic/relational/isinf.cl | 2 +- .../opencl/lib/generic/relational/isless.cl | 2 +- .../lib/generic/relational/islessequal.cl | 2 +- .../lib/generic/relational/islessgreater.cl | 2 +- libclc/opencl/lib/generic/relational/isnan.cl | 2 +- .../opencl/lib/generic/relational/isnormal.cl | 2 +- .../lib/generic/relational/isnotequal.cl | 2 +- .../lib/generic/relational/isordered.cl | 2 +- .../lib/generic/relational/isunordered.cl | 2 +- 
.../opencl/lib/generic/relational/signbit.cl | 2 +- .../lib/generic/relational/unary_def.inc | 10 +- 41 files changed, 72 insertions(+), 172 deletions(-) delete mode 100644 libclc/clc/include/clc/relational/floatn.inc diff --git a/libclc/clc/include/clc/relational/binary_decl.inc b/libclc/clc/include/clc/relational/binary_decl.inc index bcdf5238b8f58..dc8ec9db7b7da 100644 --- a/libclc/clc/include/clc/relational/binary_decl.inc +++ b/libclc/clc/include/clc/relational/binary_decl.inc @@ -6,4 +6,12 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DECL __CLC_INTN FUNCTION(__CLC_FLOATN a, __CLC_FLOATN b); +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else +#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b); + +#undef __RETTYPE diff --git a/libclc/clc/include/clc/relational/clc_isfinite.h b/libclc/clc/include/clc/relational/clc_isfinite.h index 5e71ec7a0640a..444d733039819 100644 --- a/libclc/clc/include/clc/relational/clc_isfinite.h +++ b/libclc/clc/include/clc/relational/clc_isfinite.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isfinite #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isgreater.h b/libclc/clc/include/clc/relational/clc_isgreater.h index e2e6911a80cdd..88de46854961d 100644 --- a/libclc/clc/include/clc/relational/clc_isgreater.h +++ b/libclc/clc/include/clc/relational/clc_isgreater.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isgreater #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isgreaterequal.h b/libclc/clc/include/clc/relational/clc_isgreaterequal.h index 3fe8835aff9d5..42308036f102f 100644 --- a/libclc/clc/include/clc/relational/clc_isgreaterequal.h +++ b/libclc/clc/include/clc/relational/clc_isgreaterequal.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isgreaterequal #define __CLC_BODY -#include 
+#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isless.h b/libclc/clc/include/clc/relational/clc_isless.h index 01384cf6fa4a0..6fdc6c54947c0 100644 --- a/libclc/clc/include/clc/relational/clc_isless.h +++ b/libclc/clc/include/clc/relational/clc_isless.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isless #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_islessequal.h b/libclc/clc/include/clc/relational/clc_islessequal.h index a4b77a451b248..e592287b23099 100644 --- a/libclc/clc/include/clc/relational/clc_islessequal.h +++ b/libclc/clc/include/clc/relational/clc_islessequal.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_islessequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_islessgreater.h b/libclc/clc/include/clc/relational/clc_islessgreater.h index 9fb6d641bfa14..a2f10707a677d 100644 --- a/libclc/clc/include/clc/relational/clc_islessgreater.h +++ b/libclc/clc/include/clc/relational/clc_islessgreater.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_islessgreater #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isnormal.h b/libclc/clc/include/clc/relational/clc_isnormal.h index d580fed5a7395..2281bc4245d03 100644 --- a/libclc/clc/include/clc/relational/clc_isnormal.h +++ b/libclc/clc/include/clc/relational/clc_isnormal.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isnormal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isnotequal.h b/libclc/clc/include/clc/relational/clc_isnotequal.h index 16982fc3c5aaa..c2640fc0899a6 100644 --- a/libclc/clc/include/clc/relational/clc_isnotequal.h +++ b/libclc/clc/include/clc/relational/clc_isnotequal.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isnotequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isordered.h 
b/libclc/clc/include/clc/relational/clc_isordered.h index 7ba26662105fc..cb9be31311575 100644 --- a/libclc/clc/include/clc/relational/clc_isordered.h +++ b/libclc/clc/include/clc/relational/clc_isordered.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isordered #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_isunordered.h b/libclc/clc/include/clc/relational/clc_isunordered.h index eac158d245191..36d314ff0e1be 100644 --- a/libclc/clc/include/clc/relational/clc_isunordered.h +++ b/libclc/clc/include/clc/relational/clc_isunordered.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_isunordered #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/clc_signbit.h b/libclc/clc/include/clc/relational/clc_signbit.h index 892263a09e99c..9e423ab448953 100644 --- a/libclc/clc/include/clc/relational/clc_signbit.h +++ b/libclc/clc/include/clc/relational/clc_signbit.h @@ -12,7 +12,7 @@ #define FUNCTION __clc_signbit #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/clc/include/clc/relational/floatn.inc b/libclc/clc/include/clc/relational/floatn.inc deleted file mode 100644 index 263937f6eef6f..0000000000000 --- a/libclc/clc/include/clc/relational/floatn.inc +++ /dev/null @@ -1,132 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include - -#define __CLC_FLOATN float -#define __CLC_INTN int -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float2 -#define __CLC_INTN int2 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float3 -#define __CLC_INTN int3 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float4 -#define __CLC_INTN int4 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float8 -#define __CLC_INTN int8 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN float16 -#define __CLC_INTN int16 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#undef __CLC_FLOAT -#undef __CLC_INT - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -#define __CLC_FLOATN double -#define __CLC_INTN int -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double2 -#define __CLC_INTN long2 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double3 -#define __CLC_INTN long3 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double4 -#define __CLC_INTN long4 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double8 -#define __CLC_INTN long8 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN double16 -#define __CLC_INTN long16 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#endif -#ifdef cl_khr_fp16 -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -#define __CLC_FLOATN half -#define __CLC_INTN int -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half2 -#define __CLC_INTN short2 -#include __CLC_BODY -#undef __CLC_INTN -#undef 
__CLC_FLOATN - -#define __CLC_FLOATN half3 -#define __CLC_INTN short3 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half4 -#define __CLC_INTN short4 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half8 -#define __CLC_INTN short8 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#define __CLC_FLOATN half16 -#define __CLC_INTN short16 -#include __CLC_BODY -#undef __CLC_INTN -#undef __CLC_FLOATN - -#endif - -#undef __CLC_BODY diff --git a/libclc/clc/include/clc/relational/unary_decl.inc b/libclc/clc/include/clc/relational/unary_decl.inc index b9fb36c905469..cc3f2d065529b 100644 --- a/libclc/clc/include/clc/relational/unary_decl.inc +++ b/libclc/clc/include/clc/relational/unary_decl.inc @@ -6,4 +6,12 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DECL __CLC_INTN FUNCTION(__CLC_FLOATN x); +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else +#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE x); + +#undef __RETTYPE diff --git a/libclc/opencl/include/clc/opencl/relational/isfinite.h b/libclc/opencl/include/clc/opencl/relational/isfinite.h index 2548e6acf5109..ac3db6764073a 100644 --- a/libclc/opencl/include/clc/opencl/relational/isfinite.h +++ b/libclc/opencl/include/clc/opencl/relational/isfinite.h @@ -14,7 +14,7 @@ #define FUNCTION isfinite #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isgreater.h b/libclc/opencl/include/clc/opencl/relational/isgreater.h index 6dfe6eb810e2a..2230055115bcd 100644 --- a/libclc/opencl/include/clc/opencl/relational/isgreater.h +++ b/libclc/opencl/include/clc/opencl/relational/isgreater.h @@ -14,7 +14,7 @@ #define FUNCTION isgreater #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h 
b/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h index 1db2c5d58d062..f99a620dabd78 100644 --- a/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h +++ b/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h @@ -14,7 +14,7 @@ #define FUNCTION isgreaterequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isless.h b/libclc/opencl/include/clc/opencl/relational/isless.h index 3e2afb32cddf4..74280e543e0b5 100644 --- a/libclc/opencl/include/clc/opencl/relational/isless.h +++ b/libclc/opencl/include/clc/opencl/relational/isless.h @@ -14,7 +14,7 @@ #define FUNCTION isless #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/islessequal.h b/libclc/opencl/include/clc/opencl/relational/islessequal.h index 978e6a9052c16..dcc26c37b73c1 100644 --- a/libclc/opencl/include/clc/opencl/relational/islessequal.h +++ b/libclc/opencl/include/clc/opencl/relational/islessequal.h @@ -14,7 +14,7 @@ #define FUNCTION islessequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/islessgreater.h b/libclc/opencl/include/clc/opencl/relational/islessgreater.h index 56cce7db20770..15a1eb5577531 100644 --- a/libclc/opencl/include/clc/opencl/relational/islessgreater.h +++ b/libclc/opencl/include/clc/opencl/relational/islessgreater.h @@ -14,7 +14,7 @@ #define FUNCTION islessgreater #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isnormal.h b/libclc/opencl/include/clc/opencl/relational/isnormal.h index ee74a990b5eaf..bbb06aad0df2a 100644 --- a/libclc/opencl/include/clc/opencl/relational/isnormal.h +++ b/libclc/opencl/include/clc/opencl/relational/isnormal.h @@ -14,7 +14,7 @@ #define FUNCTION isnormal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isnotequal.h 
b/libclc/opencl/include/clc/opencl/relational/isnotequal.h index 7cf94e3ceec5f..c13aca8ef4be8 100644 --- a/libclc/opencl/include/clc/opencl/relational/isnotequal.h +++ b/libclc/opencl/include/clc/opencl/relational/isnotequal.h @@ -14,7 +14,7 @@ #define FUNCTION isnotequal #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isordered.h b/libclc/opencl/include/clc/opencl/relational/isordered.h index ad9770bd627f2..ea4ba3fa6fe8d 100644 --- a/libclc/opencl/include/clc/opencl/relational/isordered.h +++ b/libclc/opencl/include/clc/opencl/relational/isordered.h @@ -14,7 +14,7 @@ #define FUNCTION isordered #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/isunordered.h b/libclc/opencl/include/clc/opencl/relational/isunordered.h index 01d2f53837317..76bf85604d1c7 100644 --- a/libclc/opencl/include/clc/opencl/relational/isunordered.h +++ b/libclc/opencl/include/clc/opencl/relational/isunordered.h @@ -14,7 +14,7 @@ #define FUNCTION isunordered #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/relational/signbit.h b/libclc/opencl/include/clc/opencl/relational/signbit.h index 29591c0c126a9..6ad6595c7e294 100644 --- a/libclc/opencl/include/clc/opencl/relational/signbit.h +++ b/libclc/opencl/include/clc/opencl/relational/signbit.h @@ -14,7 +14,7 @@ #define FUNCTION signbit #define __CLC_BODY -#include +#include #undef FUNCTION diff --git a/libclc/opencl/lib/generic/relational/binary_def.inc b/libclc/opencl/lib/generic/relational/binary_def.inc index 54bb237b8f8f5..8416da0475a2c 100644 --- a/libclc/opencl/lib/generic/relational/binary_def.inc +++ b/libclc/opencl/lib/generic/relational/binary_def.inc @@ -10,6 +10,14 @@ #define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) -_CLC_OVERLOAD _CLC_DEF __CLC_INTN FUNCTION(__CLC_FLOATN a, __CLC_FLOATN b) { +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else 
+#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b) { return __IMPL_FUNCTION(FUNCTION)(a, b); } + +#undef __RETTYPE diff --git a/libclc/opencl/lib/generic/relational/isequal.cl b/libclc/opencl/lib/generic/relational/isequal.cl index 94f83f9452666..83002c28ceab3 100644 --- a/libclc/opencl/lib/generic/relational/isequal.cl +++ b/libclc/opencl/lib/generic/relational/isequal.cl @@ -12,4 +12,4 @@ #define FUNCTION isequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isfinite.cl b/libclc/opencl/lib/generic/relational/isfinite.cl index 695ffea806d5c..a2017133cead8 100644 --- a/libclc/opencl/lib/generic/relational/isfinite.cl +++ b/libclc/opencl/lib/generic/relational/isfinite.cl @@ -12,4 +12,4 @@ #define FUNCTION isfinite #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isgreater.cl b/libclc/opencl/lib/generic/relational/isgreater.cl index fb46ff20ac608..6eeb2b21c0493 100644 --- a/libclc/opencl/lib/generic/relational/isgreater.cl +++ b/libclc/opencl/lib/generic/relational/isgreater.cl @@ -12,4 +12,4 @@ #define FUNCTION isgreater #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isgreaterequal.cl b/libclc/opencl/lib/generic/relational/isgreaterequal.cl index b8edde2a05b77..e4e4535fd30d3 100644 --- a/libclc/opencl/lib/generic/relational/isgreaterequal.cl +++ b/libclc/opencl/lib/generic/relational/isgreaterequal.cl @@ -12,4 +12,4 @@ #define FUNCTION isgreaterequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isinf.cl b/libclc/opencl/lib/generic/relational/isinf.cl index 2c15f1f826762..2ab8c182e02a6 100644 --- a/libclc/opencl/lib/generic/relational/isinf.cl +++ b/libclc/opencl/lib/generic/relational/isinf.cl @@ -12,4 +12,4 @@ #define FUNCTION isinf #define __CLC_BODY 
"unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isless.cl b/libclc/opencl/lib/generic/relational/isless.cl index 0af1f53e71042..4212970e7671a 100644 --- a/libclc/opencl/lib/generic/relational/isless.cl +++ b/libclc/opencl/lib/generic/relational/isless.cl @@ -12,4 +12,4 @@ #define FUNCTION isless #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/islessequal.cl b/libclc/opencl/lib/generic/relational/islessequal.cl index 9e32afc718ab2..e7aec262fc762 100644 --- a/libclc/opencl/lib/generic/relational/islessequal.cl +++ b/libclc/opencl/lib/generic/relational/islessequal.cl @@ -12,4 +12,4 @@ #define FUNCTION islessequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/islessgreater.cl b/libclc/opencl/lib/generic/relational/islessgreater.cl index c36a857dc3dfc..b775d2484550c 100644 --- a/libclc/opencl/lib/generic/relational/islessgreater.cl +++ b/libclc/opencl/lib/generic/relational/islessgreater.cl @@ -12,4 +12,4 @@ #define FUNCTION islessgreater #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isnan.cl b/libclc/opencl/lib/generic/relational/isnan.cl index 8b03930c5312f..4b7eeb5b919b6 100644 --- a/libclc/opencl/lib/generic/relational/isnan.cl +++ b/libclc/opencl/lib/generic/relational/isnan.cl @@ -12,4 +12,4 @@ #define FUNCTION isnan #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isnormal.cl b/libclc/opencl/lib/generic/relational/isnormal.cl index 4ba21cc3e17fc..60ce9dccaeaf3 100644 --- a/libclc/opencl/lib/generic/relational/isnormal.cl +++ b/libclc/opencl/lib/generic/relational/isnormal.cl @@ -12,4 +12,4 @@ #define FUNCTION isnormal #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isnotequal.cl b/libclc/opencl/lib/generic/relational/isnotequal.cl index 
928923b9b2a5e..abb4d3a859663 100644 --- a/libclc/opencl/lib/generic/relational/isnotequal.cl +++ b/libclc/opencl/lib/generic/relational/isnotequal.cl @@ -12,4 +12,4 @@ #define FUNCTION isnotequal #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isordered.cl b/libclc/opencl/lib/generic/relational/isordered.cl index 60ca4d67ff1ea..684ee425e1203 100644 --- a/libclc/opencl/lib/generic/relational/isordered.cl +++ b/libclc/opencl/lib/generic/relational/isordered.cl @@ -12,4 +12,4 @@ #define FUNCTION isordered #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/isunordered.cl b/libclc/opencl/lib/generic/relational/isunordered.cl index 3392d77856ced..84aa8cafb111a 100644 --- a/libclc/opencl/lib/generic/relational/isunordered.cl +++ b/libclc/opencl/lib/generic/relational/isunordered.cl @@ -12,4 +12,4 @@ #define FUNCTION isunordered #define __CLC_BODY "binary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/signbit.cl b/libclc/opencl/lib/generic/relational/signbit.cl index 26feb8d43fa25..d30fea7b9f6f5 100644 --- a/libclc/opencl/lib/generic/relational/signbit.cl +++ b/libclc/opencl/lib/generic/relational/signbit.cl @@ -12,4 +12,4 @@ #define FUNCTION signbit #define __CLC_BODY "unary_def.inc" -#include +#include diff --git a/libclc/opencl/lib/generic/relational/unary_def.inc b/libclc/opencl/lib/generic/relational/unary_def.inc index 47bb33ef2da3d..f184e3cf0be56 100644 --- a/libclc/opencl/lib/generic/relational/unary_def.inc +++ b/libclc/opencl/lib/generic/relational/unary_def.inc @@ -10,6 +10,14 @@ #define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) -_CLC_OVERLOAD _CLC_DEF __CLC_INTN FUNCTION(__CLC_FLOATN a) { +#if __CLC_VECSIZE_OR_1 == 1 +#define __RETTYPE __CLC_INTN +#else +#define __RETTYPE __CLC_BIT_INTN +#endif + +_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE a) { return __IMPL_FUNCTION(FUNCTION)(a); } + +#undef __RETTYPE 
From 8813fc07f2a0d8f7a196ca0a64477b14feb2c166 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Jul 2025 09:05:26 +0900 Subject: [PATCH 270/813] github: Add libclc to PR autolabeler (#149274) --- .github/new-prs-labeler.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index 7905f762bf8e8..8e0fa8d42d735 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -48,6 +48,9 @@ flang:frontend: - flang/Evaluate/**/* - flang/Semantics/**/* +libclc: + - libclc/** + HLSL: - clang/*HLSL*/**/* - clang/**/*HLSL* From c0294f497d65da998e39882c234f157daecebfa8 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 18 Jul 2025 08:06:29 +0800 Subject: [PATCH 271/813] [libclc] Add generic implementation of bitfield_insert/extract,bit_reverse (#149070) The implementation is based on reference implementation in OpenCL-CTS/test_integer_ops. The generic implementations pass OpenCL-CTS/test_integer_ops tests on Intel GPU. 
--- .../clc/include/clc/integer/clc_bit_reverse.h | 19 +++++++++++++ .../clc/integer/clc_bitfield_extract_decl.inc | 10 +++++++ .../clc/integer/clc_bitfield_extract_signed.h | 23 ++++++++++++++++ .../integer/clc_bitfield_extract_unsigned.h | 23 ++++++++++++++++ .../include/clc/integer/clc_bitfield_insert.h | 18 +++++++++++++ .../clc/integer/clc_bitfield_insert.inc | 11 ++++++++ libclc/clc/lib/generic/SOURCES | 4 +++ .../lib/generic/integer/clc_bit_reverse.cl | 15 +++++++++++ .../integer/clc_bitfield_extract_signed.cl | 12 +++++++++ .../integer/clc_bitfield_extract_signed.inc | 19 +++++++++++++ .../integer/clc_bitfield_extract_unsigned.cl | 12 +++++++++ .../integer/clc_bitfield_extract_unsigned.inc | 16 +++++++++++ .../generic/integer/clc_bitfield_insert.cl | 12 +++++++++ .../generic/integer/clc_bitfield_insert.inc | 20 ++++++++++++++ .../include/clc/opencl/integer/bit_reverse.h | 25 +++++++++++++++++ .../opencl/integer/bitfield_extract_signed.h | 27 +++++++++++++++++++ .../integer/bitfield_extract_unsigned.h | 27 +++++++++++++++++++ .../clc/opencl/integer/bitfield_insert.h | 23 ++++++++++++++++ libclc/opencl/lib/generic/SOURCES | 4 +++ .../opencl/lib/generic/integer/bit_reverse.cl | 19 +++++++++++++ .../generic/integer/bitfield_extract_def.inc | 16 +++++++++++ .../integer/bitfield_extract_signed.cl | 20 ++++++++++++++ .../integer/bitfield_extract_unsigned.cl | 20 ++++++++++++++ .../lib/generic/integer/bitfield_insert.cl | 18 +++++++++++++ .../lib/generic/integer/bitfield_insert.inc | 13 +++++++++ 25 files changed, 426 insertions(+) create mode 100644 libclc/clc/include/clc/integer/clc_bit_reverse.h create mode 100644 libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc create mode 100644 libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h create mode 100644 libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h create mode 100644 libclc/clc/include/clc/integer/clc_bitfield_insert.h create mode 100644 
libclc/clc/include/clc/integer/clc_bitfield_insert.inc create mode 100644 libclc/clc/lib/generic/integer/clc_bit_reverse.cl create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_insert.cl create mode 100644 libclc/clc/lib/generic/integer/clc_bitfield_insert.inc create mode 100644 libclc/opencl/include/clc/opencl/integer/bit_reverse.h create mode 100644 libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h create mode 100644 libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h create mode 100644 libclc/opencl/include/clc/opencl/integer/bitfield_insert.h create mode 100644 libclc/opencl/lib/generic/integer/bit_reverse.cl create mode 100644 libclc/opencl/lib/generic/integer/bitfield_extract_def.inc create mode 100644 libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl create mode 100644 libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl create mode 100644 libclc/opencl/lib/generic/integer/bitfield_insert.cl create mode 100644 libclc/opencl/lib/generic/integer/bitfield_insert.inc diff --git a/libclc/clc/include/clc/integer/clc_bit_reverse.h b/libclc/clc/include/clc/integer/clc_bit_reverse.h new file mode 100644 index 0000000000000..c945e326c74fa --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bit_reverse.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_INTEGER_CLC_BIT_REVERSE_H__ +#define __CLC_INTEGER_CLC_BIT_REVERSE_H__ + +#define FUNCTION __clc_bit_reverse +#define __CLC_BODY + +#include + +#undef FUNCTION + +#endif // __CLC_INTEGER_CLC_BIT_REVERSE_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc b/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc new file mode 100644 index 0000000000000..c93eff08de0bc --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc @@ -0,0 +1,10 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE base, uint offset, + uint count); diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h b/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h new file mode 100644 index 0000000000000..9c2e047b8be00 --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ +#define __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ + +#include + +#define FUNCTION __clc_bitfield_extract_signed +#define __RETTYPE __CLC_S_GENTYPE + +#define __CLC_BODY +#include + +#undef __RETTYPE +#undef FUNCTION + +#endif // __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h b/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h new file mode 100644 index 0000000000000..95305a3027e5d --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ +#define __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ + +#include + +#define FUNCTION __clc_bitfield_extract_unsigned +#define __RETTYPE __CLC_U_GENTYPE + +#define __CLC_BODY +#include + +#undef __RETTYPE +#undef FUNCTION + +#endif // __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_insert.h b/libclc/clc/include/clc/integer/clc_bitfield_insert.h new file mode 100644 index 0000000000000..f4d36b2ad2d2e --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bitfield_insert.h @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_INTEGER_CLC_BITFIELD_INSERT_H__ +#define __CLC_INTEGER_CLC_BITFIELD_INSERT_H__ + +#include + +#define FUNCTION __clc_bitfield_insert +#define __CLC_BODY +#include + +#endif // __CLC_INTEGER_CLC_BITFIELD_INSERT_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_insert.inc b/libclc/clc/include/clc/integer/clc_bitfield_insert.inc new file mode 100644 index 0000000000000..22f58bdc09830 --- /dev/null +++ b/libclc/clc/include/clc/integer/clc_bitfield_insert.inc @@ -0,0 +1,11 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE base, + __CLC_GENTYPE insert, + uint offset, uint count); diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index bf8736a726315..9d792c4b3d28d 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -15,6 +15,10 @@ geometric/clc_normalize.cl integer/clc_abs.cl integer/clc_abs_diff.cl integer/clc_add_sat.cl +integer/clc_bitfield_extract_signed.cl +integer/clc_bitfield_extract_unsigned.cl +integer/clc_bitfield_insert.cl +integer/clc_bit_reverse.cl integer/clc_clz.cl integer/clc_ctz.cl integer/clc_hadd.cl diff --git a/libclc/clc/lib/generic/integer/clc_bit_reverse.cl b/libclc/clc/lib/generic/integer/clc_bit_reverse.cl new file mode 100644 index 0000000000000..439957383f583 --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bit_reverse.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_bit_reverse +#define __IMPL_FUNCTION(x) __builtin_elementwise_bitreverse +#define __CLC_BODY + +#include diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl new file mode 100644 index 0000000000000..d779ed6a43593 --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.cl @@ -0,0 +1,12 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc new file mode 100644 index 0000000000000..84cae2166f7ce --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_signed.inc @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DEF __CLC_S_GENTYPE +__clc_bitfield_extract_signed(__CLC_GENTYPE base, uint offset, uint count) { + if (count == 0) + return 0; + __CLC_U_GENTYPE x = __CLC_AS_U_GENTYPE(base) + << (__CLC_GENSIZE - offset - count); + // Implement an arithmetic shift right. + __CLC_U_GENTYPE s = -(x >> (__CLC_GENSIZE - 1)); + __CLC_U_GENTYPE result = ((s ^ x) >> (__CLC_GENSIZE - count)) ^ s; + return __CLC_AS_S_GENTYPE(result); +} diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl new file mode 100644 index 0000000000000..bf7db401034dc --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.cl @@ -0,0 +1,12 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc new file mode 100644 index 0000000000000..bc81ce5c98b09 --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bitfield_extract_unsigned.inc @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DEF __CLC_U_GENTYPE +__clc_bitfield_extract_unsigned(__CLC_GENTYPE base, uint offset, uint count) { + if (count == 0) + return 0; + __CLC_U_GENTYPE result = __CLC_AS_U_GENTYPE(base) + << (__CLC_GENSIZE - offset - count); + return result >> (__CLC_GENSIZE - count); +} diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_insert.cl b/libclc/clc/lib/generic/integer/clc_bitfield_insert.cl new file mode 100644 index 0000000000000..a40fc804f2187 --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bitfield_insert.cl @@ -0,0 +1,12 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/integer/clc_bitfield_insert.inc b/libclc/clc/lib/generic/integer/clc_bitfield_insert.inc new file mode 100644 index 0000000000000..ad8dac28750cc --- /dev/null +++ b/libclc/clc/lib/generic/integer/clc_bitfield_insert.inc @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_bitfield_insert(__CLC_GENTYPE base, + __CLC_GENTYPE insert, + uint offset, + uint count) { + __CLC_U_GENTYPE u_base = __CLC_AS_U_GENTYPE(base); + __CLC_U_GENTYPE u_insert = __CLC_AS_U_GENTYPE(insert); + __CLC_U_GENTYPE mask = (((__CLC_U_GENTYPE)1 << count) - (__CLC_U_GENTYPE)1) + << offset; + mask = count < __CLC_GENSIZE ? mask : ~(__CLC_U_GENTYPE)0; + __CLC_U_GENTYPE result = ((u_insert << offset) & mask) | (u_base & ~mask); + return __CLC_AS_GENTYPE(result); +} diff --git a/libclc/opencl/include/clc/opencl/integer/bit_reverse.h b/libclc/opencl/include/clc/opencl/integer/bit_reverse.h new file mode 100644 index 0000000000000..46b589557631d --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bit_reverse.h @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BIT_REVERSE_H__ +#define __CLC_OPENCL_INTEGER_BIT_REVERSE_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define FUNCTION bit_reverse +#define __CLC_BODY + +#include + +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BIT_REVERSE_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h new file mode 100644 index 0000000000000..0a902b2a21d6d --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_SIGNED_H__ +#define __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_SIGNED_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define FUNCTION bitfield_extract_signed +#define __RETTYPE __CLC_S_GENTYPE + +#define __CLC_BODY +#include + +#undef __RETTYPE +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_SIGNED_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h new file mode 100644 index 0000000000000..28064c08b113e --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_UNSIGNED_H__ +#define __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_UNSIGNED_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define FUNCTION bitfield_extract_unsigned +#define __RETTYPE __CLC_U_GENTYPE + +#define __CLC_BODY +#include + +#undef __RETTYPE +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BITFIELD_EXTRACT_UNSIGNED_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h b/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h new file mode 100644 index 0000000000000..e77d7a4f0b957 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_INTEGER_BITFIELD_INSERT_H__ +#define __CLC_OPENCL_INTEGER_BITFIELD_INSERT_H__ + +#ifdef cl_khr_extended_bit_ops + +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // cl_khr_extended_bit_ops + +#endif // __CLC_OPENCL_INTEGER_BITFIELD_INSERT_H__ diff --git a/libclc/opencl/lib/generic/SOURCES b/libclc/opencl/lib/generic/SOURCES index 46ce6d6e36c24..a59a82ee325ec 100644 --- a/libclc/opencl/lib/generic/SOURCES +++ b/libclc/opencl/lib/generic/SOURCES @@ -43,6 +43,10 @@ geometric/normalize.cl integer/abs.cl integer/abs_diff.cl integer/add_sat.cl +integer/bitfield_extract_signed.cl +integer/bitfield_extract_unsigned.cl +integer/bitfield_insert.cl +integer/bit_reverse.cl integer/clz.cl integer/ctz.cl integer/hadd.cl diff --git a/libclc/opencl/lib/generic/integer/bit_reverse.cl b/libclc/opencl/lib/generic/integer/bit_reverse.cl new file mode 100644 index 0000000000000..23181b6b3eba5 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bit_reverse.cl @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bit_reverse +#define __CLC_BODY + +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc b/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc new file mode 100644 index 0000000000000..0262f67732afc --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __IMPL_FUNCTION +#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#endif + +_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE base, uint offset, + uint count) { + return __IMPL_FUNCTION(FUNCTION)(base, offset, count); +} diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl b/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl new file mode 100644 index 0000000000000..eaa4ac779cfd1 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bitfield_extract_signed +#define __RETTYPE __CLC_S_GENTYPE + +#define __CLC_BODY +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl b/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl new file mode 100644 index 0000000000000..fd63d5d6dee30 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bitfield_extract_unsigned +#define __RETTYPE __CLC_U_GENTYPE + +#define __CLC_BODY +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_insert.cl b/libclc/opencl/lib/generic/integer/bitfield_insert.cl new file mode 100644 index 0000000000000..6b441155f393b --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_insert.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef cl_khr_extended_bit_ops + +#include +#include + +#define FUNCTION bitfield_insert +#define __CLC_BODY +#include + +#endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/lib/generic/integer/bitfield_insert.inc b/libclc/opencl/lib/generic/integer/bitfield_insert.inc new file mode 100644 index 0000000000000..b1f45907a4361 --- /dev/null +++ b/libclc/opencl/lib/generic/integer/bitfield_insert.inc @@ -0,0 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE bitfield_insert(__CLC_GENTYPE base, + __CLC_GENTYPE insert, + uint offset, uint count) { + return __clc_bitfield_insert(base, insert, offset, count); +} From 64205adc3bfdaab0f35f7909de59dadd20ef6e6a Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 18 Jul 2025 08:08:25 +0800 Subject: [PATCH 272/813] [SPIR-V] Map SPIR-V friendly work-item function to built-in variables (#148567) The mapping ensures the function is lowered to SPIR-V built-in variables in SPIR-V. 
This can fix pre-commit CI fail in https://github.com/intel/llvm/pull/19359 Also add BuiltIn to SPIR-V Builtin function name in __clang_spirv_builtins.h to align with https://github.com/llvm/llvm-project/blob/main/llvm/docs/SPIRVUsage.rst#builtin-variables --- clang/lib/Headers/__clang_spirv_builtins.h | 24 ++-- clang/test/Headers/spirv_ids.cpp | 104 +++++++++--------- llvm/lib/Target/SPIRV/SPIRVBuiltins.td | 27 ++++- .../SPIRV/transcoding/builtin_calls.ll | 100 ++++++++++++++++- 4 files changed, 186 insertions(+), 69 deletions(-) diff --git a/clang/lib/Headers/__clang_spirv_builtins.h b/clang/lib/Headers/__clang_spirv_builtins.h index 9915cdfcae7cd..9c7215f506508 100644 --- a/clang/lib/Headers/__clang_spirv_builtins.h +++ b/clang/lib/Headers/__clang_spirv_builtins.h @@ -52,30 +52,30 @@ // Builtin IDs and sizes extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_num_workgroups) __size_t - __spirv_NumWorkgroups(int); + __spirv_BuiltInNumWorkgroups(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_workgroup_size) __size_t - __spirv_WorkgroupSize(int); + __spirv_BuiltInWorkgroupSize(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_workgroup_id) __size_t - __spirv_WorkgroupId(int); + __spirv_BuiltInWorkgroupId(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_local_invocation_id) __size_t - __spirv_LocalInvocationId(int); + __spirv_BuiltInLocalInvocationId(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_invocation_id) __size_t - __spirv_GlobalInvocationId(int); + __spirv_BuiltInGlobalInvocationId(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_size) __size_t - __spirv_GlobalSize(int); + __spirv_BuiltInGlobalSize(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_global_offset) __size_t - __spirv_GlobalOffset(int); + __spirv_BuiltInGlobalOffset(int); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_size) __uint32_t - __spirv_SubgroupSize(); + __spirv_BuiltInSubgroupSize(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_max_size) __uint32_t - 
__spirv_SubgroupMaxSize(); + __spirv_BuiltInSubgroupMaxSize(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_num_subgroups) __uint32_t - __spirv_NumSubgroups(); + __spirv_BuiltInNumSubgroups(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_id) __uint32_t - __spirv_SubgroupId(); + __spirv_BuiltInSubgroupId(); extern __SPIRV_BUILTIN_ALIAS(__builtin_spirv_subgroup_local_invocation_id) - __uint32_t __spirv_SubgroupLocalInvocationId(); + __uint32_t __spirv_BuiltInSubgroupLocalInvocationId(); // OpGenericCastToPtrExplicit diff --git a/clang/test/Headers/spirv_ids.cpp b/clang/test/Headers/spirv_ids.cpp index 0cd74dbca53aa..466be5deee87a 100644 --- a/clang/test/Headers/spirv_ids.cpp +++ b/clang/test/Headers/spirv_ids.cpp @@ -53,58 +53,58 @@ // CHECK: call i32 @llvm.spv.subgroup.id() // CHECK: call i32 @llvm.spv.subgroup.local.invocation.id() -// NV: call noundef i64 @_Z21__spirv_NumWorkgroupsi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z21__spirv_NumWorkgroupsi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z21__spirv_NumWorkgroupsi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z21__spirv_WorkgroupSizei(i32 noundef 0) #2 -// NV: call noundef i64 @_Z21__spirv_WorkgroupSizei(i32 noundef 1) #2 -// NV: call noundef i64 @_Z21__spirv_WorkgroupSizei(i32 noundef 2) #2 -// NV: call noundef i64 @_Z19__spirv_WorkgroupIdi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z19__spirv_WorkgroupIdi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z19__spirv_WorkgroupIdi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z25__spirv_LocalInvocationIdi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z25__spirv_LocalInvocationIdi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z25__spirv_LocalInvocationIdi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z26__spirv_GlobalInvocationIdi(i32 noundef 0) #2 -// NV: call noundef i64 @_Z26__spirv_GlobalInvocationIdi(i32 noundef 1) #2 -// NV: call noundef i64 @_Z26__spirv_GlobalInvocationIdi(i32 noundef 2) #2 -// NV: call noundef i64 @_Z18__spirv_GlobalSizei(i32 
noundef 0) #2 -// NV: call noundef i64 @_Z18__spirv_GlobalSizei(i32 noundef 1) #2 -// NV: call noundef i64 @_Z18__spirv_GlobalSizei(i32 noundef 2) #2 -// NV: call noundef i64 @_Z20__spirv_GlobalOffseti(i32 noundef 0) #2 -// NV: call noundef i64 @_Z20__spirv_GlobalOffseti(i32 noundef 1) #2 -// NV: call noundef i64 @_Z20__spirv_GlobalOffseti(i32 noundef 2) #2 -// NV: call noundef i32 @_Z20__spirv_SubgroupSizev() #2 -// NV: call noundef i32 @_Z23__spirv_SubgroupMaxSizev() #2 -// NV: call noundef i32 @_Z20__spirv_NumSubgroupsv() #2 -// NV: call noundef i32 @_Z18__spirv_SubgroupIdv() #2 -// NV: call noundef i32 @_Z33__spirv_SubgroupLocalInvocationIdv() #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 noundef 0) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 noundef 1) #2 +// NV: call noundef i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 noundef 2) #2 +// NV: call noundef i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 noundef 0) #2 +// NV: call noundef i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 noundef 1) #2 +// NV: call noundef i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 noundef 2) #2 +// NV: call noundef i64 @_Z25__spirv_BuiltInGlobalSizei(i32 noundef 0) #2 +// NV: call noundef i64 @_Z25__spirv_BuiltInGlobalSizei(i32 noundef 1) #2 
+// NV: call noundef i64 @_Z25__spirv_BuiltInGlobalSizei(i32 noundef 2) #2 +// NV: call noundef i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 noundef 0) #2 +// NV: call noundef i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 noundef 1) #2 +// NV: call noundef i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 noundef 2) #2 +// NV: call noundef i32 @_Z27__spirv_BuiltInSubgroupSizev() #2 +// NV: call noundef i32 @_Z30__spirv_BuiltInSubgroupMaxSizev() #2 +// NV: call noundef i32 @_Z27__spirv_BuiltInNumSubgroupsv() #2 +// NV: call noundef i32 @_Z25__spirv_BuiltInSubgroupIdv() #2 +// NV: call noundef i32 @_Z40__spirv_BuiltInSubgroupLocalInvocationIdv() #2 void test_id_and_range() { - __spirv_NumWorkgroups(0); - __spirv_NumWorkgroups(1); - __spirv_NumWorkgroups(2); - __spirv_WorkgroupSize(0); - __spirv_WorkgroupSize(1); - __spirv_WorkgroupSize(2); - __spirv_WorkgroupId(0); - __spirv_WorkgroupId(1); - __spirv_WorkgroupId(2); - __spirv_LocalInvocationId(0); - __spirv_LocalInvocationId(1); - __spirv_LocalInvocationId(2); - __spirv_GlobalInvocationId(0); - __spirv_GlobalInvocationId(1); - __spirv_GlobalInvocationId(2); - __spirv_GlobalSize(0); - __spirv_GlobalSize(1); - __spirv_GlobalSize(2); - __spirv_GlobalOffset(0); - __spirv_GlobalOffset(1); - __spirv_GlobalOffset(2); - unsigned int ssize = __spirv_SubgroupSize(); - unsigned int smax = __spirv_SubgroupMaxSize(); - unsigned int snum = __spirv_NumSubgroups(); - unsigned int sid = __spirv_SubgroupId(); - unsigned int sinvocid = __spirv_SubgroupLocalInvocationId(); + __spirv_BuiltInNumWorkgroups(0); + __spirv_BuiltInNumWorkgroups(1); + __spirv_BuiltInNumWorkgroups(2); + __spirv_BuiltInWorkgroupSize(0); + __spirv_BuiltInWorkgroupSize(1); + __spirv_BuiltInWorkgroupSize(2); + __spirv_BuiltInWorkgroupId(0); + __spirv_BuiltInWorkgroupId(1); + __spirv_BuiltInWorkgroupId(2); + __spirv_BuiltInLocalInvocationId(0); + __spirv_BuiltInLocalInvocationId(1); + __spirv_BuiltInLocalInvocationId(2); + __spirv_BuiltInGlobalInvocationId(0); + 
__spirv_BuiltInGlobalInvocationId(1); + __spirv_BuiltInGlobalInvocationId(2); + __spirv_BuiltInGlobalSize(0); + __spirv_BuiltInGlobalSize(1); + __spirv_BuiltInGlobalSize(2); + __spirv_BuiltInGlobalOffset(0); + __spirv_BuiltInGlobalOffset(1); + __spirv_BuiltInGlobalOffset(2); + unsigned int ssize = __spirv_BuiltInSubgroupSize(); + unsigned int smax = __spirv_BuiltInSubgroupMaxSize(); + unsigned int snum = __spirv_BuiltInNumSubgroups(); + unsigned int sid = __spirv_BuiltInSubgroupId(); + unsigned int sinvocid = __spirv_BuiltInSubgroupLocalInvocationId(); } diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 6897865eb4e15..ea78dcd135267 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -1364,7 +1364,24 @@ defm : DemangledGetBuiltin<"get_sub_group_gt_mask", OpenCL_std, Variable, Subgro defm : DemangledGetBuiltin<"get_sub_group_le_mask", OpenCL_std, Variable, SubgroupLeMask>; defm : DemangledGetBuiltin<"get_sub_group_lt_mask", OpenCL_std, Variable, SubgroupLtMask>; defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalLinearId", OpenCL_std, Variable, GlobalLinearId>; -defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, Variable, GlobalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationIndex", OpenCL_std, Variable, LocalInvocationIndex>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkDim", OpenCL_std, Variable, WorkDim>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupSize", OpenCL_std, Variable, SubgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupMaxSize", OpenCL_std, Variable, SubgroupMaxSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInNumSubgroups", OpenCL_std, Variable, NumSubgroups>; +defm : DemangledGetBuiltin<"__spirv_BuiltInNumEnqueuedSubgroups", OpenCL_std, Variable, NumEnqueuedSubgroups>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupId", OpenCL_std, Variable, SubgroupId>; +defm : 
DemangledGetBuiltin<"__spirv_BuiltInSubgroupLocalInvocationId", OpenCL_std, Variable, SubgroupLocalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMask", OpenCL_std, Variable, SubgroupEqMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupEqMaskKHR", OpenCL_std, Variable, SubgroupEqMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMask", OpenCL_std, Variable, SubgroupGeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGeMaskKHR", OpenCL_std, Variable, SubgroupGeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMask", OpenCL_std, Variable, SubgroupGtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupGtMaskKHR", OpenCL_std, Variable, SubgroupGtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMask", OpenCL_std, Variable, SubgroupLeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLeMaskKHR", OpenCL_std, Variable, SubgroupLeMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMask", OpenCL_std, Variable, SubgroupLtMask>; +defm : DemangledGetBuiltin<"__spirv_BuiltInSubgroupLtMaskKHR", OpenCL_std, Variable, SubgroupLtMask>; // GetQuery builtin records: defm : DemangledGetBuiltin<"get_local_id", OpenCL_std, GetQuery, LocalInvocationId>; @@ -1375,6 +1392,14 @@ defm : DemangledGetBuiltin<"get_group_id", OpenCL_std, GetQuery, WorkgroupId>; defm : DemangledGetBuiltin<"get_enqueued_local_size", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>; defm : DemangledGetBuiltin<"get_num_groups", OpenCL_std, GetQuery, NumWorkgroups>; defm : DemangledGetBuiltin<"get_global_offset", OpenCL_std, GetQuery, GlobalOffset>; +defm : DemangledGetBuiltin<"__spirv_BuiltInLocalInvocationId", OpenCL_std, GetQuery, LocalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalInvocationId", OpenCL_std, GetQuery, GlobalInvocationId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupSize", OpenCL_std, GetQuery, WorkgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalSize", OpenCL_std, 
GetQuery, GlobalSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInWorkgroupId", OpenCL_std, GetQuery, WorkgroupId>; +defm : DemangledGetBuiltin<"__spirv_BuiltInEnqueuedWorkgroupSize", OpenCL_std, GetQuery, EnqueuedWorkgroupSize>; +defm : DemangledGetBuiltin<"__spirv_BuiltInNumWorkgroups", OpenCL_std, GetQuery, NumWorkgroups>; +defm : DemangledGetBuiltin<"__spirv_BuiltInGlobalOffset", OpenCL_std, GetQuery, GlobalOffset>; defm : DemangledGetBuiltin<"__hlsl_wave_get_lane_index", GLSL_std_450, Wave, SubgroupLocalInvocationId>; //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll index 0a02a8bf56ace..b179732371d97 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll @@ -1,17 +1,109 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; CHECK-SPIRV-DAG: OpDecorate %[[#Id:]] BuiltIn GlobalInvocationId -; CHECK-SPIRV-DAG: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId -; CHECK-SPIRV: %[[#Id:]] = OpVariable %[[#]] -; CHECK-SPIRV: %[[#Id:]] = OpVariable %[[#]] +; CHECK-SPIRV-DAG: OpDecorate %[[#Id0:]] BuiltIn GlobalLinearId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id1:]] BuiltIn GlobalInvocationId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id2:]] BuiltIn LocalInvocationIndex +; CHECK-SPIRV-DAG: OpDecorate %[[#Id3:]] BuiltIn WorkDim +; CHECK-SPIRV-DAG: OpDecorate %[[#Id4:]] BuiltIn SubgroupSize +; CHECK-SPIRV-DAG: OpDecorate %[[#Id5:]] BuiltIn SubgroupMaxSize +; CHECK-SPIRV-DAG: OpDecorate %[[#Id6:]] BuiltIn NumSubgroups +; CHECK-SPIRV-DAG: OpDecorate %[[#Id7:]] BuiltIn NumEnqueuedSubgroups +; CHECK-SPIRV-DAG: OpDecorate %[[#Id8:]] BuiltIn SubgroupId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id9:]] BuiltIn SubgroupLocalInvocationId 
+; CHECK-SPIRV-DAG: OpDecorate %[[#Id10:]] BuiltIn SubgroupEqMask +; CHECK-SPIRV-DAG: OpDecorate %[[#Id11:]] BuiltIn SubgroupGeMask +; CHECK-SPIRV-DAG: OpDecorate %[[#Id12:]] BuiltIn SubgroupGtMask +; CHECK-SPIRV-DAG: OpDecorate %[[#Id13:]] BuiltIn SubgroupLeMask +; CHECK-SPIRV-DAG: OpDecorate %[[#Id14:]] BuiltIn SubgroupLtMask +; CHECK-SPIRV-DAG: OpDecorate %[[#Id15:]] BuiltIn LocalInvocationId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id16:]] BuiltIn WorkgroupSize +; CHECK-SPIRV-DAG: OpDecorate %[[#Id17:]] BuiltIn GlobalSize +; CHECK-SPIRV-DAG: OpDecorate %[[#Id18:]] BuiltIn WorkgroupId +; CHECK-SPIRV-DAG: OpDecorate %[[#Id19:]] BuiltIn EnqueuedWorkgroupSize +; CHECK-SPIRV-DAG: OpDecorate %[[#Id20:]] BuiltIn NumWorkgroups +; CHECK-SPIRV-DAG: OpDecorate %[[#Id21:]] BuiltIn GlobalOffset + +; CHECK-SPIRV: %[[#Id0:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id1:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id2:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id3:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id4:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id5:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id6:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id7:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id8:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id9:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id10:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id11:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id12:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id13:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id14:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id15:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id16:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id17:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id18:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id19:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id20:]] = OpVariable %[[#]] Input +; CHECK-SPIRV: %[[#Id21:]] = OpVariable %[[#]] Input define spir_kernel void @f() 
{ entry: %0 = call spir_func i32 @_Z29__spirv_BuiltInGlobalLinearIdv() %1 = call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 1) + %2 = call spir_func i64 @_Z35__spirv_BuiltInLocalInvocationIndexv() + %3 = call spir_func i32 @_Z22__spirv_BuiltInWorkDimv() + %4 = call spir_func i32 @_Z27__spirv_BuiltInSubgroupSizev() + %5 = call spir_func i32 @_Z30__spirv_BuiltInSubgroupMaxSizev() + %6 = call spir_func i32 @_Z27__spirv_BuiltInNumSubgroupsv() + %7 = call spir_func i32 @_Z35__spirv_BuiltInNumEnqueuedSubgroupsv() + %8 = call spir_func i32 @_Z25__spirv_BuiltInSubgroupIdv() + %9 = call spir_func i32 @_Z40__spirv_BuiltInSubgroupLocalInvocationIdv() + %10 = call spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupEqMaskv() + %11 = call spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupEqMaskKHRv() + %12 = call spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupGeMaskv() + %13 = call spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupGeMaskKHRv() + %14 = call spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupGtMaskv() + %15 = call spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupGtMaskKHRv() + %16 = call spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupLeMaskv() + %17 = call spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupLeMaskKHRv() + %18 = call spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupLtMaskv() + %19 = call spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupLtMaskKHRv() + %20 = call spir_func i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32 0) + %21 = call spir_func i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32 0) + %22 = call spir_func i64 @_Z25__spirv_BuiltInGlobalSizei(i32 0) + %23 = call spir_func i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32 0) + %24 = call spir_func i64 @_Z36__spirv_BuiltInEnqueuedWorkgroupSizei(i32 0) + %25 = call spir_func i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32 0) + %26 = call spir_func i64 @_Z27__spirv_BuiltInGlobalOffseti(i32 0) + ret void } declare spir_func i32 @_Z29__spirv_BuiltInGlobalLinearIdv() declare spir_func i64 
@_Z33__spirv_BuiltInGlobalInvocationIdi(i32) +declare spir_func i64 @_Z35__spirv_BuiltInLocalInvocationIndexv() +declare spir_func i32 @_Z22__spirv_BuiltInWorkDimv() +declare spir_func i32 @_Z27__spirv_BuiltInSubgroupSizev() +declare spir_func i32 @_Z30__spirv_BuiltInSubgroupMaxSizev() +declare spir_func i32 @_Z27__spirv_BuiltInNumSubgroupsv() +declare spir_func i32 @_Z35__spirv_BuiltInNumEnqueuedSubgroupsv() +declare spir_func i32 @_Z25__spirv_BuiltInSubgroupIdv() +declare spir_func i32 @_Z40__spirv_BuiltInSubgroupLocalInvocationIdv() +declare spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupEqMaskv() +declare spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupEqMaskKHRv() +declare spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupGeMaskv() +declare spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupGeMaskKHRv() +declare spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupGtMaskv() +declare spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupGtMaskKHRv() +declare spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupLeMaskv() +declare spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupLeMaskKHRv() +declare spir_func <4 x i32> @_Z29__spirv_BuiltInSubgroupLtMaskv() +declare spir_func <4 x i32> @_Z32__spirv_BuiltInSubgroupLtMaskKHRv() +declare spir_func i64 @_Z32__spirv_BuiltInLocalInvocationIdi(i32) +declare spir_func i64 @_Z28__spirv_BuiltInWorkgroupSizei(i32) +declare spir_func i64 @_Z25__spirv_BuiltInGlobalSizei(i32) +declare spir_func i64 @_Z26__spirv_BuiltInWorkgroupIdi(i32) +declare spir_func i64 @_Z36__spirv_BuiltInEnqueuedWorkgroupSizei(i32) +declare spir_func i64 @_Z28__spirv_BuiltInNumWorkgroupsi(i32) +declare spir_func i64 @_Z27__spirv_BuiltInGlobalOffseti(i32) From 9c26f37ce34de3be3550cbd67fcea534721724fd Mon Sep 17 00:00:00 2001 From: Wenju He Date: Fri, 18 Jul 2025 08:09:14 +0800 Subject: [PATCH 273/813] [libclc] Add generic implementation of some atomic functions in OpenCL spec section 6.15.12.7 (#146814) Add corresponding clc functions, which are implemented with clang 
__scoped_atomic builtins. OpenCL functions are implemented as a wrapper over clc functions. Also change legacy atomic_inc and atomic_dec to re-use the newly added clc_atomic_inc/dec implementations. llvm-diff only no change to atomic_inc and atomic_dec in bitcode. Notes: * Generic OpenCL built-ins functions uses __ATOMIC_SEQ_CST and __MEMORY_SCOPE_DEVICE for memory order and memory scope parameters. * OpenCL atomic_*_explicit, atomic_flag* built-ins are not implemented yet. * OpenCL built-ins of atomic_intptr_t, atomic_uintptr_t, atomic_size_t and atomic_ptrdiff_t types are not implemented yet. * llvm-diff shows no change to nvptx64--nvidiacl.bc and amdgcn--amdhsa.bc since __opencl_c_atomic_order_seq_cst and __opencl_c_atomic_scope_device are not defined in these two targets. --- libclc/clc/include/clc/atomic/atomic_decl.inc | 47 +++++++++++ .../clc/atomic/clc_atomic_compare_exchange.h | 26 ++++++ .../clc/include/clc/atomic/clc_atomic_dec.h | 23 ++++++ .../include/clc/atomic/clc_atomic_exchange.h | 24 ++++++ .../include/clc/atomic/clc_atomic_fetch_add.h | 24 ++++++ .../include/clc/atomic/clc_atomic_fetch_and.h | 21 +++++ .../include/clc/atomic/clc_atomic_fetch_max.h | 24 ++++++ .../include/clc/atomic/clc_atomic_fetch_min.h | 24 ++++++ .../include/clc/atomic/clc_atomic_fetch_or.h | 21 +++++ .../include/clc/atomic/clc_atomic_fetch_sub.h | 24 ++++++ .../include/clc/atomic/clc_atomic_fetch_xor.h | 21 +++++ .../clc/include/clc/atomic/clc_atomic_inc.h | 23 ++++++ .../clc/include/clc/atomic/clc_atomic_load.h | 26 ++++++ .../clc/include/clc/atomic/clc_atomic_store.h | 26 ++++++ libclc/clc/lib/generic/SOURCES | 13 +++ .../atomic/clc_atomic_compare_exchange.cl | 15 ++++ .../atomic/clc_atomic_compare_exchange.inc | 64 +++++++++++++++ .../clc/lib/generic/atomic/clc_atomic_dec.cl | 16 ++++ .../clc/lib/generic/atomic/clc_atomic_def.inc | 79 +++++++++++++++++++ .../lib/generic/atomic/clc_atomic_exchange.cl | 23 ++++++ .../generic/atomic/clc_atomic_fetch_add.cl | 18 +++++ 
.../generic/atomic/clc_atomic_fetch_and.cl | 15 ++++ .../generic/atomic/clc_atomic_fetch_max.cl | 18 +++++ .../generic/atomic/clc_atomic_fetch_min.cl | 18 +++++ .../lib/generic/atomic/clc_atomic_fetch_or.cl | 15 ++++ .../generic/atomic/clc_atomic_fetch_sub.cl | 18 +++++ .../generic/atomic/clc_atomic_fetch_xor.cl | 15 ++++ .../clc/lib/generic/atomic/clc_atomic_inc.cl | 16 ++++ .../clc/lib/generic/atomic/clc_atomic_load.cl | 24 ++++++ .../lib/generic/atomic/clc_atomic_store.cl | 22 ++++++ .../include/clc/opencl/atomic/atomic_add.h | 2 +- .../include/clc/opencl/atomic/atomic_and.h | 2 +- .../atomic/atomic_compare_exchange_strong.h | 24 ++++++ .../atomic/atomic_compare_exchange_weak.h | 24 ++++++ .../include/clc/opencl/atomic/atomic_decl.inc | 58 +++++++++++--- .../clc/opencl/atomic/atomic_decl_legacy.inc | 22 ++++++ .../clc/opencl/atomic/atomic_exchange.h | 22 ++++++ .../clc/opencl/atomic/atomic_fetch_add.h | 22 ++++++ .../clc/opencl/atomic/atomic_fetch_and.h | 19 +++++ .../clc/opencl/atomic/atomic_fetch_max.h | 22 ++++++ .../clc/opencl/atomic/atomic_fetch_min.h | 22 ++++++ .../clc/opencl/atomic/atomic_fetch_or.h | 19 +++++ .../clc/opencl/atomic/atomic_fetch_sub.h | 22 ++++++ .../clc/opencl/atomic/atomic_fetch_xor.h | 19 +++++ .../include/clc/opencl/atomic/atomic_load.h | 24 ++++++ .../include/clc/opencl/atomic/atomic_max.h | 2 +- .../include/clc/opencl/atomic/atomic_min.h | 2 +- .../include/clc/opencl/atomic/atomic_or.h | 2 +- .../include/clc/opencl/atomic/atomic_store.h | 24 ++++++ .../include/clc/opencl/atomic/atomic_sub.h | 2 +- .../include/clc/opencl/atomic/atomic_xchg.h | 2 +- .../include/clc/opencl/atomic/atomic_xor.h | 2 +- libclc/opencl/lib/generic/SOURCES | 26 ++++-- .../atomic/atomic_compare_exchange_strong.cl | 25 ++++++ .../atomic/atomic_compare_exchange_weak.cl | 25 ++++++ .../opencl/lib/generic/atomic/atomic_dec.cl | 14 ++-- .../opencl/lib/generic/atomic/atomic_def.inc | 79 +++++++++++++++++++ .../lib/generic/atomic/atomic_exchange.cl | 25 ++++++ 
.../lib/generic/atomic/atomic_fetch_add.cl | 25 ++++++ .../lib/generic/atomic/atomic_fetch_and.cl | 22 ++++++ .../lib/generic/atomic/atomic_fetch_max.cl | 25 ++++++ .../lib/generic/atomic/atomic_fetch_min.cl | 25 ++++++ .../lib/generic/atomic/atomic_fetch_or.cl | 22 ++++++ .../lib/generic/atomic/atomic_fetch_sub.cl | 25 ++++++ .../lib/generic/atomic/atomic_fetch_xor.cl | 22 ++++++ .../opencl/lib/generic/atomic/atomic_inc.cl | 14 ++-- .../lib/generic/atomic/atomic_inc_dec.inc | 26 ++++++ .../opencl/lib/generic/atomic/atomic_load.cl | 26 ++++++ .../opencl/lib/generic/atomic/atomic_store.cl | 26 ++++++ 69 files changed, 1511 insertions(+), 43 deletions(-) create mode 100644 libclc/clc/include/clc/atomic/atomic_decl.inc create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_dec.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_exchange.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_inc.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_load.h create mode 100644 libclc/clc/include/clc/atomic/clc_atomic_store.h create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_dec.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_def.inc create mode 100644 
libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_inc.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_load.cl create mode 100644 libclc/clc/lib/generic/atomic/clc_atomic_store.cl create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_load.h create mode 100644 libclc/opencl/include/clc/opencl/atomic/atomic_store.h create mode 100644 libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl create mode 100644 libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl create mode 100644 
libclc/opencl/lib/generic/atomic/atomic_def.inc create mode 100644 libclc/opencl/lib/generic/atomic/atomic_exchange.cl create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl create mode 100644 libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl create mode 100644 libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc create mode 100644 libclc/opencl/lib/generic/atomic/atomic_load.cl create mode 100644 libclc/opencl/lib/generic/atomic/atomic_store.cl diff --git a/libclc/clc/include/clc/atomic/atomic_decl.inc b/libclc/clc/include/clc/atomic/atomic_decl.inc new file mode 100644 index 0000000000000..b790a94c7d288 --- /dev/null +++ b/libclc/clc/include/clc/atomic/atomic_decl.inc @@ -0,0 +1,47 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// MemoryOrder is memory order supported by Clang __scoped_atomic* builtins. +// MemoryScope is memory scope supported by Clang __scoped_atomic* builtins. 
+ +#ifdef __CLC_SCALAR +#if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) + +#ifdef __CLC_NO_VALUE_ARG +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ + int MemoryScope); +#elif defined(__CLC_RETURN_VOID) +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL void FUNCTION(volatile ADDRSPACE __CLC_GENTYPE *Ptr, \ + __CLC_GENTYPE Value, int MemoryOrder, \ + int MemoryScope); +#elif defined(__CLC_COMPARE_EXCHANGE) +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator, \ + __CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal, \ + int MemoryScope); +#else +#define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ + int MemoryOrder, int MemoryScope); +#endif + +__CLC_DECLARE_ATOMIC(global) +__CLC_DECLARE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DECLARE_ATOMIC() +#endif + +#undef __CLC_DECLARE_ATOMIC + +#endif // defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) +#endif // __CLC_SCALAR diff --git a/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h b/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h new file mode 100644 index 0000000000000..ae7918ac32e43 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_COMPARE_EXCHANGE_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_COMPARE_EXCHANGE_H__ + +#include + +#define FUNCTION __clc_atomic_compare_exchange +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_COMPARE_EXCHANGE +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_COMPARE_EXCHANGE_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_dec.h b/libclc/clc/include/clc/atomic/clc_atomic_dec.h new file mode 100644 index 0000000000000..ada36ba3ff9b3 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_dec.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_DEC_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_DEC_H__ + +#include + +#define FUNCTION __clc_atomic_dec +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_DEC_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_exchange.h b/libclc/clc/include/clc/atomic/clc_atomic_exchange.h new file mode 100644 index 0000000000000..7e626d4a8830b --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_exchange.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_EXCHANGE_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_EXCHANGE_H__ + +#include + +#define FUNCTION __clc_atomic_exchange + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_EXCHANGE_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h new file mode 100644 index 0000000000000..ad0c2eb4607a7 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_ADD_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_ADD_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_ADD_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h new file mode 100644 index 0000000000000..80810c38cbbb8 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_AND_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_AND_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_and + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_AND_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h new file mode 100644 index 0000000000000..56f511922e5c7 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_MAX_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_MAX_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_MAX_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h new file mode 100644 index 0000000000000..f17408d28a35d --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_MIN_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_MIN_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_MIN_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h new file mode 100644 index 0000000000000..b82069e6f960e --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_OR_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_OR_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_or + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_OR_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h new file mode 100644 index 0000000000000..6cfd224629d60 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_SUB_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_SUB_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_SUB_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h new file mode 100644 index 0000000000000..b007b47a9369d --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_FETCH_XOR_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_FETCH_XOR_H__ + +#include + +#define FUNCTION __clc_atomic_fetch_xor + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_XOR_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_inc.h b/libclc/clc/include/clc/atomic/clc_atomic_inc.h new file mode 100644 index 0000000000000..3ddef4a8bf355 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_inc.h @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_INC_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_INC_H__ + +#include + +#define FUNCTION __clc_atomic_inc +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_INC_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_load.h b/libclc/clc/include/clc/atomic/clc_atomic_load.h new file mode 100644 index 0000000000000..a4899b34b88a1 --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_load.h @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_LOAD_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_LOAD_H__ + +#include + +#define FUNCTION __clc_atomic_load +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_LOAD_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_store.h b/libclc/clc/include/clc/atomic/clc_atomic_store.h new file mode 100644 index 0000000000000..6baf0eb7ea32b --- /dev/null +++ b/libclc/clc/include/clc/atomic/clc_atomic_store.h @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_ATOMIC_CLC_ATOMIC_STORE_H__ +#define __CLC_ATOMIC_CLC_ATOMIC_STORE_H__ + +#include + +#define FUNCTION __clc_atomic_store +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_RETURN_VOID +#undef FUNCTION + +#endif // __CLC_ATOMIC_CLC_ATOMIC_STORE_H__ diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index 9d792c4b3d28d..ee4f771799e8e 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -1,4 +1,17 @@ async/clc_prefetch.cl +atomic/clc_atomic_compare_exchange.cl +atomic/clc_atomic_dec.cl +atomic/clc_atomic_exchange.cl +atomic/clc_atomic_fetch_add.cl +atomic/clc_atomic_fetch_and.cl +atomic/clc_atomic_fetch_max.cl +atomic/clc_atomic_fetch_min.cl +atomic/clc_atomic_fetch_or.cl +atomic/clc_atomic_fetch_sub.cl +atomic/clc_atomic_fetch_xor.cl +atomic/clc_atomic_inc.cl +atomic/clc_atomic_load.cl +atomic/clc_atomic_store.cl common/clc_degrees.cl common/clc_radians.cl common/clc_sign.cl diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl new file mode 100644 index 0000000000000..796dedcef3857 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc new file mode 100644 index 0000000000000..32ff9b45b769e --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_compare_exchange.inc @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR + +#if defined(__SPIR32__) || defined(CLC_NVPTX) +#if (defined(__CLC_FPSIZE) && __CLC_FPSIZE <= 32) || \ + (defined(__CLC_GENSIZE) && (__CLC_GENSIZE == 32)) +#define __CLC_HAS_ATOMIC +#endif +#else // defined(__SPIR32__) || defined(CLC_NVPTX) +#if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) +#define __CLC_HAS_ATOMIC +#endif +#endif // defined(__SPIR32__) || defined(CLC_NVPTX) + +#ifdef __CLC_HAS_ATOMIC + +#ifdef __CLC_FPSIZE + +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __clc_atomic_compare_exchange( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator, \ + __CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal, \ + int MemoryScope) { \ + __CLC_U_GENTYPE Comp = __CLC_AS_U_GENTYPE(Comparator); \ + __scoped_atomic_compare_exchange_n( \ + (ADDRSPACE __CLC_U_GENTYPE *)Ptr, &Comp, __CLC_AS_U_GENTYPE(Value), \ + false, MemoryOrderEqual, MemoryOrderUnequal, MemoryScope); \ + return __CLC_AS_GENTYPE(Comp); \ + } + +#else + +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE 
__clc_atomic_compare_exchange( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator, \ + __CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal, \ + int MemoryScope) { \ + __scoped_atomic_compare_exchange_n(Ptr, &Comparator, Value, false, \ + MemoryOrderEqual, MemoryOrderUnequal, \ + MemoryScope); \ + return Comparator; \ + } + +#endif // __CLC_FPSIZE + +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DEFINE_ATOMIC() +#endif + +#undef __CLC_DEFINE_ATOMIC + +#endif // __CLC_HAS_ATOMIC +#undef __CLC_HAS_ATOMIC + +#endif // __CLC_SCALAR diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl b/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl new file mode 100644 index 0000000000000..f35a9624fd013 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_dec +#define __IMPL_FUNCTION __scoped_atomic_fetch_add +#define __CLC_INC_DEC + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc new file mode 100644 index 0000000000000..2c45f49f60848 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR + +#if defined(__SPIR32__) || defined(CLC_NVPTX) +#if (defined(__CLC_FPSIZE) && __CLC_FPSIZE <= 32) || \ + (defined(__CLC_GENSIZE) && (__CLC_GENSIZE == 32)) +#define __CLC_HAS_ATOMIC +#endif +#else // defined(__SPIR32__) || defined(CLC_NVPTX) +#if defined(__CLC_FPSIZE) || (__CLC_GENSIZE >= 32) +#define __CLC_HAS_ATOMIC +#endif +#endif // defined(__SPIR32__) || defined(CLC_NVPTX) + +#ifdef __CLC_HAS_ATOMIC + +#ifndef __CLC_PTR_CASTTYPE +#define __CLC_PTR_CASTTYPE __CLC_GENTYPE +#endif + +#ifndef __CLC_AS_RETTYPE +#define __CLC_AS_RETTYPE(x) x +#endif + +#ifdef __CLC_NO_VALUE_ARG +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ + int MemoryScope) { \ + return __CLC_AS_RETTYPE(__IMPL_FUNCTION( \ + (ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, MemoryOrder, MemoryScope)); \ + } +#elif defined(__CLC_INC_DEC) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ + int MemoryScope) { \ + return __CLC_AS_RETTYPE( \ + __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, (__CLC_GENTYPE)1, \ + MemoryOrder, MemoryScope)); \ + } +#elif defined(__CLC_RETURN_VOID) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL void FUNCTION(volatile ADDRSPACE __CLC_GENTYPE *Ptr, \ + __CLC_GENTYPE Value, int MemoryOrder, \ + int MemoryScope) { \ + __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, MemoryOrder, \ + MemoryScope); \ + } +#else +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ + int MemoryOrder, int MemoryScope) { \ + return __CLC_AS_RETTYPE( \ + __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE 
*)Ptr, Value, \ + MemoryOrder, MemoryScope)); \ + } +#endif + +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DEFINE_ATOMIC() +#endif + +#undef __CLC_DEFINE_ATOMIC + +#endif // __CLC_HAS_ATOMIC +#undef __CLC_HAS_ATOMIC + +#endif // __CLC_SCALAR diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl new file mode 100644 index 0000000000000..52fd11afed6a2 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_exchange +#define __IMPL_FUNCTION __scoped_atomic_exchange_n + +#define __CLC_BODY +#include + +#undef __CLC_PTR_CASTTYPE +#undef __CLC_AS_RETTYPE +#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x) + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl new file mode 100644 index 0000000000000..0dc44919627b3 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_add +#define __IMPL_FUNCTION __scoped_atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl new file mode 100644 index 0000000000000..ec89738bc0f62 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_and +#define __IMPL_FUNCTION __scoped_atomic_fetch_and + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl new file mode 100644 index 0000000000000..0acac711aa96d --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_max +#define __IMPL_FUNCTION __scoped_atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl new file mode 100644 index 0000000000000..7a098588ec005 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_min +#define __IMPL_FUNCTION __scoped_atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl new file mode 100644 index 0000000000000..e0f48fa408350 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_or +#define __IMPL_FUNCTION __scoped_atomic_fetch_or + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl new file mode 100644 index 0000000000000..a4c2c1da1555c --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_sub +#define __IMPL_FUNCTION __scoped_atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl new file mode 100644 index 0000000000000..4424a298178fd --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl @@ -0,0 +1,15 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_fetch_xor +#define __IMPL_FUNCTION __scoped_atomic_fetch_xor + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl b/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl new file mode 100644 index 0000000000000..019aa8d9d6dd8 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl @@ -0,0 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_inc +#define __IMPL_FUNCTION __scoped_atomic_fetch_sub +#define __CLC_INC_DEC + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl new file mode 100644 index 0000000000000..1f083073e43ff --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_load +#define __IMPL_FUNCTION __scoped_atomic_load_n +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#undef __CLC_PTR_CASTTYPE +#undef __CLC_AS_RETTYPE +#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN +#define __CLC_AS_RETTYPE(x) __CLC_AS_GENTYPE(x) + +#define __CLC_BODY +#include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl new file mode 100644 index 0000000000000..8fd165b9a83b8 --- /dev/null +++ b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#define FUNCTION __clc_atomic_store +#define __IMPL_FUNCTION __scoped_atomic_store_n +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#undef __CLC_PTR_CASTTYPE +#define __CLC_PTR_CASTTYPE __CLC_BIT_INTN + +#define __CLC_BODY +#include diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_add.h b/libclc/opencl/include/clc/opencl/atomic/atomic_add.h index 821ae7aab05bf..50fb99d1362fc 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_add.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_add.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_add -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_ADD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_and.h b/libclc/opencl/include/clc/opencl/atomic/atomic_and.h index d10cfed9b581a..8ce328c9739aa 100644 --- 
a/libclc/opencl/include/clc/opencl/atomic/atomic_and.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_and.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_and -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_AND_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h new file mode 100644 index 0000000000000..76eeda7ba3469 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__ + +#define FUNCTION atomic_compare_exchange_strong +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_COMPARE_EXCHANGE +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h new file mode 100644 index 0000000000000..12788ad03a2d1 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__ + +#define FUNCTION atomic_compare_exchange_weak +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_COMPARE_EXCHANGE +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc b/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc index e060e3aaea161..1b2bf17bd6dfd 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc @@ -6,17 +6,55 @@ // //===----------------------------------------------------------------------===// -#define __CLC_DECLARE_ATOMIC(ADDRSPACE, TYPE) \ - _CLC_OVERLOAD _CLC_DECL TYPE FUNCTION(volatile ADDRSPACE TYPE *, TYPE); +#ifdef __CLC_SCALAR -#define __CLC_DECLARE_ATOMIC_ADDRSPACE(TYPE) \ - __CLC_DECLARE_ATOMIC(global, TYPE) \ - __CLC_DECLARE_ATOMIC(local, TYPE) +#if defined(__opencl_c_fp64) && (defined(cl_khr_int64_base_atomics) && \ + defined(cl_khr_int64_extended_atomics)) +#define HAVE_64_ATOMIC +#endif +#if defined(__CLC_FPSIZE) && (__CLC_FPSIZE < 64 || defined(HAVE_64_ATOMIC)) +#define HAVE_FP_ATOMIC +#endif +#if defined(__CLC_GENSIZE) && \ + ((__CLC_GENSIZE == 32) || \ + (__CLC_GENSIZE == 64 && defined(HAVE_64_ATOMIC))) +#define HAVE_INT_ATOMIC +#endif +#if defined(HAVE_FP_ATOMIC) || defined(HAVE_INT_ATOMIC) -__CLC_DECLARE_ATOMIC_ADDRSPACE(int) -__CLC_DECLARE_ATOMIC_ADDRSPACE(uint) +#define __CLC_ATOMIC_GENTYPE __CLC_XCONCAT(atomic_, __CLC_GENTYPE) -#undef __CLC_DECLARE_ATOMIC_ADDRSPACE -#undef __CLC_DECLARE_ATOMIC +#ifdef __CLC_NO_VALUE_ARG +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile 
ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr); +#elif defined(__CLC_RETURN_VOID) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL void FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value); +#elif defined(__CLC_COMPARE_EXCHANGE) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, \ + ADDRSPACE __CLC_GENTYPE *Expected, __CLC_GENTYPE Desired); +#else +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value); +#endif -#undef FUNCTION +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DEFINE_ATOMIC() +#endif + +#undef __CLC_DEFINE_ATOMIC + +#endif // HAVE_FP_ATOMIC || HAVE_INT_ATOMIC + +#undef HAVE_INT_ATOMIC +#undef HAVE_FP_ATOMIC +#undef HAVE_64_ATOMIC + +#endif // __CLC_SCALAR diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc b/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc new file mode 100644 index 0000000000000..e060e3aaea161 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define __CLC_DECLARE_ATOMIC(ADDRSPACE, TYPE) \ + _CLC_OVERLOAD _CLC_DECL TYPE FUNCTION(volatile ADDRSPACE TYPE *, TYPE); + +#define __CLC_DECLARE_ATOMIC_ADDRSPACE(TYPE) \ + __CLC_DECLARE_ATOMIC(global, TYPE) \ + __CLC_DECLARE_ATOMIC(local, TYPE) + +__CLC_DECLARE_ATOMIC_ADDRSPACE(int) +__CLC_DECLARE_ATOMIC_ADDRSPACE(uint) + +#undef __CLC_DECLARE_ATOMIC_ADDRSPACE +#undef __CLC_DECLARE_ATOMIC + +#undef FUNCTION diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h b/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h new file mode 100644 index 0000000000000..3949bc13401f2 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__ + +#define FUNCTION atomic_exchange + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h new file mode 100644 index 0000000000000..972c1fa69fe7b --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ + +#define FUNCTION atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h new file mode 100644 index 0000000000000..fdac049a74d3f --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ + +#define FUNCTION atomic_fetch_and + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h new file mode 100644 index 0000000000000..513b60fec2727 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ + +#define FUNCTION atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h new file mode 100644 index 0000000000000..c961c4a641656 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ + +#define FUNCTION atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h new file mode 100644 index 0000000000000..25923e3647e36 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ + +#define FUNCTION atomic_fetch_or + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h new file mode 100644 index 0000000000000..b307c30a298b3 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ + +#define FUNCTION atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h new file mode 100644 index 0000000000000..52510d018574d --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ + +#define FUNCTION atomic_fetch_xor + +#define __CLC_BODY +#include + +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_load.h b/libclc/opencl/include/clc/opencl/atomic/atomic_load.h new file mode 100644 index 0000000000000..3998a4de9452b --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_load.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ + +#define FUNCTION atomic_load +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_NO_VALUE_ARG +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_max.h b/libclc/opencl/include/clc/opencl/atomic/atomic_max.h index 667fa36f16f9d..6b95ad7e68d94 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_max.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_max.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_max -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_MAX_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_min.h b/libclc/opencl/include/clc/opencl/atomic/atomic_min.h index 91bb636eec875..c1dfacb40b746 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_min.h +++ 
b/libclc/opencl/include/clc/opencl/atomic/atomic_min.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_min -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_MIN_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_or.h b/libclc/opencl/include/clc/opencl/atomic/atomic_or.h index 5c03fd157a2bc..30c32fe4889d5 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_or.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_or.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_or -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_OR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_store.h b/libclc/opencl/include/clc/opencl/atomic/atomic_store.h new file mode 100644 index 0000000000000..4893a5b88df03 --- /dev/null +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_store.h @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ +#define __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ + +#define FUNCTION atomic_store +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#undef __CLC_RETURN_VOID +#undef FUNCTION + +#endif // __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h b/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h index 25ffe9ff4a9b7..1e7ac5505b071 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_sub -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_SUB_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h b/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h index 6b4206dedb820..043d7825483e4 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h @@ -15,6 +15,6 @@ _CLC_OVERLOAD _CLC_DECL float FUNCTION(volatile local float *, float); _CLC_OVERLOAD _CLC_DECL float FUNCTION(volatile global float *, float); -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_XCHG_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h b/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h index e94560cb6b9ed..a9bee007b9344 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h @@ -12,6 +12,6 @@ #include #define FUNCTION atomic_xor -#include +#include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_XOR_H__ diff --git a/libclc/opencl/lib/generic/SOURCES b/libclc/opencl/lib/generic/SOURCES index a59a82ee325ec..61757efbcaad7 100644 --- a/libclc/opencl/lib/generic/SOURCES +++ b/libclc/opencl/lib/generic/SOURCES @@ -8,24 +8,36 @@ 
atomic/atom_add.cl atomic/atom_and.cl atomic/atom_cmpxchg.cl atomic/atom_dec.cl -atomic/atom_inc.cl -atomic/atom_max.cl -atomic/atom_min.cl -atomic/atom_or.cl -atomic/atom_sub.cl -atomic/atom_xchg.cl -atomic/atom_xor.cl atomic/atomic_add.cl atomic/atomic_and.cl atomic/atomic_cmpxchg.cl +atomic/atomic_compare_exchange_strong.cl +atomic/atomic_compare_exchange_weak.cl atomic/atomic_dec.cl +atomic/atomic_exchange.cl +atomic/atomic_fetch_add.cl +atomic/atomic_fetch_and.cl +atomic/atomic_fetch_max.cl +atomic/atomic_fetch_min.cl +atomic/atomic_fetch_or.cl +atomic/atomic_fetch_sub.cl +atomic/atomic_fetch_xor.cl atomic/atomic_inc.cl +atomic/atomic_load.cl atomic/atomic_max.cl atomic/atomic_min.cl atomic/atomic_or.cl +atomic/atomic_store.cl atomic/atomic_sub.cl atomic/atomic_xchg.cl atomic/atomic_xor.cl +atomic/atom_inc.cl +atomic/atom_max.cl +atomic/atom_min.cl +atomic/atom_or.cl +atomic/atom_sub.cl +atomic/atom_xchg.cl +atomic/atom_xor.cl common/degrees.cl common/mix.cl common/radians.cl diff --git a/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl new file mode 100644 index 0000000000000..422c03f292071 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_compare_exchange_strong +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl new file mode 100644 index 0000000000000..8a6b3c4f0110e --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_compare_exchange_weak +#define __CLC_COMPARE_EXCHANGE + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_dec.cl b/libclc/opencl/lib/generic/atomic/atomic_dec.cl index 6f18cdf13428a..6de55bc0b9845 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_dec.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_dec.cl @@ -6,15 +6,11 @@ // //===----------------------------------------------------------------------===// +#include #include -#define IMPL(TYPE, AS) \ - _CLC_OVERLOAD _CLC_DEF TYPE atomic_dec(volatile AS TYPE *p) { \ - return __sync_fetch_and_sub(p, (TYPE)1); \ - } +#define FUNCTION atomic_dec +#define __IMPL_FUNCTION __clc_atomic_dec -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +#define __CLC_BODY +#include diff --git a/libclc/opencl/lib/generic/atomic/atomic_def.inc b/libclc/opencl/lib/generic/atomic/atomic_def.inc new file mode 100644 index 0000000000000..ce192bf844938 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_def.inc @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR + +#if defined(__opencl_c_fp64) && (defined(cl_khr_int64_base_atomics) && \ + defined(cl_khr_int64_extended_atomics)) +#define HAVE_64_ATOMIC +#endif +#if defined(__CLC_FPSIZE) && (__CLC_FPSIZE < 64 || defined(HAVE_64_ATOMIC) +#define HAVE_FP_ATOMIC +#endif +#if defined(__CLC_GENSIZE) && \ + ((__CLC_GENSIZE == 32) || \ + (__CLC_GENSIZE == 64 && defined(HAVE_64_ATOMIC))) +#define HAVE_INT_ATOMIC +#endif +#if defined(HAVE_FP_ATOMIC) || defined(HAVE_INT_ATOMIC) + +#define __CLC_ATOMIC_GENTYPE __CLC_XCONCAT(atomic_, __CLC_GENTYPE) + +#ifdef __CLC_NO_VALUE_ARG +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr) { \ + return __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, \ + __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + } +#elif defined(__CLC_RETURN_VOID) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF void FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value) { \ + __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Value, \ + __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + } +#elif defined(__CLC_COMPARE_EXCHANGE) +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, \ + ADDRSPACE __CLC_GENTYPE *Expected, __CLC_GENTYPE Desired) { \ + __CLC_GENTYPE Comparator = *Expected; \ + __CLC_GENTYPE RetValue = __clc_atomic_compare_exchange( \ + (volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Comparator, Desired, \ + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); \ + if (Comparator != RetValue) { \ + *Expected = RetValue; \ + return true; \ + } \ + return false; \ + } +#else +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + volatile 
ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value) { \ + return __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Value, \ + __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + } +#endif + +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) +#if _CLC_GENERIC_AS_SUPPORTED +__CLC_DEFINE_ATOMIC() +#endif + +#undef __CLC_DEFINE_ATOMIC + +#endif // HAVE_FP_ATOMIC || HAVE_INT_ATOMIC + +#undef HAVE_INT_ATOMIC +#undef HAVE_FP_ATOMIC +#undef HAVE_64_ATOMIC + +#endif // __CLC_SCALAR diff --git a/libclc/opencl/lib/generic/atomic/atomic_exchange.cl b/libclc/opencl/lib/generic/atomic/atomic_exchange.cl new file mode 100644 index 0000000000000..6dae6c0a77599 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_exchange.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_exchange +#define __IMPL_FUNCTION __clc_atomic_exchange + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl new file mode 100644 index 0000000000000..bbaa1c2b0dacf --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_add +#define __IMPL_FUNCTION __clc_atomic_fetch_add + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl new file mode 100644 index 0000000000000..73925844c9357 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_and +#define __IMPL_FUNCTION __clc_atomic_fetch_and + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl new file mode 100644 index 0000000000000..8c8ce11cc575f --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_max +#define __IMPL_FUNCTION __clc_atomic_fetch_max + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl new file mode 100644 index 0000000000000..550459cee32d6 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_min +#define __IMPL_FUNCTION __clc_atomic_fetch_min + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl new file mode 100644 index 0000000000000..2606ff3c99673 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_or +#define __IMPL_FUNCTION __clc_atomic_fetch_or + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl new file mode 100644 index 0000000000000..33772233bebed --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_sub +#define __IMPL_FUNCTION __clc_atomic_fetch_sub + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl new file mode 100644 index 0000000000000..6f6503e588b6f --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_fetch_xor +#define __IMPL_FUNCTION __clc_atomic_fetch_xor + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_inc.cl b/libclc/opencl/lib/generic/atomic/atomic_inc.cl index 13349e5432e5c..a160b2e2370fc 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_inc.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_inc.cl @@ -6,15 +6,11 @@ // //===----------------------------------------------------------------------===// +#include #include -#define IMPL(TYPE, AS) \ - _CLC_OVERLOAD _CLC_DEF TYPE atomic_inc(volatile AS TYPE *p) { \ - return __sync_fetch_and_add(p, (TYPE)1); \ - } +#define FUNCTION atomic_inc +#define __IMPL_FUNCTION __clc_atomic_inc -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +#define __CLC_BODY +#include diff --git a/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc b/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc new file mode 100644 index 0000000000000..0bcf300dd284a --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifdef __CLC_SCALAR + +#if __CLC_GENSIZE == 32 + +#define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr) { \ + return __IMPL_FUNCTION(Ptr, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + } + +__CLC_DEFINE_ATOMIC(global) +__CLC_DEFINE_ATOMIC(local) + +#undef __CLC_DEFINE_ATOMIC + +#endif // __CLC_GENSIZE == 32 + +#endif // __CLC_SCALAR diff --git a/libclc/opencl/lib/generic/atomic/atomic_load.cl b/libclc/opencl/lib/generic/atomic/atomic_load.cl new file mode 100644 index 0000000000000..459265473a8c8 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_load.cl @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_load +#define __IMPL_FUNCTION __clc_atomic_load +#define __CLC_NO_VALUE_ARG + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) diff --git a/libclc/opencl/lib/generic/atomic/atomic_store.cl b/libclc/opencl/lib/generic/atomic/atomic_store.cl new file mode 100644 index 0000000000000..67f2c8457fc10 --- /dev/null +++ b/libclc/opencl/lib/generic/atomic/atomic_store.cl @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if defined(__opencl_c_atomic_order_seq_cst) && \ + defined(__opencl_c_atomic_scope_device) + +#include +#include + +#define FUNCTION atomic_store +#define __IMPL_FUNCTION __clc_atomic_store +#define __CLC_RETURN_VOID + +#define __CLC_BODY +#include + +#define __CLC_BODY +#include + +#endif // defined(__opencl_c_atomic_order_seq_cst) && + // defined(__opencl_c_atomic_scope_device) From cd6311b50d4b9d87d986213db76d9810efd7df7e Mon Sep 17 00:00:00 2001 From: Connector Switch Date: Fri, 18 Jul 2025 08:09:38 +0800 Subject: [PATCH 274/813] [flang] Implement `COSPI` (#149343) This feature is added in the Fortran 2023 standard. --- .../flang/Optimizer/Builder/IntrinsicCall.h | 1 + flang/lib/Evaluate/intrinsics.cpp | 1 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 16 ++++++++++++++ flang/test/Lower/Intrinsics/cospi.f90 | 22 +++++++++++++++++++ 4 files changed, 40 insertions(+) create mode 100644 flang/test/Lower/Intrinsics/cospi.f90 diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 01801dbdaffca..acdba7c49e6b3 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -245,6 +245,7 @@ struct IntrinsicLibrary { fir::ExtendedValue genCPtrCompare(mlir::Type, llvm::ArrayRef); mlir::Value genCosd(mlir::Type, llvm::ArrayRef); + mlir::Value genCospi(mlir::Type, llvm::ArrayRef); void genDateAndTime(llvm::ArrayRef); mlir::Value genDim(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genDotProduct(mlir::Type, diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 4773e136c41cb..9957010684d48 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -428,6 
+428,7 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"conjg", {{"z", SameComplex}}, SameComplex}, {"cos", {{"x", SameFloating}}, SameFloating}, {"cosd", {{"x", SameFloating}}, SameFloating}, + {"cospi", {{"x", SameFloating}}, SameFloating}, {"cosh", {{"x", SameFloating}}, SameFloating}, {"coshape", {{"coarray", AnyData, Rank::coarray}, SizeDefaultKIND}, KINDInt, Rank::vector, IntrinsicClass::inquiryFunction}, diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 7aa5602d2bc84..d77a656158a37 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -396,6 +396,7 @@ static constexpr IntrinsicHandler handlers[]{ {"command_argument_count", &I::genCommandArgumentCount}, {"conjg", &I::genConjg}, {"cosd", &I::genCosd}, + {"cospi", &I::genCospi}, {"count", &I::genCount, {{{"mask", asAddr}, {"dim", asValue}, {"kind", asValue}}}, @@ -3623,6 +3624,21 @@ mlir::Value IntrinsicLibrary::genCosd(mlir::Type resultType, return getRuntimeCallGenerator("cos", ftype)(builder, loc, {arg}); } +// COSPI +mlir::Value IntrinsicLibrary::genCospi(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 1); + mlir::MLIRContext *context = builder.getContext(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); + llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi); + mlir::Value dfactor = + builder.createRealConstant(loc, mlir::Float64Type::get(context), pi); + mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); + mlir::Value arg = builder.create(loc, args[0], factor); + return getRuntimeCallGenerator("cos", ftype)(builder, loc, {arg}); +} + // COUNT fir::ExtendedValue IntrinsicLibrary::genCount(mlir::Type resultType, diff --git a/flang/test/Lower/Intrinsics/cospi.f90 b/flang/test/Lower/Intrinsics/cospi.f90 new file mode 100644 index 0000000000000..894002566141b --- /dev/null 
+++ b/flang/test/Lower/Intrinsics/cospi.f90 @@ -0,0 +1,22 @@ +! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefixes="CHECK" + +function test_real4(x) + real :: x, test_real4 + test_real4 = cospi(x) +end function + +! CHECK-LABEL: @_QPtest_real4 +! CHECK: %[[dfactor:.*]] = arith.constant 3.1415926535897931 : f64 +! CHECK: %[[factor:.*]] = fir.convert %[[dfactor]] : (f64) -> f32 +! CHECK: %[[mul:.*]] = arith.mulf %{{.*}}, %[[factor]] fastmath : f32 +! CHECK: %[[cos:.*]] = math.cos %[[mul]] fastmath : f32 + +function test_real8(x) + real(8) :: x, test_real8 + test_real8 = cospi(x) +end function + +! CHECK-LABEL: @_QPtest_real8 +! CHECK: %[[dfactor:.*]] = arith.constant 3.1415926535897931 : f64 +! CHECK: %[[mul:.*]] = arith.mulf %{{.*}}, %[[dfactor]] fastmath : f64 +! CHECK: %[[cos:.*]] = math.cos %[[mul]] fastmath : f64 From 28417e6459bb5174b9502f440e3dbb86f7a0046e Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 17 Jul 2025 17:29:28 -0700 Subject: [PATCH 275/813] [IA] Support vp.load in lowerInterleavedLoad [nfc-ish] (#149174) This continues in the direction started by commit 4b81dc7. We essentially merges the handling for VPLoad - currently in lowerInterleavedVPLoad - into the existing dedicated routine. This removes the last use of the dedicate lowerInterleavedVPLoad and thus we can remove it. This isn't quite NFC as the main callback has support for the strided load optimization whereas the VPLoad specific version didn't. So this adds the ability to form a strided load for a vp.load deinterleave with one shuffle used. 
--- llvm/include/llvm/CodeGen/TargetLowering.h | 19 +- llvm/lib/CodeGen/InterleavedAccessPass.cpp | 31 +-- .../Target/AArch64/AArch64ISelLowering.cpp | 7 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 7 +- llvm/lib/Target/ARM/ARMISelLowering.h | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 5 +- .../Target/RISCV/RISCVInterleavedAccess.cpp | 181 +++++------------- llvm/lib/Target/X86/X86ISelLowering.h | 2 +- llvm/lib/Target/X86/X86InterleavedAccess.cpp | 7 +- .../rvv/fixed-vectors-interleaved-access.ll | 6 +- 11 files changed, 87 insertions(+), 182 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 084b788d51828..1a548a536f088 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3201,11 +3201,15 @@ class LLVM_ABI TargetLoweringBase { /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// - /// \p LI is the vector load instruction. + /// \p Load is the vector load instruction. Can be either a plain load + /// instruction or a vp.load intrinsic. + /// \p Mask is a per-segment (i.e. number of lanes equal to that of one + /// component being interwoven) mask. Can be nullptr, in which case the + /// result is uncondiitional. /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector. /// \p Indices is the corresponding indices for each shufflevector. /// \p Factor is the interleave factor. - virtual bool lowerInterleavedLoad(LoadInst *LI, + virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { @@ -3223,17 +3227,6 @@ class LLVM_ABI TargetLoweringBase { return false; } - /// Lower an interleaved load to target specific intrinsics. Return - /// true on success. - /// - /// \p Load is a vp.load instruction. 
- /// \p Mask is a mask value - /// \p DeinterleaveRes is a list of deinterleaved results. - virtual bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const { - return false; - } - /// Lower an interleaved store to target specific intrinsics. Return /// true on success. /// diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index d43cd46f8ad82..d2b2edf2ebc80 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -367,34 +367,23 @@ bool InterleavedAccessImpl::lowerInterleavedLoad( bool BinOpShuffleChanged = replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load); + Value *Mask = nullptr; if (auto *VPLoad = dyn_cast(Load)) { - Value *LaneMask = - getMask(VPLoad->getMaskParam(), Factor, cast(VecTy)); - if (!LaneMask) + Mask = getMask(VPLoad->getMaskParam(), Factor, cast(VecTy)); + if (!Mask) return false; - LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load: " << *Load << "\n"); - - // Sometimes the number of Shuffles might be less than Factor, we have to - // fill the gaps with null. Also, lowerInterleavedVPLoad - // expects them to be sorted. - SmallVector ShuffleValues(Factor, nullptr); - for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices)) - ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx]; - if (!TLI->lowerInterleavedVPLoad(VPLoad, LaneMask, ShuffleValues)) - // If Extracts is not empty, tryReplaceExtracts made changes earlier. - return !Extracts.empty() || BinOpShuffleChanged; } else { LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n"); - - // Try to create target specific intrinsics to replace the load and - // shuffles. - if (!TLI->lowerInterleavedLoad(cast(Load), Shuffles, Indices, - Factor)) - // If Extracts is not empty, tryReplaceExtracts made changes earlier. 
- return !Extracts.empty() || BinOpShuffleChanged; } + // Try to create target specific intrinsics to replace the load and + // shuffles. + if (!TLI->lowerInterleavedLoad(cast(Load), Mask, Shuffles, + Indices, Factor)) + // If Extracts is not empty, tryReplaceExtracts made changes earlier. + return !Extracts.empty() || BinOpShuffleChanged; + DeadInsts.insert_range(Shuffles); DeadInsts.insert(Load); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ff23f76fadccd..d04e6c45e2103 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -17155,7 +17155,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -17163,6 +17163,11 @@ bool AArch64TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + const DataLayout &DL = LI->getDataLayout(); VectorType *VTy = Shuffles[0]->getType(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 7b1de3d3254f2..713793ec77da3 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -211,7 +211,7 @@ class AArch64TargetLowering : public TargetLowering { unsigned getMaxSupportedInterleaveFactor() const override { return 4; } - bool lowerInterleavedLoad(LoadInst *LI, + bool 
lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8d139883ef913..fd3b0525c1056 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21584,7 +21584,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -21592,6 +21592,11 @@ bool ARMTargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + auto *VecTy = cast(Shuffles[0]->getType()); Type *EltTy = VecTy->getElementType(); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 5f4aef55b22c9..9159f3d2c3ed0 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -681,7 +681,7 @@ class VectorType; unsigned getMaxSupportedInterleaveFactor() const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index a5d735c407e5c..e0a8c07b4206e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -429,7 +429,7 @@ class RISCVTargetLowering : public 
TargetLowering { bool fallBackToDAGISel(const Instruction &Inst) const override; - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; @@ -444,9 +444,6 @@ class RISCVTargetLowering : public TargetLowering { Instruction *Store, Value *Mask, ArrayRef InterleaveValues) const override; - bool lowerInterleavedVPLoad(VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveRes) const override; - bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask, ArrayRef InterleaveOps) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 0d4f24172b574..38cc0ce00a352 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -115,21 +115,49 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool RISCVTargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Indices.size() == Shuffles.size()); - IRBuilder<> Builder(LI); - - const DataLayout &DL = LI->getDataLayout(); + IRBuilder<> Builder(Load); + const DataLayout &DL = Load->getDataLayout(); auto *VTy = cast(Shuffles[0]->getType()); - if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(), - LI->getPointerAddressSpace(), DL)) - return false; + auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); + + Value *Ptr, *VL; + Align Alignment; + if (auto *LI = dyn_cast(Load)) { + assert(LI->isSimple()); + Ptr = LI->getPointerOperand(); + Alignment = LI->getAlign(); + assert(!Mask && "Unexpected mask on a load\n"); + Mask = Builder.getAllOnesMask(VTy->getElementCount()); + VL = 
Builder.CreateElementCount(XLenTy, VTy->getElementCount()); + } else { + auto *VPLoad = cast(Load); + assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load && + "Unexpected intrinsic"); + Ptr = VPLoad->getMemoryPointerParam(); + Alignment = VPLoad->getPointerAlignment().value_or( + DL.getABITypeAlign(VTy->getElementType())); - auto *PtrTy = LI->getPointerOperandType(); - auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); + assert(Mask && "vp.load needs a mask!"); + + Value *WideEVL = VPLoad->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, DL, Factor)) + return false; + + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + } + + Type *PtrTy = Ptr->getType(); + unsigned AS = PtrTy->getPointerAddressSpace(); + if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL)) + return false; // If the segment load is going to be performed segment at a time anyways // and there's only one element used, use a strided load instead. 
This @@ -138,26 +166,23 @@ bool RISCVTargetLowering::lowerInterleavedLoad( unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType()); Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes); Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes); - Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); - Value *VL = Builder.CreateElementCount(Builder.getInt32Ty(), - VTy->getElementCount()); - + Value *BasePtr = Builder.CreatePtrAdd(Ptr, Offset); + // Note: Same VL as above, but i32 not xlen due to signature of + // vp.strided.load + VL = Builder.CreateElementCount(Builder.getInt32Ty(), + VTy->getElementCount()); CallInst *CI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load, {VTy, BasePtr->getType(), Stride->getType()}, {BasePtr, Stride, Mask, VL}); - CI->addParamAttr( - 0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign())); + CI->addParamAttr(0, + Attribute::getWithAlignment(CI->getContext(), Alignment)); Shuffles[0]->replaceAllUsesWith(CI); return true; }; - Value *VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount()); - Value *Mask = Builder.getAllOnesMask(VTy->getElementCount()); CallInst *VlsegN = Builder.CreateIntrinsic( - FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, - {LI->getPointerOperand(), Mask, VL}); + FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL}); for (unsigned i = 0; i < Shuffles.size(); i++) { Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]); @@ -426,122 +451,6 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( return true; } -/// Lower an interleaved vp.load into a vlsegN intrinsic. -/// -/// E.g. 
Lower an interleaved vp.load (Factor = 2): -/// %l = call @llvm.vp.load.nxv64i8.p0(ptr %ptr, -/// %mask, -/// i32 %wide.rvl) -/// %dl = tail call { , } -/// @llvm.vector.deinterleave2.nxv64i8( -/// %l) -/// %r0 = extractvalue { , } %dl, 0 -/// %r1 = extractvalue { , } %dl, 1 -/// -/// Into: -/// %rvl = udiv %wide.rvl, 2 -/// %sl = call { , } -/// @llvm.riscv.vlseg2.mask.nxv32i8.i64( undef, -/// undef, -/// ptr %ptr, -/// %mask, -/// i64 %rvl, -/// i64 1) -/// %r0 = extractvalue { , } %sl, 0 -/// %r1 = extractvalue { , } %sl, 1 -/// -/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be -/// removed by the caller -/// TODO: We probably can loosen the dependency on matching extractvalue when -/// dealing with factor of 2 (extractvalue is still required for most of other -/// factors though). -bool RISCVTargetLowering::lowerInterleavedVPLoad( - VPIntrinsic *Load, Value *Mask, - ArrayRef DeinterleaveResults) const { - const unsigned Factor = DeinterleaveResults.size(); - assert(Mask && "Expect a valid mask"); - assert(Load->getIntrinsicID() == Intrinsic::vp_load && - "Unexpected intrinsic"); - - Value *FirstActive = *llvm::find_if(DeinterleaveResults, - [](Value *V) { return V != nullptr; }); - VectorType *VTy = cast(FirstActive->getType()); - - auto &DL = Load->getModule()->getDataLayout(); - Align Alignment = Load->getParamAlign(0).value_or( - DL.getABITypeAlign(VTy->getElementType())); - if (!isLegalInterleavedAccessType( - VTy, Factor, Alignment, - Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL)) - return false; - - IRBuilder<> Builder(Load); - - Value *WideEVL = Load->getVectorLengthParam(); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. 
- if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) - return false; - - auto *PtrTy = Load->getArgOperand(0)->getType(); - auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen()); - auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); - Value *EVL = - Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); - - Value *Return = nullptr; - if (isa(VTy)) { - Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2], - {VTy, PtrTy, XLenTy}, - {Load->getArgOperand(0), Mask, EVL}); - } else { - unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType()); - unsigned NumElts = VTy->getElementCount().getKnownMinValue(); - Type *VecTupTy = TargetExtType::get( - Load->getContext(), "riscv.vector.tuple", - ScalableVectorType::get(Type::getInt8Ty(Load->getContext()), - NumElts * SEW / 8), - Factor); - - Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), ScalableVlsegIntrIds[Factor - 2], - {VecTupTy, PtrTy, Mask->getType(), EVL->getType()}); - - Value *Operands[] = { - PoisonValue::get(VecTupTy), - Load->getArgOperand(0), - Mask, - EVL, - ConstantInt::get(XLenTy, - RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC), - ConstantInt::get(XLenTy, Log2_64(SEW))}; - - CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); - - SmallVector AggrTypes{Factor, VTy}; - Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes)); - Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( - Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy}); - for (unsigned i = 0; i < Factor; ++i) { - Value *VecExtract = - Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)}); - Return = Builder.CreateInsertValue(Return, VecExtract, i); - } - } - - for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) { - if (!DIO) - continue; - // We have to create a brand new ExtractValue to replace each - // of these old ExtractValue instructions. 
- Value *NewEV = - Builder.CreateExtractValue(Return, {static_cast(Idx)}); - DIO->replaceAllUsesWith(NewEV); - } - - return true; -} - /// Lower an interleaved vp.store into a vssegN intrinsic. /// /// E.g. Lower an interleaved vp.store (Factor = 2): diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 6bcb7a36e91b5..26369792db26d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1661,7 +1661,7 @@ namespace llvm { /// Lower interleaved load(s) into target specific /// instructions/intrinsics. - bool lowerInterleavedLoad(LoadInst *LI, + bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp index 1eb47e3b2cd18..360293bce54e8 100644 --- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp +++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp @@ -801,7 +801,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // number of shuffles and ISA. // Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX. bool X86TargetLowering::lowerInterleavedLoad( - LoadInst *LI, ArrayRef Shuffles, + Instruction *Load, Value *Mask, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); @@ -809,6 +809,11 @@ bool X86TargetLowering::lowerInterleavedLoad( assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); + auto *LI = dyn_cast(Load); + if (!LI) + return false; + assert(!Mask && "Unexpected mask on a load"); + // Create an interleaved access group. 
IRBuilder<> Builder(LI); X86InterleavedAccessGroup Grp(LI, Shuffles, Indices, Factor, Subtarget, diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index 019bbe2908a2c..dbc8e891ab5f7 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -1721,8 +1721,9 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) { define <4 x i32> @vp_load_factor3_one_active(ptr %ptr) { ; CHECK-LABEL: vp_load_factor3_one_active: ; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 12 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: vlse32.v v8, (a0), a1 ; CHECK-NEXT: ret %interleaved.vec = tail call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %ptr, <12 x i1> splat (i1 true), i32 12) %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> @@ -1732,8 +1733,9 @@ define <4 x i32> @vp_load_factor3_one_active(ptr %ptr) { define <4 x i32> @vp_load_factor5_one_active(ptr %ptr) { ; CHECK-LABEL: vp_load_factor5_one_active: ; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 20 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vlseg5e32.v v8, (a0) +; CHECK-NEXT: vlse32.v v8, (a0), a1 ; CHECK-NEXT: ret %interleaved.vec = tail call <20 x i32> @llvm.vp.load.v20i32.p0(ptr %ptr, <20 x i1> splat (i1 true), i32 20) %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> From 4a9eaad9e1283b872788832d5bce7e7945b97c78 Mon Sep 17 00:00:00 2001 From: Yanzuo Liu Date: Fri, 18 Jul 2025 09:01:47 +0800 Subject: [PATCH 276/813] [Clang][AST][NFC] Introduce `NamespaceBaseDecl` (#149123) Add `NamespaceBaseDecl` as common base class of `NamespaceDecl` and `NamespaceAliasDecl`. This simplifies `NestedNameSpecifier` a bit. 
Co-authored-by: Matheus Izvekov --- .../bugprone/MoveForwardingReferenceCheck.cpp | 2 +- .../clang-tidy/misc/UnusedAliasDeclsCheck.cpp | 3 +- .../utils/RenamerClangTidyCheck.cpp | 3 +- clang-tools-extra/clangd/AST.cpp | 10 ++- clang-tools-extra/clangd/CodeComplete.cpp | 1 - clang-tools-extra/clangd/DumpAST.cpp | 3 - clang-tools-extra/clangd/FindTarget.cpp | 3 - clang-tools-extra/clangd/IncludeFixer.cpp | 36 +++++---- .../clangd/refactor/tweaks/AddUsing.cpp | 8 +- .../include-cleaner/lib/WalkAST.cpp | 1 - clang/include/clang/AST/AbstractBasicReader.h | 7 +- clang/include/clang/AST/AbstractBasicWriter.h | 6 +- clang/include/clang/AST/Decl.h | 22 ++++- clang/include/clang/AST/DeclCXX.h | 24 +++--- clang/include/clang/AST/NestedNameSpecifier.h | 47 +++-------- clang/include/clang/AST/PropertiesBase.td | 1 + clang/include/clang/AST/RecursiveASTVisitor.h | 2 - clang/include/clang/ASTMatchers/ASTMatchers.h | 6 +- clang/include/clang/Basic/DeclNodes.td | 5 +- clang/include/clang/Sema/DeclSpec.h | 25 ++---- .../Refactoring/RecursiveSymbolVisitor.h | 3 +- clang/lib/AST/ASTContext.cpp | 54 +++++-------- clang/lib/AST/ASTImporter.cpp | 14 +--- clang/lib/AST/ASTStructuralEquivalence.cpp | 3 - clang/lib/AST/DeclCXX.cpp | 20 +++-- clang/lib/AST/ItaniumMangle.cpp | 14 +--- clang/lib/AST/NestedNameSpecifier.cpp | 80 +++++-------------- clang/lib/AST/ODRHash.cpp | 3 - clang/lib/AST/QualTypeNames.cpp | 11 +-- clang/lib/AST/TextNodeDumper.cpp | 4 - clang/lib/ExtractAPI/DeclarationFragments.cpp | 15 +--- clang/lib/Index/IndexTypeSourceInfo.cpp | 4 - clang/lib/Parse/ParseDeclCXX.cpp | 3 +- clang/lib/Sema/DeclSpec.cpp | 15 +--- clang/lib/Sema/SemaCXXScopeSpec.cpp | 6 +- clang/lib/Sema/SemaDeclCXX.cpp | 4 +- clang/lib/Sema/SemaExprCXX.cpp | 1 - clang/lib/Sema/SemaLookup.cpp | 13 ++- clang/lib/Sema/SemaTemplate.cpp | 1 - clang/lib/Sema/TreeTransform.h | 14 +--- clang/lib/Serialization/ASTReader.cpp | 9 +-- clang/lib/Serialization/ASTReaderDecl.cpp | 2 +- 
clang/lib/Serialization/ASTWriter.cpp | 5 -- clang/lib/Tooling/Syntax/BuildTree.cpp | 1 - clang/tools/libclang/CIndex.cpp | 11 --- .../NestedNameSpecifiers.cpp | 4 +- clang/unittests/Tooling/RefactoringTest.cpp | 3 +- 47 files changed, 168 insertions(+), 364 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp index 33642c407a3a9..bfa2ab51a6d03 100644 --- a/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/MoveForwardingReferenceCheck.cpp @@ -45,7 +45,7 @@ static void replaceMoveWithForward(const UnresolvedLookupExpr *Callee, // We still conservatively put a "std::" in front of the forward because // we don't know whether the code also had a "using std::forward;". Diag << FixItHint::CreateReplacement(CallRange, "std::" + ForwardName); - } else if (const NamespaceDecl *Namespace = NNS->getAsNamespace()) { + } else if (const NamespaceBaseDecl *Namespace = NNS->getAsNamespace()) { if (Namespace->getName() == "std") { if (!NNS->getPrefix()) { // Called as "std::move". 
diff --git a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp index 2dfaca19a8981..86992cd8a141b 100644 --- a/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UnusedAliasDeclsCheck.cpp @@ -36,7 +36,8 @@ void UnusedAliasDeclsCheck::check(const MatchFinder::MatchResult &Result) { if (const auto *NestedName = Result.Nodes.getNodeAs("nns")) { - if (const auto *AliasDecl = NestedName->getAsNamespaceAlias()) { + if (const auto *AliasDecl = dyn_cast_if_present( + NestedName->getAsNamespace())) { FoundDecls[AliasDecl] = CharSourceRange(); } } diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index 6cf38ddf3d914..dd28806e008ed 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -282,7 +282,8 @@ class RenamerClangTidyVisitor bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc Loc) { if (const NestedNameSpecifier *Spec = Loc.getNestedNameSpecifier()) { - if (const NamespaceDecl *Decl = Spec->getAsNamespace()) + if (const auto *Decl = + dyn_cast_if_present(Spec->getAsNamespace())) Check->addUsage(Decl, Loc.getLocalSourceRange(), SM); } diff --git a/clang-tools-extra/clangd/AST.cpp b/clang-tools-extra/clangd/AST.cpp index e274236527817..f2631e5abb6a3 100644 --- a/clang-tools-extra/clangd/AST.cpp +++ b/clang-tools-extra/clangd/AST.cpp @@ -666,12 +666,14 @@ std::string getQualification(ASTContext &Context, return getQualification( Context, DestContext, ND->getDeclContext(), [&](NestedNameSpecifier *NNS) { - if (NNS->getKind() != NestedNameSpecifier::Namespace) + const NamespaceDecl *NS = + dyn_cast_if_present(NNS->getAsNamespace()); + if (!NS) return false; - const auto *CanonNSD = NNS->getAsNamespace()->getCanonicalDecl(); + NS = NS->getCanonicalDecl(); return 
llvm::any_of(VisibleNamespaceDecls, - [CanonNSD](const NamespaceDecl *NSD) { - return NSD->getCanonicalDecl() == CanonNSD; + [NS](const NamespaceDecl *NSD) { + return NSD->getCanonicalDecl() == NS; }); }); } diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index d5907e3143bf6..184c3c962f063 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -1470,7 +1470,6 @@ bool allowIndex(CodeCompletionContext &CC) { switch (NameSpec->getKind()) { case NestedNameSpecifier::Global: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: return true; case NestedNameSpecifier::Super: case NestedNameSpecifier::TypeSpec: diff --git a/clang-tools-extra/clangd/DumpAST.cpp b/clang-tools-extra/clangd/DumpAST.cpp index 8f24477ecd3de..c6075e75e9a6b 100644 --- a/clang-tools-extra/clangd/DumpAST.cpp +++ b/clang-tools-extra/clangd/DumpAST.cpp @@ -158,7 +158,6 @@ class DumpVisitor : public RecursiveASTVisitor { NNS_KIND(TypeSpec); NNS_KIND(Global); NNS_KIND(Super); - NNS_KIND(NamespaceAlias); #undef NNS_KIND } llvm_unreachable("Unhandled SpecifierKind enum"); @@ -281,8 +280,6 @@ class DumpVisitor : public RecursiveASTVisitor { return NNS.getAsIdentifier()->getName().str() + "::"; case NestedNameSpecifier::Namespace: return NNS.getAsNamespace()->getNameAsString() + "::"; - case NestedNameSpecifier::NamespaceAlias: - return NNS.getAsNamespaceAlias()->getNameAsString() + "::"; default: return ""; } diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index 91fd3b0f8567b..b1089577ba819 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -491,9 +491,6 @@ struct TargetFinder { case NestedNameSpecifier::Namespace: add(NNS->getAsNamespace(), Flags); return; - case NestedNameSpecifier::NamespaceAlias: - add(NNS->getAsNamespaceAlias(), Flags); - return; case 
NestedNameSpecifier::Identifier: if (Resolver) { add(Resolver->resolveNestedNameSpecifierToType(NNS), Flags); diff --git a/clang-tools-extra/clangd/IncludeFixer.cpp b/clang-tools-extra/clangd/IncludeFixer.cpp index 4ff021c4c390a..50bc2bd7ccb94 100644 --- a/clang-tools-extra/clangd/IncludeFixer.cpp +++ b/clang-tools-extra/clangd/IncludeFixer.cpp @@ -403,25 +403,27 @@ std::optional extractUnresolvedNameCheaply( if (auto *Nested = SS->getScopeRep()) { if (Nested->getKind() == NestedNameSpecifier::Global) { Result.ResolvedScope = ""; - } else if (const auto *NS = Nested->getAsNamespace()) { - std::string SpecifiedNS = printNamespaceScope(*NS); - std::optional Spelling = getSpelledSpecifier(*SS, SM); - - // Check the specifier spelled in the source. - // If the resolved scope doesn't end with the spelled scope, the - // resolved scope may come from a sema typo correction. For example, - // sema assumes that "clangd::" is a typo of "clang::" and uses - // "clang::" as the specified scope in: - // namespace clang { clangd::X; } - // In this case, we use the "typo" specifier as extra scope instead - // of using the scope assumed by sema. - if (!Spelling || llvm::StringRef(SpecifiedNS).ends_with(*Spelling)) { - Result.ResolvedScope = std::move(SpecifiedNS); + } else if (const NamespaceBaseDecl *NSB = Nested->getAsNamespace()) { + if (const auto *NS = dyn_cast(NSB)) { + std::string SpecifiedNS = printNamespaceScope(*NS); + std::optional Spelling = getSpelledSpecifier(*SS, SM); + + // Check the specifier spelled in the source. + // If the resolved scope doesn't end with the spelled scope, the + // resolved scope may come from a sema typo correction. For example, + // sema assumes that "clangd::" is a typo of "clang::" and uses + // "clang::" as the specified scope in: + // namespace clang { clangd::X; } + // In this case, we use the "typo" specifier as extra scope instead + // of using the scope assumed by sema. 
+ if (!Spelling || llvm::StringRef(SpecifiedNS).ends_with(*Spelling)) { + Result.ResolvedScope = std::move(SpecifiedNS); + } else { + Result.UnresolvedScope = std::move(*Spelling); + } } else { - Result.UnresolvedScope = std::move(*Spelling); + Result.ResolvedScope = printNamespaceScope(*cast(NSB)->getNamespace()); } - } else if (const auto *ANS = Nested->getAsNamespaceAlias()) { - Result.ResolvedScope = printNamespaceScope(*ANS->getNamespace()); } else { // We don't fix symbols in scopes that are not top-level e.g. class // members, as we don't collect includes for them. diff --git a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp index 00c05ebdb5216..67fc451a6a1a1 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/AddUsing.cpp @@ -173,7 +173,8 @@ findInsertionPoint(const Tweak::Selection &Inputs, if (SM.isBeforeInTranslationUnit(Inputs.Cursor, U->getUsingLoc())) // "Usings" is sorted, so we're done. 
break; - if (const auto *Namespace = U->getQualifier()->getAsNamespace()) { + if (const auto *Namespace = dyn_cast_if_present( + U->getQualifier()->getAsNamespace())) { if (Namespace->getCanonicalDecl() == QualifierToRemove.getNestedNameSpecifier() ->getAsNamespace() @@ -232,7 +233,10 @@ findInsertionPoint(const Tweak::Selection &Inputs, bool isNamespaceForbidden(const Tweak::Selection &Inputs, const NestedNameSpecifier &Namespace) { - std::string NamespaceStr = printNamespaceScope(*Namespace.getAsNamespace()); + const auto *NS = dyn_cast(Namespace.getAsNamespace()); + if (!NS) + return true; + std::string NamespaceStr = printNamespaceScope(*NS); for (StringRef Banned : Config::current().Style.FullyQualifiedNamespaces) { StringRef PrefixMatch = NamespaceStr; diff --git a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp index baff90faa6eae..49cc13606f4c2 100644 --- a/clang-tools-extra/include-cleaner/lib/WalkAST.cpp +++ b/clang-tools-extra/include-cleaner/lib/WalkAST.cpp @@ -140,7 +140,6 @@ class ASTWalker : public RecursiveASTVisitor { return true; switch (Qual->getKind()) { case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Global: return true; case NestedNameSpecifier::TypeSpec: diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h index 514f4cef3a694..0a2db9e205c7c 100644 --- a/clang/include/clang/AST/AbstractBasicReader.h +++ b/clang/include/clang/AST/AbstractBasicReader.h @@ -269,12 +269,7 @@ class DataStreamBasicReader : public BasicReaderBase { case NestedNameSpecifier::Namespace: cur = NestedNameSpecifier::Create(ctx, cur, - asImpl().readNamespaceDeclRef()); - continue; - - case NestedNameSpecifier::NamespaceAlias: - cur = NestedNameSpecifier::Create(ctx, cur, - asImpl().readNamespaceAliasDeclRef()); + asImpl().readNamespaceBaseDeclRef()); continue; case NestedNameSpecifier::TypeSpec: diff --git 
a/clang/include/clang/AST/AbstractBasicWriter.h b/clang/include/clang/AST/AbstractBasicWriter.h index fedde8a2e46c5..c105bbbe45c92 100644 --- a/clang/include/clang/AST/AbstractBasicWriter.h +++ b/clang/include/clang/AST/AbstractBasicWriter.h @@ -251,11 +251,7 @@ class DataStreamBasicWriter : public BasicWriterBase { continue; case NestedNameSpecifier::Namespace: - asImpl().writeNamespaceDeclRef(NNS->getAsNamespace()); - continue; - - case NestedNameSpecifier::NamespaceAlias: - asImpl().writeNamespaceAliasDeclRef(NNS->getAsNamespaceAlias()); + asImpl().writeNamespaceBaseDeclRef(NNS->getAsNamespace()); continue; case NestedNameSpecifier::TypeSpec: diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index c75e29c861f82..08fe1f881503b 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -565,8 +565,28 @@ class LabelDecl : public NamedDecl { static bool classofKind(Kind K) { return K == Label; } }; +/// Represents C++ namespaces and their aliases. +/// +/// FIXME: Move `NamespaceBaseDecl` and `NamespaceDecl` to "DeclCXX.h" or +/// explain why not moving. +class NamespaceBaseDecl : public NamedDecl { +protected: + using NamedDecl::NamedDecl; + +public: + NamespaceDecl *getNamespace(); + const NamespaceDecl *getNamespace() const { + return const_cast(this)->getNamespace(); + } + + static bool classof(const Decl *D) { return classofKind(D->getKind()); } + static bool classofKind(Kind K) { + return K >= firstNamespaceBase && K <= lastNamespaceBase; + } +}; + /// Represent a C++ namespace. 
-class NamespaceDecl : public NamedDecl, +class NamespaceDecl : public NamespaceBaseDecl, public DeclContext, public Redeclarable { /// The starting location of the source range, pointing diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 77bc3cad72ed9..33ae3d604020b 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -3186,7 +3186,7 @@ class UsingDirectiveDecl : public NamedDecl { /// \code /// namespace Foo = Bar; /// \endcode -class NamespaceAliasDecl : public NamedDecl, +class NamespaceAliasDecl : public NamespaceBaseDecl, public Redeclarable { friend class ASTDeclReader; @@ -3203,14 +3203,14 @@ class NamespaceAliasDecl : public NamedDecl, /// The Decl that this alias points to, either a NamespaceDecl or /// a NamespaceAliasDecl. - NamedDecl *Namespace; + NamespaceBaseDecl *Namespace; NamespaceAliasDecl(ASTContext &C, DeclContext *DC, SourceLocation NamespaceLoc, SourceLocation AliasLoc, IdentifierInfo *Alias, NestedNameSpecifierLoc QualifierLoc, - SourceLocation IdentLoc, NamedDecl *Namespace) - : NamedDecl(NamespaceAlias, DC, AliasLoc, Alias), redeclarable_base(C), - NamespaceLoc(NamespaceLoc), IdentLoc(IdentLoc), + SourceLocation IdentLoc, NamespaceBaseDecl *Namespace) + : NamespaceBaseDecl(NamespaceAlias, DC, AliasLoc, Alias), + redeclarable_base(C), NamespaceLoc(NamespaceLoc), IdentLoc(IdentLoc), QualifierLoc(QualifierLoc), Namespace(Namespace) {} void anchor() override; @@ -3222,13 +3222,11 @@ class NamespaceAliasDecl : public NamedDecl, NamespaceAliasDecl *getMostRecentDeclImpl() override; public: - static NamespaceAliasDecl *Create(ASTContext &C, DeclContext *DC, - SourceLocation NamespaceLoc, - SourceLocation AliasLoc, - IdentifierInfo *Alias, - NestedNameSpecifierLoc QualifierLoc, - SourceLocation IdentLoc, - NamedDecl *Namespace); + static NamespaceAliasDecl * + Create(ASTContext &C, DeclContext *DC, SourceLocation NamespaceLoc, + SourceLocation AliasLoc, IdentifierInfo 
*Alias, + NestedNameSpecifierLoc QualifierLoc, SourceLocation IdentLoc, + NamespaceBaseDecl *Namespace); static NamespaceAliasDecl *CreateDeserialized(ASTContext &C, GlobalDeclID ID); @@ -3282,7 +3280,7 @@ class NamespaceAliasDecl : public NamedDecl, /// Retrieve the namespace that this alias refers to, which /// may either be a NamespaceDecl or a NamespaceAliasDecl. - NamedDecl *getAliasedNamespace() const { return Namespace; } + NamespaceBaseDecl *getAliasedNamespace() const { return Namespace; } SourceRange getSourceRange() const override LLVM_READONLY { return SourceRange(NamespaceLoc, IdentLoc); diff --git a/clang/include/clang/AST/NestedNameSpecifier.h b/clang/include/clang/AST/NestedNameSpecifier.h index 952c79753d10a..1614f9d7c94e4 100644 --- a/clang/include/clang/AST/NestedNameSpecifier.h +++ b/clang/include/clang/AST/NestedNameSpecifier.h @@ -31,8 +31,7 @@ class ASTContext; class CXXRecordDecl; class IdentifierInfo; class LangOptions; -class NamespaceAliasDecl; -class NamespaceDecl; +class NamespaceBaseDecl; struct PrintingPolicy; class Type; class TypeLoc; @@ -79,12 +78,9 @@ class NestedNameSpecifier : public llvm::FoldingSetNode { /// An identifier, stored as an IdentifierInfo*. Identifier, - /// A namespace, stored as a NamespaceDecl*. + /// A namespace-like entity, stored as a NamespaceBaseDecl*. Namespace, - /// A namespace alias, stored as a NamespaceAliasDecl*. - NamespaceAlias, - /// A type, stored as a Type*. TypeSpec, @@ -121,15 +117,10 @@ class NestedNameSpecifier : public llvm::FoldingSetNode { NestedNameSpecifier *Prefix, const IdentifierInfo *II); - /// Builds a nested name specifier that names a namespace. - static NestedNameSpecifier *Create(const ASTContext &Context, - NestedNameSpecifier *Prefix, - const NamespaceDecl *NS); - - /// Builds a nested name specifier that names a namespace alias. + /// Builds a nested name specifier that names a namespace or namespace alias. 
static NestedNameSpecifier *Create(const ASTContext &Context, NestedNameSpecifier *Prefix, - const NamespaceAliasDecl *Alias); + const NamespaceBaseDecl *NS); /// Builds a nested name specifier that names a type. static NestedNameSpecifier * @@ -174,13 +165,9 @@ class NestedNameSpecifier : public llvm::FoldingSetNode { return nullptr; } - /// Retrieve the namespace stored in this nested name + /// Retrieve the namespace or namespace alias stored in this nested name /// specifier. - NamespaceDecl *getAsNamespace() const; - - /// Retrieve the namespace alias stored in this nested name - /// specifier. - NamespaceAliasDecl *getAsNamespaceAlias() const; + NamespaceBaseDecl *getAsNamespace() const; /// Retrieve the record declaration stored in this nested name /// specifier. @@ -425,29 +412,15 @@ class NestedNameSpecifierLocBuilder { /// \param Context The AST context in which this nested-name-specifier /// resides. /// - /// \param Namespace The namespace. + /// \param Namespace The namespace or namespace alias. /// - /// \param NamespaceLoc The location of the namespace name. + /// \param NamespaceLoc The location of the namespace name or the namespace + // alias. /// /// \param ColonColonLoc The location of the trailing '::'. - void Extend(ASTContext &Context, NamespaceDecl *Namespace, + void Extend(ASTContext &Context, NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc); - /// Extend the current nested-name-specifier by another - /// nested-name-specifier component of the form 'namespace-alias::'. - /// - /// \param Context The AST context in which this nested-name-specifier - /// resides. - /// - /// \param Alias The namespace alias. - /// - /// \param AliasLoc The location of the namespace alias - /// name. - /// - /// \param ColonColonLoc The location of the trailing '::'. 
- void Extend(ASTContext &Context, NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, SourceLocation ColonColonLoc); - /// Turn this (empty) nested-name-specifier into the global /// nested-name-specifier '::'. void MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc); diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td index 1215056ffde1b..0438e4dfbafac 100644 --- a/clang/include/clang/AST/PropertiesBase.td +++ b/clang/include/clang/AST/PropertiesBase.td @@ -91,6 +91,7 @@ def DeclRef : RefPropertyType<"Decl"> { let ConstWhenWriting = 1; } SubclassPropertyType<"FunctionDecl", DeclRef>; def NamedDeclRef : SubclassPropertyType<"NamedDecl", DeclRef>; + def NamespaceBaseDeclRef : SubclassPropertyType<"NamespaceBaseDecl", DeclRef>; def NamespaceDeclRef : SubclassPropertyType<"NamespaceDecl", DeclRef>; def NamespaceAliasDeclRef : diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 5cb2f57edffe4..519a811775c01 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -789,7 +789,6 @@ bool RecursiveASTVisitor::TraverseNestedNameSpecifier( switch (NNS->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: return true; @@ -813,7 +812,6 @@ bool RecursiveASTVisitor::TraverseNestedNameSpecifierLoc( switch (NNS.getNestedNameSpecifier()->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: return true; diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h index b364b6556d0b3..08c898f7758ec 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ 
b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -7894,9 +7894,9 @@ AST_MATCHER_P_OVERLOAD(NestedNameSpecifierLoc, hasPrefix, /// matches "ns::" AST_MATCHER_P(NestedNameSpecifier, specifiesNamespace, internal::Matcher, InnerMatcher) { - if (!Node.getAsNamespace()) - return false; - return InnerMatcher.matches(*Node.getAsNamespace(), Finder, Builder); + if (auto *NS = dyn_cast_if_present(Node.getAsNamespace())) + return InnerMatcher.matches(*NS, Finder, Builder); + return false; } /// Matches attributes. diff --git a/clang/include/clang/Basic/DeclNodes.td b/clang/include/clang/Basic/DeclNodes.td index f1ebaf1db3fc0..8d6731b50f509 100644 --- a/clang/include/clang/Basic/DeclNodes.td +++ b/clang/include/clang/Basic/DeclNodes.td @@ -15,9 +15,10 @@ def PragmaComment : DeclNode; def PragmaDetectMismatch : DeclNode; def ExternCContext : DeclNode, DeclContext; def Named : DeclNode; - def Namespace : DeclNode, DeclContext; + def NamespaceBase : DeclNode; + def Namespace : DeclNode, DeclContext; + def NamespaceAlias : DeclNode; def UsingDirective : DeclNode; - def NamespaceAlias : DeclNode; def Label : DeclNode; def Type : DeclNode; def TypedefName : DeclNode; diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index 4b7c8d609735f..e5680813e74de 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -44,8 +44,7 @@ namespace clang { class TypeLoc; class LangOptions; class IdentifierInfo; - class NamespaceAliasDecl; - class NamespaceDecl; + class NamespaceBaseDecl; class ObjCDeclSpec; class Sema; class Declarator; @@ -129,29 +128,15 @@ class CXXScopeSpec { /// \param Context The AST context in which this nested-name-specifier /// resides. /// - /// \param Namespace The namespace. + /// \param Namespace The namespace or the namespace alias. /// - /// \param NamespaceLoc The location of the namespace name. + /// \param NamespaceLoc The location of the namespace name or the namespace + /// alias. 
/// /// \param ColonColonLoc The location of the trailing '::'. - void Extend(ASTContext &Context, NamespaceDecl *Namespace, + void Extend(ASTContext &Context, NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc); - /// Extend the current nested-name-specifier by another - /// nested-name-specifier component of the form 'namespace-alias::'. - /// - /// \param Context The AST context in which this nested-name-specifier - /// resides. - /// - /// \param Alias The namespace alias. - /// - /// \param AliasLoc The location of the namespace alias - /// name. - /// - /// \param ColonColonLoc The location of the trailing '::'. - void Extend(ASTContext &Context, NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, SourceLocation ColonColonLoc); - /// Turn this (empty) nested-name-specifier into the global /// nested-name-specifier '::'. void MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc); diff --git a/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h b/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h index 015dbba26f688..271232e66626e 100644 --- a/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h +++ b/clang/include/clang/Tooling/Refactoring/RecursiveSymbolVisitor.h @@ -115,7 +115,8 @@ class RecursiveSymbolVisitor // The base visitor will visit NNSL prefixes, so we should only look at // the current NNS. 
if (NNS) { - const NamespaceDecl *ND = NNS.getNestedNameSpecifier()->getAsNamespace(); + const auto *ND = dyn_cast_if_present( + NNS.getNestedNameSpecifier()->getAsNamespace()); if (!visit(ND, NNS.getLocalBeginLoc(), NNS.getLocalEndLoc())) return false; } diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 50bd93a143a28..232a4b6557b92 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -7387,21 +7387,9 @@ bool ASTContext::isSameDefaultTemplateArgument(const NamedDecl *X, return hasSameTemplateName(TAX.getAsTemplate(), TAY.getAsTemplate()); } -static NamespaceDecl *getNamespace(const NestedNameSpecifier *X) { - if (auto *NS = X->getAsNamespace()) - return NS; - if (auto *NAS = X->getAsNamespaceAlias()) - return NAS->getNamespace(); - return nullptr; -} - static bool isSameQualifier(const NestedNameSpecifier *X, const NestedNameSpecifier *Y) { - if (auto *NSX = getNamespace(X)) { - auto *NSY = getNamespace(Y); - if (!NSY || NSX->getCanonicalDecl() != NSY->getCanonicalDecl()) - return false; - } else if (X->getKind() != Y->getKind()) + if (X->getKind() != Y->getKind()) return false; // FIXME: For namespaces and types, we're permitted to check that the entity @@ -7412,8 +7400,8 @@ static bool isSameQualifier(const NestedNameSpecifier *X, return false; break; case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: - // We've already checked that we named the same namespace. + if (!declaresSameEntity(X->getAsNamespace(), Y->getAsNamespace())) + return false; break; case NestedNameSpecifier::TypeSpec: if (X->getAsType()->getCanonicalTypeInternal() != @@ -7836,17 +7824,10 @@ ASTContext::getCanonicalNestedNameSpecifier(NestedNameSpecifier *NNS) const { NNS->getAsIdentifier()); case NestedNameSpecifier::Namespace: - // A namespace is canonical; build a nested-name-specifier with - // this namespace and no prefix. 
- return NestedNameSpecifier::Create(*this, nullptr, - NNS->getAsNamespace()->getFirstDecl()); - - case NestedNameSpecifier::NamespaceAlias: // A namespace is canonical; build a nested-name-specifier with // this namespace and no prefix. return NestedNameSpecifier::Create( - *this, nullptr, - NNS->getAsNamespaceAlias()->getNamespace()->getFirstDecl()); + *this, nullptr, NNS->getAsNamespace()->getNamespace()->getFirstDecl()); // The difference between TypeSpec and TypeSpecWithTemplate is that the // latter will have the 'template' keyword when printed. @@ -13698,26 +13679,27 @@ static NestedNameSpecifier *getCommonNNS(ASTContext &Ctx, R = NestedNameSpecifier::Create(Ctx, P, II); break; } - case NestedNameSpecifier::SpecifierKind::Namespace: - case NestedNameSpecifier::SpecifierKind::NamespaceAlias: { - assert(K2 == NestedNameSpecifier::SpecifierKind::Namespace || - K2 == NestedNameSpecifier::SpecifierKind::NamespaceAlias); + case NestedNameSpecifier::SpecifierKind::Namespace: { + assert(K2 == NestedNameSpecifier::SpecifierKind::Namespace); // The prefixes for namespaces are not significant, its declaration // identifies it uniquely. NestedNameSpecifier *P = ::getCommonNNS(Ctx, NNS1->getPrefix(), NNS2->getPrefix(), /*IsSame=*/false); - NamespaceAliasDecl *A1 = NNS1->getAsNamespaceAlias(), - *A2 = NNS2->getAsNamespaceAlias(); - // Are they the same namespace alias? - if (declaresSameEntity(A1, A2)) { - R = NestedNameSpecifier::Create(Ctx, P, ::getCommonDeclChecked(A1, A2)); + NamespaceBaseDecl *Namespace1 = NNS1->getAsNamespace(), + *Namespace2 = NNS2->getAsNamespace(); + auto Kind = Namespace1->getKind(); + if (Kind != Namespace2->getKind() || + (Kind == Decl::NamespaceAlias && + !declaresSameEntity(Namespace1, Namespace2))) { + R = NestedNameSpecifier::Create( + Ctx, P, + ::getCommonDeclChecked(Namespace1->getNamespace(), + Namespace2->getNamespace())); break; } - // Otherwise, look at the namespaces only. - NamespaceDecl *N1 = A1 ? 
A1->getNamespace() : NNS1->getAsNamespace(), - *N2 = A2 ? A2->getNamespace() : NNS2->getAsNamespace(); - R = NestedNameSpecifier::Create(Ctx, P, ::getCommonDeclChecked(N1, N2)); + R = NestedNameSpecifier::Create( + Ctx, P, ::getCommonDeclChecked(Namespace1, Namespace2)); break; } case NestedNameSpecifier::SpecifierKind::TypeSpec: { diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 4d3bd985739fb..b5f6c5a8c6abe 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -10063,17 +10063,10 @@ ASTImporter::Import(NestedNameSpecifier *FromNNS) { case NestedNameSpecifier::Namespace: if (ExpectedDecl NSOrErr = Import(FromNNS->getAsNamespace())) { return NestedNameSpecifier::Create(ToContext, Prefix, - cast(*NSOrErr)); + cast(*NSOrErr)); } else return NSOrErr.takeError(); - case NestedNameSpecifier::NamespaceAlias: - if (ExpectedDecl NSADOrErr = Import(FromNNS->getAsNamespaceAlias())) - return NestedNameSpecifier::Create(ToContext, Prefix, - cast(*NSADOrErr)); - else - return NSADOrErr.takeError(); - case NestedNameSpecifier::Global: return NestedNameSpecifier::GlobalSpecifier(ToContext); @@ -10139,11 +10132,6 @@ ASTImporter::Import(NestedNameSpecifierLoc FromNNS) { ToLocalEndLoc); break; - case NestedNameSpecifier::NamespaceAlias: - Builder.Extend(getToContext(), Spec->getAsNamespaceAlias(), - ToLocalBeginLoc, ToLocalEndLoc); - break; - case NestedNameSpecifier::TypeSpec: { SourceLocation ToTLoc; if (Error Err = importInto(ToTLoc, NNS.getTypeLoc().getBeginLoc())) diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 3aa6b37844103..289c6d7737de7 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -598,9 +598,6 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, case NestedNameSpecifier::Namespace: return IsStructurallyEquivalent(Context, NNS1->getAsNamespace(), NNS2->getAsNamespace()); 
- case NestedNameSpecifier::NamespaceAlias: - return IsStructurallyEquivalent(Context, NNS1->getAsNamespaceAlias(), - NNS2->getAsNamespaceAlias()); case NestedNameSpecifier::TypeSpec: return IsStructurallyEquivalent(Context, QualType(NNS1->getAsType(), 0), QualType(NNS2->getAsType(), 0)); diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 4514965009793..673e3f73858c7 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -3211,6 +3211,12 @@ UsingDirectiveDecl *UsingDirectiveDecl::CreateDeserialized(ASTContext &C, SourceLocation(), nullptr, nullptr); } +NamespaceDecl *NamespaceBaseDecl::getNamespace() { + if (auto *Alias = dyn_cast(this)) + return Alias->getNamespace(); + return cast(this); +} + NamespaceDecl *UsingDirectiveDecl::getNominatedNamespace() { if (auto *NA = dyn_cast_or_null(NominatedNamespace)) return NA->getNamespace(); @@ -3221,7 +3227,7 @@ NamespaceDecl::NamespaceDecl(ASTContext &C, DeclContext *DC, bool Inline, SourceLocation StartLoc, SourceLocation IdLoc, IdentifierInfo *Id, NamespaceDecl *PrevDecl, bool Nested) - : NamedDecl(Namespace, DC, IdLoc, Id), DeclContext(Namespace), + : NamespaceBaseDecl(Namespace, DC, IdLoc, Id), DeclContext(Namespace), redeclarable_base(C), LocStart(StartLoc) { setInline(Inline); setNested(Nested); @@ -3268,13 +3274,11 @@ NamespaceAliasDecl *NamespaceAliasDecl::getMostRecentDeclImpl() { return getMostRecentDecl(); } -NamespaceAliasDecl *NamespaceAliasDecl::Create(ASTContext &C, DeclContext *DC, - SourceLocation UsingLoc, - SourceLocation AliasLoc, - IdentifierInfo *Alias, - NestedNameSpecifierLoc QualifierLoc, - SourceLocation IdentLoc, - NamedDecl *Namespace) { +NamespaceAliasDecl *NamespaceAliasDecl::Create( + ASTContext &C, DeclContext *DC, SourceLocation UsingLoc, + SourceLocation AliasLoc, IdentifierInfo *Alias, + NestedNameSpecifierLoc QualifierLoc, SourceLocation IdentLoc, + NamespaceBaseDecl *Namespace) { // FIXME: Preserve the aliased namespace as written. 
if (auto *NS = dyn_cast_or_null(Namespace)) Namespace = NS->getFirstDecl(); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 0520987ce6b3a..6d082b31a9caa 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -1384,14 +1384,6 @@ void CXXNameMangler::mangleUnresolvedPrefix(NestedNameSpecifier *qualifier, Out << "sr"; mangleSourceNameWithAbiTags(qualifier->getAsNamespace()); break; - case NestedNameSpecifier::NamespaceAlias: - if (qualifier->getPrefix()) - mangleUnresolvedPrefix(qualifier->getPrefix(), - /*recursive*/ true); - else - Out << "sr"; - mangleSourceNameWithAbiTags(qualifier->getAsNamespaceAlias()); - break; case NestedNameSpecifier::TypeSpec: { const Type *type = qualifier->getAsType(); @@ -2185,11 +2177,7 @@ void CXXNameMangler::manglePrefix(NestedNameSpecifier *qualifier) { llvm_unreachable("Can't mangle __super specifier"); case NestedNameSpecifier::Namespace: - mangleName(qualifier->getAsNamespace()); - return; - - case NestedNameSpecifier::NamespaceAlias: - mangleName(qualifier->getAsNamespaceAlias()->getNamespace()); + mangleName(qualifier->getAsNamespace()->getNamespace()); return; case NestedNameSpecifier::TypeSpec: diff --git a/clang/lib/AST/NestedNameSpecifier.cpp b/clang/lib/AST/NestedNameSpecifier.cpp index db1ad89565189..56f74b92412d2 100644 --- a/clang/lib/AST/NestedNameSpecifier.cpp +++ b/clang/lib/AST/NestedNameSpecifier.cpp @@ -66,10 +66,9 @@ NestedNameSpecifier *NestedNameSpecifier::Create(const ASTContext &Context, return FindOrInsert(Context, Mockup); } -NestedNameSpecifier * -NestedNameSpecifier::Create(const ASTContext &Context, - NestedNameSpecifier *Prefix, - const NamespaceDecl *NS) { +NestedNameSpecifier *NestedNameSpecifier::Create(const ASTContext &Context, + NestedNameSpecifier *Prefix, + const NamespaceBaseDecl *NS) { assert(NS && "Namespace cannot be NULL"); assert((!Prefix || (Prefix->getAsType() == nullptr && @@ -78,23 +77,7 @@ 
NestedNameSpecifier::Create(const ASTContext &Context, NestedNameSpecifier Mockup; Mockup.Prefix.setPointer(Prefix); Mockup.Prefix.setInt(StoredDecl); - Mockup.Specifier = const_cast(NS); - return FindOrInsert(Context, Mockup); -} - -NestedNameSpecifier * -NestedNameSpecifier::Create(const ASTContext &Context, - NestedNameSpecifier *Prefix, - const NamespaceAliasDecl *Alias) { - assert(Alias && "Namespace alias cannot be NULL"); - assert((!Prefix || - (Prefix->getAsType() == nullptr && - Prefix->getAsIdentifier() == nullptr)) && - "Broken nested name specifier"); - NestedNameSpecifier Mockup; - Mockup.Prefix.setPointer(Prefix); - Mockup.Prefix.setInt(StoredDecl); - Mockup.Specifier = const_cast(Alias); + Mockup.Specifier = const_cast(NS); return FindOrInsert(Context, Mockup); } @@ -147,9 +130,7 @@ NestedNameSpecifier::SpecifierKind NestedNameSpecifier::getKind() const { case StoredDecl: { NamedDecl *ND = static_cast(Specifier); - if (isa(ND)) - return Super; - return isa(ND) ? Namespace : NamespaceAlias; + return isa(ND) ? Super : Namespace; } case StoredTypeSpec: @@ -159,18 +140,11 @@ NestedNameSpecifier::SpecifierKind NestedNameSpecifier::getKind() const { llvm_unreachable("Invalid NNS Kind!"); } -/// Retrieve the namespace stored in this nested name specifier. -NamespaceDecl *NestedNameSpecifier::getAsNamespace() const { +/// Retrieve the namespace or namespace alias stored in this nested name +/// specifier. +NamespaceBaseDecl *NestedNameSpecifier::getAsNamespace() const { if (Prefix.getInt() == StoredDecl) - return dyn_cast(static_cast(Specifier)); - - return nullptr; -} - -/// Retrieve the namespace alias stored in this nested name specifier. 
-NamespaceAliasDecl *NestedNameSpecifier::getAsNamespaceAlias() const { - if (Prefix.getInt() == StoredDecl) - return dyn_cast(static_cast(Specifier)); + return dyn_cast(static_cast(Specifier)); return nullptr; } @@ -204,7 +178,6 @@ NestedNameSpecifierDependence NestedNameSpecifier::getDependence() const { } case Namespace: - case NamespaceAlias: case Global: return NestedNameSpecifierDependence::None; @@ -284,7 +257,6 @@ NestedNameSpecifier::translateToType(const ASTContext &Context) const { } case SpecifierKind::Global: case SpecifierKind::Namespace: - case SpecifierKind::NamespaceAlias: case SpecifierKind::Super: // These are not representable as types. return nullptr; @@ -305,16 +277,16 @@ void NestedNameSpecifier::print(raw_ostream &OS, const PrintingPolicy &Policy, OS << getAsIdentifier()->getName(); break; - case Namespace: - if (getAsNamespace()->isAnonymousNamespace()) - return; - - OS << getAsNamespace()->getName(); - break; - - case NamespaceAlias: - OS << getAsNamespaceAlias()->getName(); + case Namespace: { + NamespaceBaseDecl *Namespace = getAsNamespace(); + if (const auto *NS = dyn_cast(Namespace)) { + assert(!NS->isAnonymousNamespace()); + OS << NS->getName(); + } else { + OS << cast(Namespace)->getName(); + } break; + } case Global: OS << "::"; @@ -367,7 +339,6 @@ NestedNameSpecifierLoc::getLocalDataLength(NestedNameSpecifier *Qualifier) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Super: // The location of the identifier or namespace name. 
Length += sizeof(SourceLocation::UIntTy); @@ -418,7 +389,6 @@ SourceRange NestedNameSpecifierLoc::getLocalSourceRange() const { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Super: return SourceRange( LoadSourceLocation(Data, Offset), @@ -569,7 +539,7 @@ void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, } void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, - NamespaceDecl *Namespace, + NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc) { Representation = NestedNameSpecifier::Create(Context, Representation, @@ -580,17 +550,6 @@ void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, SaveSourceLocation(ColonColonLoc, Buffer, BufferSize, BufferCapacity); } -void NestedNameSpecifierLocBuilder::Extend(ASTContext &Context, - NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, - SourceLocation ColonColonLoc) { - Representation = NestedNameSpecifier::Create(Context, Representation, Alias); - - // Push source-location info into the buffer. 
- SaveSourceLocation(AliasLoc, Buffer, BufferSize, BufferCapacity); - SaveSourceLocation(ColonColonLoc, Buffer, BufferSize, BufferCapacity); -} - void NestedNameSpecifierLocBuilder::MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc) { assert(!Representation && "Already have a nested-name-specifier!?"); @@ -627,7 +586,6 @@ void NestedNameSpecifierLocBuilder::MakeTrivial(ASTContext &Context, switch (NNS->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: SaveSourceLocation(R.getBegin(), Buffer, BufferSize, BufferCapacity); break; diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp index 7fdfcfa3014f3..bd87d4418484b 100644 --- a/clang/lib/AST/ODRHash.cpp +++ b/clang/lib/AST/ODRHash.cpp @@ -127,9 +127,6 @@ void ODRHash::AddNestedNameSpecifier(const NestedNameSpecifier *NNS) { case NestedNameSpecifier::Namespace: AddDecl(NNS->getAsNamespace()); break; - case NestedNameSpecifier::NamespaceAlias: - AddDecl(NNS->getAsNamespaceAlias()); - break; case NestedNameSpecifier::TypeSpec: AddType(NNS->getAsType()); break; diff --git a/clang/lib/AST/QualTypeNames.cpp b/clang/lib/AST/QualTypeNames.cpp index 39703d6d7b882..b43bcd8d1f1c1 100644 --- a/clang/lib/AST/QualTypeNames.cpp +++ b/clang/lib/AST/QualTypeNames.cpp @@ -218,16 +218,7 @@ static NestedNameSpecifier *getFullyQualifiedNestedNameSpecifier( return Scope; case NestedNameSpecifier::Namespace: return TypeName::createNestedNameSpecifier( - Ctx, Scope->getAsNamespace(), WithGlobalNsPrefix); - case NestedNameSpecifier::NamespaceAlias: - // Namespace aliases are only valid for the duration of the - // scope where they were introduced, and therefore are often - // invalid at the end of the TU. So use the namespace name more - // likely to be valid at the end of the TU. 
- return TypeName::createNestedNameSpecifier( - Ctx, - Scope->getAsNamespaceAlias()->getNamespace()->getCanonicalDecl(), - WithGlobalNsPrefix); + Ctx, Scope->getAsNamespace()->getNamespace(), WithGlobalNsPrefix); case NestedNameSpecifier::Identifier: // A function or some other construct that makes it un-namable // at the end of the TU. Skip the current component of the name, diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 9d7c2757d6ee4..3d9397fb0b540 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -1050,10 +1050,6 @@ void clang::TextNodeDumper::dumpNestedNameSpecifier(const NestedNameSpecifier *N OS << " "; // "Namespace" is printed as the decl kind. dumpBareDeclRef(NNS->getAsNamespace()); break; - case NestedNameSpecifier::NamespaceAlias: - OS << " "; // "NamespaceAlias" is printed as the decl kind. - dumpBareDeclRef(NNS->getAsNamespaceAlias()); - break; case NestedNameSpecifier::TypeSpec: OS << " TypeSpec"; dumpType(QualType(NNS->getAsType(), 0)); diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index 791afc1a97575..51a6f6b779e77 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -220,8 +220,9 @@ DeclarationFragmentsBuilder::getFragmentsForNNS(const NestedNameSpecifier *NNS, break; case NestedNameSpecifier::Namespace: { - const NamespaceDecl *NS = NNS->getAsNamespace(); - if (NS->isAnonymousNamespace()) + const NamespaceBaseDecl *NS = NNS->getAsNamespace(); + if (const auto *Namespace = dyn_cast(NS); + Namespace && Namespace->isAnonymousNamespace()) return Fragments; SmallString<128> USR; index::generateUSRForDecl(NS, USR); @@ -230,16 +231,6 @@ DeclarationFragmentsBuilder::getFragmentsForNNS(const NestedNameSpecifier *NNS, break; } - case NestedNameSpecifier::NamespaceAlias: { - const NamespaceAliasDecl *Alias = NNS->getAsNamespaceAlias(); - SmallString<128> USR; - 
index::generateUSRForDecl(Alias, USR); - Fragments.append(Alias->getName(), - DeclarationFragments::FragmentKind::Identifier, USR, - Alias); - break; - } - case NestedNameSpecifier::Global: // The global specifier `::` at the beginning. No stored value. break; diff --git a/clang/lib/Index/IndexTypeSourceInfo.cpp b/clang/lib/Index/IndexTypeSourceInfo.cpp index 98b5513128fbe..adc33b3abd822 100644 --- a/clang/lib/Index/IndexTypeSourceInfo.cpp +++ b/clang/lib/Index/IndexTypeSourceInfo.cpp @@ -271,10 +271,6 @@ void IndexingContext::indexNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS, handleReference(NNS.getNestedNameSpecifier()->getAsNamespace(), Loc, Parent, DC, SymbolRoleSet()); break; - case NestedNameSpecifier::NamespaceAlias: - handleReference(NNS.getNestedNameSpecifier()->getAsNamespaceAlias(), - Loc, Parent, DC, SymbolRoleSet()); - break; case NestedNameSpecifier::TypeSpec: indexTypeLoc(NNS.getTypeLoc(), Parent, DC); diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 9cae4f9a23ef0..31392d1dd8d4b 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -591,8 +591,7 @@ bool Parser::ParseUsingDeclarator(DeclaratorContext Context, NextToken().isRegularKeywordAttribute() || NextToken().is(tok::kw___attribute)) && D.SS.isNotEmpty() && LastII == Tok.getIdentifierInfo() && - !D.SS.getScopeRep()->getAsNamespace() && - !D.SS.getScopeRep()->getAsNamespaceAlias()) { + D.SS.getScopeRep()->getKind() != NestedNameSpecifier::Namespace) { SourceLocation IdLoc = ConsumeToken(); ParsedType Type = Actions.getInheritingConstructorName(D.SS, IdLoc, *LastII); diff --git a/clang/lib/Sema/DeclSpec.cpp b/clang/lib/Sema/DeclSpec.cpp index ee5a862c32509..f0f1d66f66e93 100644 --- a/clang/lib/Sema/DeclSpec.cpp +++ b/clang/lib/Sema/DeclSpec.cpp @@ -72,7 +72,7 @@ void CXXScopeSpec::Extend(ASTContext &Context, IdentifierInfo *Identifier, "NestedNameSpecifierLoc range computation incorrect"); } -void 
CXXScopeSpec::Extend(ASTContext &Context, NamespaceDecl *Namespace, +void CXXScopeSpec::Extend(ASTContext &Context, NamespaceBaseDecl *Namespace, SourceLocation NamespaceLoc, SourceLocation ColonColonLoc) { Builder.Extend(Context, Namespace, NamespaceLoc, ColonColonLoc); @@ -85,19 +85,6 @@ void CXXScopeSpec::Extend(ASTContext &Context, NamespaceDecl *Namespace, "NestedNameSpecifierLoc range computation incorrect"); } -void CXXScopeSpec::Extend(ASTContext &Context, NamespaceAliasDecl *Alias, - SourceLocation AliasLoc, - SourceLocation ColonColonLoc) { - Builder.Extend(Context, Alias, AliasLoc, ColonColonLoc); - - if (Range.getBegin().isInvalid()) - Range.setBegin(AliasLoc); - Range.setEnd(ColonColonLoc); - - assert(Range == Builder.getSourceRange() && - "NestedNameSpecifierLoc range computation incorrect"); -} - void CXXScopeSpec::MakeGlobal(ASTContext &Context, SourceLocation ColonColonLoc) { Builder.MakeGlobal(Context, ColonColonLoc); diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp index ab83f625d2849..6ac04837708f6 100644 --- a/clang/lib/Sema/SemaCXXScopeSpec.cpp +++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp @@ -137,10 +137,7 @@ DeclContext *Sema::computeDeclContext(const CXXScopeSpec &SS, llvm_unreachable("Dependent nested-name-specifier has no DeclContext"); case NestedNameSpecifier::Namespace: - return NNS->getAsNamespace(); - - case NestedNameSpecifier::NamespaceAlias: - return NNS->getAsNamespaceAlias()->getNamespace(); + return NNS->getAsNamespace()->getNamespace(); case NestedNameSpecifier::TypeSpec: { const TagType *Tag = NNS->getAsType()->getAs(); @@ -992,7 +989,6 @@ bool Sema::ShouldEnterDeclaratorScope(Scope *S, const CXXScopeSpec &SS) { switch (Qualifier->getKind()) { case NestedNameSpecifier::Global: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: // These are always namespace scopes. We never want to enter a // namespace scope from anything but a file context. 
return CurContext->getRedeclContext()->isFileContext(); diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index c8638420aebb5..f5b4614576086 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -11930,7 +11930,7 @@ Decl *Sema::ActOnStartNamespaceDef(Scope *NamespcScope, /// getNamespaceDecl - Returns the namespace a decl represents. If the decl /// is a namespace alias, returns the namespace it points to. -static inline NamespaceDecl *getNamespaceDecl(NamedDecl *D) { +static inline NamespaceDecl *getNamespaceDecl(NamespaceBaseDecl *D) { if (NamespaceAliasDecl *AD = dyn_cast_or_null(D)) return AD->getNamespace(); return dyn_cast_or_null(D); @@ -13829,7 +13829,7 @@ Decl *Sema::ActOnNamespaceAliasDef(Scope *S, SourceLocation NamespaceLoc, } } assert(!R.isAmbiguous() && !R.empty()); - NamedDecl *ND = R.getRepresentativeDecl(); + auto *ND = cast(R.getRepresentativeDecl()); // Check if we have a previous declaration with the same name. 
LookupResult PrevR(*this, Alias, AliasLoc, LookupOrdinaryName, diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index f851c9e1d5015..fd95f4ec54229 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -518,7 +518,6 @@ bool Sema::checkLiteralOperatorId(const CXXScopeSpec &SS, case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: return false; } diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 6d6e07a2c03c7..8bde18f64f80b 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -4560,15 +4560,14 @@ static void getNestedNameSpecifierIdentifiers( II = NNS->getAsIdentifier(); break; - case NestedNameSpecifier::Namespace: - if (NNS->getAsNamespace()->isAnonymousNamespace()) + case NestedNameSpecifier::Namespace: { + const NamespaceBaseDecl *Namespace = NNS->getAsNamespace(); + if (const auto *NS = dyn_cast(Namespace); + NS && NS->isAnonymousNamespace()) return; - II = NNS->getAsNamespace()->getIdentifier(); - break; - - case NestedNameSpecifier::NamespaceAlias: - II = NNS->getAsNamespaceAlias()->getIdentifier(); + II = Namespace->getIdentifier(); break; + } case NestedNameSpecifier::TypeSpec: II = QualType(NNS->getAsType(), 0).getBaseTypeIdentifier(); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index b76619fc50268..698d1270be634 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -6299,7 +6299,6 @@ bool UnnamedLocalNoLinkageFinder::VisitNestedNameSpecifier( switch (NNS->getKind()) { case NestedNameSpecifier::Identifier: case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Global: case NestedNameSpecifier::Super: return false; diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 3e38f8b183dfd..286c2b486c0f9 100644 
--- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -4621,22 +4621,12 @@ NestedNameSpecifierLoc TreeTransform::TransformNestedNameSpecifierLoc( } case NestedNameSpecifier::Namespace: { - NamespaceDecl *NS = - cast_or_null(getDerived().TransformDecl( - Q.getLocalBeginLoc(), QNNS->getAsNamespace())); + auto *NS = cast(getDerived().TransformDecl( + Q.getLocalBeginLoc(), QNNS->getAsNamespace())); SS.Extend(SemaRef.Context, NS, Q.getLocalBeginLoc(), Q.getLocalEndLoc()); break; } - case NestedNameSpecifier::NamespaceAlias: { - NamespaceAliasDecl *Alias = - cast_or_null(getDerived().TransformDecl( - Q.getLocalBeginLoc(), QNNS->getAsNamespaceAlias())); - SS.Extend(SemaRef.Context, Alias, Q.getLocalBeginLoc(), - Q.getLocalEndLoc()); - break; - } - case NestedNameSpecifier::Global: // There is no meaningful transformation that one could perform on the // global scope. diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index d0bb7fb1d06ad..3596d2240167e 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -10107,19 +10107,12 @@ ASTRecordReader::readNestedNameSpecifierLoc() { } case NestedNameSpecifier::Namespace: { - NamespaceDecl *NS = readDeclAs(); + auto *NS = readDeclAs(); SourceRange Range = readSourceRange(); Builder.Extend(Context, NS, Range.getBegin(), Range.getEnd()); break; } - case NestedNameSpecifier::NamespaceAlias: { - NamespaceAliasDecl *Alias = readDeclAs(); - SourceRange Range = readSourceRange(); - Builder.Extend(Context, Alias, Range.getBegin(), Range.getEnd()); - break; - } - case NestedNameSpecifier::TypeSpec: { TypeSourceInfo *T = readTypeSourceInfo(); if (!T) diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index b918bfbd549c3..bd84a9741d01b 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -1889,7 +1889,7 @@ void 
ASTDeclReader::VisitNamespaceAliasDecl(NamespaceAliasDecl *D) { D->NamespaceLoc = readSourceLocation(); D->IdentLoc = readSourceLocation(); D->QualifierLoc = Record.readNestedNameSpecifierLoc(); - D->Namespace = readDeclAs(); + D->Namespace = readDeclAs(); mergeRedeclarable(D, Redecl); } diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 847283e9842e5..e868afeb1a145 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -7093,11 +7093,6 @@ void ASTRecordWriter::AddNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS) { AddSourceRange(NNS.getLocalSourceRange()); break; - case NestedNameSpecifier::NamespaceAlias: - AddDeclRef(NNS.getNestedNameSpecifier()->getAsNamespaceAlias()); - AddSourceRange(NNS.getLocalSourceRange()); - break; - case NestedNameSpecifier::TypeSpec: AddTypeRef(NNS.getTypeLoc().getType()); AddTypeLoc(NNS.getTypeLoc()); diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp index caac719caf8e8..eb9fa7a7fa1e8 100644 --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -950,7 +950,6 @@ class BuildTreeVisitor : public RecursiveASTVisitor { case NestedNameSpecifier::Global: return syntax::NodeKind::GlobalNameSpecifier; case NestedNameSpecifier::Namespace: - case NestedNameSpecifier::NamespaceAlias: case NestedNameSpecifier::Identifier: return syntax::NodeKind::IdentifierNameSpecifier; case NestedNameSpecifier::TypeSpec: { diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 9089984fa4a54..75afa87947be4 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -1438,10 +1438,6 @@ bool CursorVisitor::VisitNestedNameSpecifier(NestedNameSpecifier *NNS, return Visit( MakeCursorNamespaceRef(NNS->getAsNamespace(), Range.getBegin(), TU)); - case NestedNameSpecifier::NamespaceAlias: - return 
Visit(MakeCursorNamespaceRef(NNS->getAsNamespaceAlias(), - Range.getBegin(), TU)); - case NestedNameSpecifier::TypeSpec: { // If the type has a form where we know that the beginning of the source // range matches up with a reference cursor. Visit the appropriate reference @@ -1483,13 +1479,6 @@ bool CursorVisitor::VisitNestedNameSpecifierLoc( break; - case NestedNameSpecifier::NamespaceAlias: - if (Visit(MakeCursorNamespaceRef(NNS->getAsNamespaceAlias(), - Q.getLocalBeginLoc(), TU))) - return true; - - break; - case NestedNameSpecifier::TypeSpec: if (Visit(Q.getTypeLoc())) return true; diff --git a/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp b/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp index ddc663e2b6fd3..23a2df42ff08c 100644 --- a/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp +++ b/clang/unittests/Tooling/RecursiveASTVisitorTests/NestedNameSpecifiers.cpp @@ -25,8 +25,8 @@ class NestedNameSpecifiersVisitor : public ExpectedLocationVisitor { bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc NNS) override { if (!NNS) return true; - if (const NamespaceDecl *ND = - NNS.getNestedNameSpecifier()->getAsNamespace()) + if (const auto *ND = dyn_cast_if_present( + NNS.getNestedNameSpecifier()->getAsNamespace())) Match(ND->getName(), NNS.getLocalBeginLoc()); return ExpectedLocationVisitor::TraverseNestedNameSpecifierLoc(NNS); } diff --git a/clang/unittests/Tooling/RefactoringTest.cpp b/clang/unittests/Tooling/RefactoringTest.cpp index 254d95bc20cb0..35d114343b517 100644 --- a/clang/unittests/Tooling/RefactoringTest.cpp +++ b/clang/unittests/Tooling/RefactoringTest.cpp @@ -748,7 +748,8 @@ class NestedNameSpecifierAVisitor : public TestVisitor { public: bool TraverseNestedNameSpecifierLoc(NestedNameSpecifierLoc NNSLoc) override { if (NNSLoc.getNestedNameSpecifier()) { - if (const NamespaceDecl* NS = NNSLoc.getNestedNameSpecifier()->getAsNamespace()) { + if (const auto *NS = 
dyn_cast_if_present( + NNSLoc.getNestedNameSpecifier()->getAsNamespace())) { if (NS->getName() == "a") { Replace = Replacement(*SM, &NNSLoc, "", Context->getLangOpts()); } From 8aa4fc0a9f80475dc0efb2ce14a860f86147d52d Mon Sep 17 00:00:00 2001 From: Ryan Prichard Date: Thu, 17 Jul 2025 18:08:52 -0700 Subject: [PATCH 277/813] [libc++][Android] Reenable 2 tests for Android (#149415) Now that the Android clang has been upgraded to clang-r563880 (llvm.org/pr148998), these two tests pass again. --- .../meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp | 3 --- .../has_unique_object_representations.compile.pass.cpp | 3 --- 2 files changed, 6 deletions(-) diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp index 97e3afed1c036..8e57e8913dcbe 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp @@ -7,9 +7,6 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 -// The Clang version that Android currently uses in the CI is too old. -// UNSUPPORTED: LIBCXX-ANDROID-FIXME - // type_traits // is_bounded_array diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp index ac63fec691377..bd7da40daf2bc 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp @@ -8,9 +8,6 @@ // UNSUPPORTED: c++03, c++11, c++14 -// The Clang version that Android currently uses in the CI is too old. 
-// UNSUPPORTED: LIBCXX-ANDROID-FIXME - // type_traits // has_unique_object_representations From baa291bfb58e73a253669b86ac604cf8e6792b6c Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Fri, 18 Jul 2025 09:28:29 +0800 Subject: [PATCH 278/813] [mlir][mesh] Add null check for dyn_cast to prevent crash (#149266) This PR adds a null check for dyn_cast result before use to prevent crash, and use `isa` instead `dyn_cast` to make code clean. Fixes #148619. --- .../mlir/Dialect/Mesh/Transforms/Simplifications.h | 8 ++++---- mlir/test/Dialect/Mesh/simplifications.mlir | 12 ++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h b/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h index 3f1041cb25103..243dbf081b999 100644 --- a/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h +++ b/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h @@ -62,9 +62,11 @@ void populateAllReduceEndomorphismSimplificationPatterns( auto isEndomorphismOp = [reduction](Operation *op, std::optional referenceOp) { auto allReduceOp = llvm::dyn_cast(op); + if (!allReduceOp) + return false; auto inType = cast(allReduceOp.getInput().getType()); auto outType = cast(allReduceOp.getResult().getType()); - if (!allReduceOp || inType.getElementType() != outType.getElementType() || + if (inType.getElementType() != outType.getElementType() || allReduceOp.getReduction() != reduction) { return false; } @@ -87,9 +89,7 @@ void populateAllReduceEndomorphismSimplificationPatterns( return refAllReduceOp->getAttrs() == allReduceOp->getAttrs() && inType.getElementType() == refType.getElementType(); }; - auto isAlgebraicOp = [](Operation *op) { - return static_cast(llvm::dyn_cast(op)); - }; + auto isAlgebraicOp = [](Operation *op) { return isa(op); }; using ConcreteEndomorphismSimplification = EndomorphismSimplification< std::decay_t, diff --git a/mlir/test/Dialect/Mesh/simplifications.mlir 
b/mlir/test/Dialect/Mesh/simplifications.mlir index 2540fbf9510c4..e955f4c134259 100644 --- a/mlir/test/Dialect/Mesh/simplifications.mlir +++ b/mlir/test/Dialect/Mesh/simplifications.mlir @@ -165,3 +165,15 @@ func.func @all_reduce_arith_minsi_endomorphism( // CHECK: return %[[ALL_REDUCE_RES]] return %2 : tensor<5xi32> } + +// Ensure this case without endomorphism op not crash. +// CHECK-LABEL: func.func @no_endomorphism_op +func.func @no_endomorphism_op(%arg0: tensor<2xi64>) -> i64 { + %c0 = arith.constant 0 : index + %c1_i64 = arith.constant 1 : i64 + // CHECK: tensor.extract + %extracted = tensor.extract %arg0[%c0] : tensor<2xi64> + // CHECK: arith.maxsi + %0 = arith.maxsi %extracted, %c1_i64 : i64 + return %0 : i64 +} From c27e283cfbca2bd22f34592430e98ee76ed60ad8 Mon Sep 17 00:00:00 2001 From: YexuanXiao Date: Fri, 18 Jul 2025 09:45:57 +0800 Subject: [PATCH 279/813] [Clang] Make the SizeType, SignedSizeType and PtrdiffType be named sugar types instead of built-in types (#143653) Including the results of `sizeof`, `sizeof...`, `__datasizeof`, `__alignof`, `_Alignof`, `alignof`, `_Countof`, `size_t` literals, and signed `size_t` literals, the results of pointer-pointer subtraction and checks for standard library functions (and their calls). The goal is to enable clang and downstream tools such as clangd and clang-tidy to provide more portable hints and diagnostics. The previous discussion can be found at #136542. This PR implements this feature by introducing a new subtype of `Type` called `PredefinedSugarType`, which was considered appropriate in discussions. I tried to keep `PredefinedSugarType` simple enough yet not limited to `size_t` and `ptrdiff_t` so that it can be used for other purposes. `PredefinedSugarType` wraps a canonical `Type` and provides a name, conceptually similar to a compiler internal `TypedefType` but without depending on a `TypedefDecl` or a source file. 
Additionally, checks for the `z` and `t` format specifiers in format strings for `scanf` and `printf` were added. It will precisely match expressions using `typedef`s or built-in expressions. The affected tests indicates that it works very well. Several code require that `SizeType` is canonical, so I kept `SizeType` to its canonical form. The failed tests in CI are allowed to fail. See the [comment](https://github.com/llvm/llvm-project/pull/135386#issuecomment-3049426611) in another PR #135386. --- .../clangd/unittests/FindTargetTests.cpp | 2 +- .../clangd/unittests/HoverTests.cpp | 4 +- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/AST/ASTContext.h | 13 +- clang/include/clang/AST/FormatString.h | 3 +- clang/include/clang/AST/RecursiveASTVisitor.h | 4 + clang/include/clang/AST/Type.h | 56 ++++ clang/include/clang/AST/TypeLoc.h | 10 + clang/include/clang/AST/TypeProperties.td | 9 + clang/include/clang/Basic/TypeNodes.td | 1 + .../clang/Serialization/TypeBitCodes.def | 1 + clang/lib/AST/ASTContext.cpp | 76 +++-- clang/lib/AST/ASTImporter.cpp | 5 + clang/lib/AST/ASTStructuralEquivalence.cpp | 7 + clang/lib/AST/FormatString.cpp | 126 +++++++-- clang/lib/AST/ItaniumMangle.cpp | 4 + clang/lib/AST/PrintfFormatString.cpp | 11 +- clang/lib/AST/ScanfFormatString.cpp | 21 +- clang/lib/AST/Type.cpp | 12 + clang/lib/AST/TypePrinter.cpp | 10 + clang/lib/CodeGen/CGCall.cpp | 2 +- clang/lib/CodeGen/CGCoroutine.cpp | 8 +- clang/lib/CodeGen/CGDebugInfo.cpp | 3 +- clang/lib/CodeGen/CGObjCMac.cpp | 2 +- clang/lib/CodeGen/CodeGenFunction.cpp | 3 +- clang/lib/Sema/SemaChecking.cpp | 4 +- clang/lib/Sema/SemaExpr.cpp | 3 + clang/lib/Sema/SemaExprCXX.cpp | 10 +- clang/lib/Sema/TreeTransform.h | 6 + clang/lib/Serialization/ASTReader.cpp | 5 + clang/lib/Serialization/ASTWriter.cpp | 6 +- .../StaticAnalyzer/Checkers/MallocChecker.cpp | 25 +- .../Checkers/StdLibraryFunctionsChecker.cpp | 80 +++--- .../Checkers/VLASizeChecker.cpp | 2 +- ...d_resource_element_compatible_concept.hlsl | 2 
+- clang/test/AST/ast-dump-array.cpp | 2 +- clang/test/AST/ast-dump-expr-json.c | 9 +- clang/test/AST/ast-dump-expr-json.cpp | 24 +- clang/test/AST/ast-dump-expr.c | 6 +- clang/test/AST/ast-dump-expr.cpp | 16 +- ...dump-openmp-distribute-parallel-for-simd.c | 20 +- .../ast-dump-openmp-distribute-parallel-for.c | 20 +- ...arget-teams-distribute-parallel-for-simd.c | 160 +++++------ ...nmp-target-teams-distribute-parallel-for.c | 160 +++++------ ...penmp-teams-distribute-parallel-for-simd.c | 160 +++++------ ...ump-openmp-teams-distribute-parallel-for.c | 160 +++++------ clang/test/AST/ast-dump-stmt-json.cpp | 71 +++-- clang/test/AST/ast-dump-stmt.cpp | 4 +- clang/test/AST/ast-dump-traits.cpp | 8 +- clang/test/AST/ast-dump-types-errors-json.cpp | 3 +- clang/test/Analysis/cfg.cpp | 2 +- clang/test/Analysis/explain-svals.cpp | 2 +- .../std-c-library-functions-arg-weakdeps.c | 2 +- .../Analysis/std-c-library-functions-lookup.c | 2 +- ...td-c-library-functions-vs-stream-checker.c | 4 +- clang/test/Analysis/std-c-library-functions.c | 4 +- clang/test/CXX/drs/cwg2xx.cpp | 2 +- clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp | 10 +- clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp | 6 +- clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp | 2 +- .../test/FixIt/fixit-format-ios-nopedantic.m | 2 +- clang/test/FixIt/format.m | 6 +- .../test/Sema/format-strings-fixit-ssize_t.c | 2 +- clang/test/Sema/format-strings-int-typedefs.c | 12 +- clang/test/Sema/format-strings-scanf.c | 8 +- clang/test/Sema/format-strings-size_t.c | 13 +- clang/test/Sema/matrix-type-builtins.c | 8 +- clang/test/Sema/ptrauth-atomic-ops.c | 2 +- clang/test/Sema/ptrauth.c | 2 +- .../SemaCXX/cxx2c-trivially-relocatable.cpp | 2 +- clang/test/SemaCXX/enum-scoped.cpp | 4 +- .../SemaCXX/microsoft-varargs-diagnostics.cpp | 6 +- clang/test/SemaCXX/new-delete.cpp | 2 +- clang/test/SemaCXX/static-assert-cxx26.cpp | 14 +- ...are-new-delete-basic-free-declarations.cpp | 2 +- .../unavailable_aligned_allocation.cpp | 24 +- 
clang/test/SemaHLSL/Language/AssignArray.hlsl | 4 +- clang/test/SemaHLSL/Language/InitListAST.hlsl | 264 +++++++++--------- .../SemaObjC/format-size-spec-nsinteger.m | 17 +- clang/test/SemaObjC/matrix-type-builtins.m | 2 +- .../SemaOpenCL/cl20-device-side-enqueue.cl | 6 +- clang/test/SemaTemplate/type_pack_element.cpp | 12 +- clang/tools/libclang/CIndex.cpp | 4 + .../deque/spare_block_handling.pass.cpp | 8 +- .../TypeSystem/Clang/TypeSystemClang.cpp | 4 + 85 files changed, 1070 insertions(+), 756 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp index 602f61d9ecb41..4d77f9d690ca0 100644 --- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp +++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp @@ -838,7 +838,7 @@ TEST_F(TargetDeclTest, OverloadExpr) { )cpp"; // Sized deallocation is enabled by default in C++14 onwards. EXPECT_DECLS("CXXDeleteExpr", - "void operator delete(void *, unsigned long) noexcept"); + "void operator delete(void *, __size_t) noexcept"); } TEST_F(TargetDeclTest, DependentExprs) { diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 775278ccf694b..4a21dafed5e95 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -2794,7 +2794,7 @@ TEST(Hover, All) { })cpp", [](HoverInfo &HI) { HI.Name = "expression"; - HI.Type = "unsigned long"; + HI.Type = {"__size_t", "unsigned long"}; HI.Value = "1"; }}, { @@ -2804,7 +2804,7 @@ TEST(Hover, All) { })cpp", [](HoverInfo &HI) { HI.Name = "expression"; - HI.Type = "unsigned long"; + HI.Type = {"__size_t", "unsigned long"}; HI.Value = "1"; }}, { diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index fcd3887ec7a09..288f2fb9d81ca 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -46,6 +46,7 @@ Potentially Breaking Changes 
``endbr64`` instruction at the labels named as possible branch destinations, so it is not safe to use a register-controlled branch instruction to branch to one. (In line with gcc.) +- Added a sugar type `PredefinedSugarType` to improve diagnostic messages. (#GH143653) C/C++ Language Potentially Breaking Changes ------------------------------------------- diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 66ec3395571ea..5b456794e7b13 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -277,6 +277,11 @@ class ASTContext : public RefCountedBase { mutable llvm::ContextualFoldingSet ArrayParameterTypes; + /// Store the unique Type corresponding to each Kind. + mutable std::array + PredefinedSugarTypes{}; + /// The set of nested name specifiers. /// /// This set is managed by the NestedNameSpecifier class. @@ -1567,6 +1572,8 @@ class ASTContext : public RefCountedBase { /// and bit count. QualType getDependentBitIntType(bool Unsigned, Expr *BitsExpr) const; + QualType getPredefinedSugarType(PredefinedSugarType::Kind KD) const; + /// Gets the struct used to keep track of the extended descriptor for /// pointer to blocks. QualType getBlockDescriptorExtendedType() const; @@ -1999,11 +2006,13 @@ class ASTContext : public RefCountedBase { /// . /// /// The sizeof operator requires this (C99 6.5.3.4p4). - CanQualType getSizeType() const; + QualType getSizeType() const; + + CanQualType getCanonicalSizeType() const; /// Return the unique signed counterpart of /// the integer type corresponding to size_t. - CanQualType getSignedSizeType() const; + QualType getSignedSizeType() const; /// Return the unique type for "intmax_t" (C99 7.18.1.5), defined in /// . 
diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h index 3560766433fe2..a284f2c44d633 100644 --- a/clang/include/clang/AST/FormatString.h +++ b/clang/include/clang/AST/FormatString.h @@ -489,7 +489,8 @@ class FormatSpecifier { /// For a TypedefType QT, if it is a named integer type such as size_t, /// assign the appropriate value to LM and return true. - static bool namedTypeToLengthModifier(QualType QT, LengthModifier &LM); + static bool namedTypeToLengthModifier(ASTContext &Ctx, QualType QT, + LengthModifier &LM); }; } // end analyze_format_string namespace diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 519a811775c01..62991d986e675 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -1208,6 +1208,8 @@ DEF_TRAVERSE_TYPE(BitIntType, {}) DEF_TRAVERSE_TYPE(DependentBitIntType, { TRY_TO(TraverseStmt(T->getNumBitsExpr())); }) +DEF_TRAVERSE_TYPE(PredefinedSugarType, {}) + #undef DEF_TRAVERSE_TYPE // ----------------- TypeLoc traversal ----------------- @@ -1524,6 +1526,8 @@ DEF_TRAVERSE_TYPELOC(DependentBitIntType, { TRY_TO(TraverseStmt(TL.getTypePtr()->getNumBitsExpr())); }) +DEF_TRAVERSE_TYPELOC(PredefinedSugarType, {}) + #undef DEF_TRAVERSE_TYPELOC // ----------------- Decl traversal ----------------- diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 21b97102db95a..764e9d508a25a 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2258,6 +2258,30 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned NumExpansions; }; + enum class PredefinedSugarKind { + /// The "size_t" type. + SizeT, + + /// The signed integer type corresponding to "size_t". + SignedSizeT, + + /// The "ptrdiff_t" type. + PtrdiffT, + + // Indicates how many items the enum has. 
+ Last = PtrdiffT + }; + + class PresefinedSugarTypeBitfields { + friend class PredefinedSugarType; + + LLVM_PREFERRED_TYPE(TypeBitfields) + unsigned : NumTypeBits; + + LLVM_PREFERRED_TYPE(PredefinedSugarKind) + unsigned Kind : 8; + }; + class CountAttributedTypeBitfields { friend class CountAttributedType; @@ -2297,6 +2321,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { DependentTemplateSpecializationTypeBits; PackExpansionTypeBitfields PackExpansionTypeBits; CountAttributedTypeBitfields CountAttributedTypeBits; + PresefinedSugarTypeBitfields PredefinedSugarTypeBits; }; private: @@ -8038,6 +8063,37 @@ class DependentBitIntType final : public Type, public llvm::FoldingSetNode { } }; +class PredefinedSugarType final : public Type { +public: + friend class ASTContext; + using Kind = PredefinedSugarKind; + +private: + PredefinedSugarType(Kind KD, const IdentifierInfo *IdentName, + QualType CanonicalType) + : Type(PredefinedSugar, CanonicalType, TypeDependence::None), + Name(IdentName) { + PredefinedSugarTypeBits.Kind = llvm::to_underlying(KD); + } + + static StringRef getName(Kind KD); + + const IdentifierInfo *Name; + +public: + bool isSugared() const { return true; } + + QualType desugar() const { return getCanonicalTypeInternal(); } + + Kind getKind() const { return Kind(PredefinedSugarTypeBits.Kind); } + + const IdentifierInfo *getIdentifier() const { return Name; } + + static bool classof(const Type *T) { + return T->getTypeClass() == PredefinedSugar; + } +}; + /// A qualifier set is used to build a set of qualifiers. class QualifierCollector : public Qualifiers { public: diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index cf06e27758996..be0bc896de3ea 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -2783,6 +2783,16 @@ class ObjCProtocolLoc { } }; +struct PredefinedSugarTypeLocInfo {}; // Nothing. 
+ +class PredefinedSugarTypeLoc final + : public ConcreteTypeLoc { +public: + void initializeLocal(ASTContext &Context, SourceLocation loc) {} + SourceRange getLocalSourceRange() const { return {}; } +}; + } // namespace clang #endif // LLVM_CLANG_AST_TYPELOC_H diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index a6157649060b1..3114d1180319a 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -1028,3 +1028,12 @@ let Class = DependentBitIntType in { return ctx.getDependentBitIntType(isUnsigned, numBitsExpr); }]>; } + +let Class = PredefinedSugarType in { + def : Property<"kind", UInt32> { + let Read = [{ static_cast(node->getKind()) }]; + } + def : Creator<[{ + return ctx.getPredefinedSugarType(static_cast(kind)); + }]>; +} diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td index 567b8a5ca5a4d..971ce541d4831 100644 --- a/clang/include/clang/Basic/TypeNodes.td +++ b/clang/include/clang/Basic/TypeNodes.td @@ -117,3 +117,4 @@ def PipeType : TypeNode; def AtomicType : TypeNode; def BitIntType : TypeNode; def DependentBitIntType : TypeNode, AlwaysDependent; +def PredefinedSugarType : TypeNode, NeverCanonical; diff --git a/clang/include/clang/Serialization/TypeBitCodes.def b/clang/include/clang/Serialization/TypeBitCodes.def index b8cde2e370960..613eb6af2005a 100644 --- a/clang/include/clang/Serialization/TypeBitCodes.def +++ b/clang/include/clang/Serialization/TypeBitCodes.def @@ -69,5 +69,6 @@ TYPE_BIT_CODE(CountAttributed, COUNT_ATTRIBUTED, 57) TYPE_BIT_CODE(ArrayParameter, ARRAY_PARAMETER, 58) TYPE_BIT_CODE(HLSLAttributedResource, HLSLRESOURCE_ATTRIBUTED, 59) TYPE_BIT_CODE(HLSLInlineSpirv, HLSL_INLINE_SPIRV, 60) +TYPE_BIT_CODE(PredefinedSugar, PREDEFINED_SUGAR, 61) #undef TYPE_BIT_CODE diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 232a4b6557b92..6b6275faa215a 100644 --- 
a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2597,6 +2597,9 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { } break; + case Type::PredefinedSugar: + return getTypeInfo(cast(T)->desugar().getTypePtr()); + case Type::Pipe: Width = Target->getPointerWidth(LangAS::opencl_global); Align = Target->getPointerAlign(LangAS::opencl_global); @@ -5216,6 +5219,39 @@ QualType ASTContext::getDependentBitIntType(bool IsUnsigned, return QualType(New, 0); } +QualType +ASTContext::getPredefinedSugarType(PredefinedSugarType::Kind KD) const { + using Kind = PredefinedSugarType::Kind; + + if (auto *Target = PredefinedSugarTypes[llvm::to_underlying(KD)]; + Target != nullptr) + return QualType(Target, 0); + + auto getCanonicalType = [](const ASTContext &Ctx, Kind KDI) -> QualType { + switch (KDI) { + // size_t (C99TC3 6.5.3.4), signed size_t (C++23 5.13.2) and + // ptrdiff_t (C99TC3 6.5.6) Although these types are not built-in, they + // are part of the core language and are widely used. Using + // PredefinedSugarType makes these types as named sugar types rather than + // standard integer types, enabling better hints and diagnostics. 
+ case Kind::SizeT: + return Ctx.getFromTargetType(Ctx.Target->getSizeType()); + case Kind::SignedSizeT: + return Ctx.getFromTargetType(Ctx.Target->getSignedSizeType()); + case Kind::PtrdiffT: + return Ctx.getFromTargetType(Ctx.Target->getPtrDiffType(LangAS::Default)); + } + llvm_unreachable("unexpected kind"); + }; + + auto *New = new (*this, alignof(PredefinedSugarType)) + PredefinedSugarType(KD, &Idents.get(PredefinedSugarType::getName(KD)), + getCanonicalType(*this, static_cast(KD))); + Types.push_back(New); + PredefinedSugarTypes[llvm::to_underlying(KD)] = New; + return QualType(New, 0); +} + #ifndef NDEBUG static bool NeedsInjectedClassNameType(const RecordDecl *D) { if (!isa(D)) return false; @@ -6796,14 +6832,31 @@ QualType ASTContext::getTagDeclType(const TagDecl *Decl) const { /// getSizeType - Return the unique type for "size_t" (C99 7.17), the result /// of the sizeof operator (C99 6.5.3.4p4). The value is target dependent and /// needs to agree with the definition in . -CanQualType ASTContext::getSizeType() const { +QualType ASTContext::getSizeType() const { + return getPredefinedSugarType(PredefinedSugarType::Kind::SizeT); +} + +CanQualType ASTContext::getCanonicalSizeType() const { return getFromTargetType(Target->getSizeType()); } /// Return the unique signed counterpart of the integer type /// corresponding to size_t. -CanQualType ASTContext::getSignedSizeType() const { - return getFromTargetType(Target->getSignedSizeType()); +QualType ASTContext::getSignedSizeType() const { + return getPredefinedSugarType(PredefinedSugarType::Kind::SignedSizeT); +} + +/// getPointerDiffType - Return the unique type for "ptrdiff_t" (C99 7.17) +/// defined in . Pointer - pointer requires this (C99 6.5.6p9). +QualType ASTContext::getPointerDiffType() const { + return getPredefinedSugarType(PredefinedSugarType::Kind::PtrdiffT); +} + +/// Return the unique unsigned counterpart of "ptrdiff_t" +/// integer type. 
The standard (C11 7.21.6.1p7) refers to this type +/// in the definition of %tu format specifier. +QualType ASTContext::getUnsignedPointerDiffType() const { + return getFromTargetType(Target->getUnsignedPtrDiffType(LangAS::Default)); } /// getIntMaxType - Return the unique type for "intmax_t" (C99 7.18.1.5). @@ -6838,19 +6891,6 @@ QualType ASTContext::getUIntPtrType() const { return getCorrespondingUnsignedType(getIntPtrType()); } -/// getPointerDiffType - Return the unique type for "ptrdiff_t" (C99 7.17) -/// defined in . Pointer - pointer requires this (C99 6.5.6p9). -QualType ASTContext::getPointerDiffType() const { - return getFromTargetType(Target->getPtrDiffType(LangAS::Default)); -} - -/// Return the unique unsigned counterpart of "ptrdiff_t" -/// integer type. The standard (C11 7.21.6.1p7) refers to this type -/// in the definition of %tu format specifier. -QualType ASTContext::getUnsignedPointerDiffType() const { - return getFromTargetType(Target->getUnsignedPtrDiffType(LangAS::Default)); -} - /// Return the unique type for "pid_t" defined in /// . We need this to compute the correct type for vfork(). 
QualType ASTContext::getProcessIDType() const { @@ -14503,6 +14543,10 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X, DX->isCountInBytes(), DX->isOrNull(), CDX); } + case Type::PredefinedSugar: + assert(cast(X)->getKind() != + cast(Y)->getKind()); + return QualType(); } llvm_unreachable("Unhandled Type Class"); } diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index b5f6c5a8c6abe..b9bdabe0b8c06 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -2080,6 +2080,11 @@ ExpectedType clang::ASTNodeImporter::VisitDependentBitIntType( *ToNumBitsExprOrErr); } +ExpectedType clang::ASTNodeImporter::VisitPredefinedSugarType( + const clang::PredefinedSugarType *T) { + return Importer.getToContext().getPredefinedSugarType(T->getKind()); +} + ExpectedType clang::ASTNodeImporter::VisitDependentSizedMatrixType( const clang::DependentSizedMatrixType *T) { Error Err = Error::success(); diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 289c6d7737de7..0f2762d5c0f14 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -1477,6 +1477,13 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, return false; break; } + case Type::PredefinedSugar: { + const auto *TP1 = cast(T1); + const auto *TP2 = cast(T2); + if (TP1->getKind() != TP2->getKind()) + return false; + break; + } } // end switch return true; diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp index 5d3b56fc4e713..0bb737fa6a8af 100644 --- a/clang/lib/AST/FormatString.cpp +++ b/clang/lib/AST/FormatString.cpp @@ -320,6 +320,86 @@ bool clang::analyze_format_string::ParseUTF8InvalidSpecifier( // Methods on ArgType. 
//===----------------------------------------------------------------------===// +static bool namedTypeToLengthModifierKind(ASTContext &Ctx, QualType QT, + LengthModifier::Kind &K) { + if (!Ctx.getLangOpts().C99 && !Ctx.getLangOpts().CPlusPlus) + return false; + for (/**/; const auto *TT = QT->getAs(); QT = TT->desugar()) { + const auto *TD = TT->getDecl(); + const auto *DC = TT->getDecl()->getDeclContext(); + if (DC->isTranslationUnit() || DC->isStdNamespace()) { + StringRef Name = TD->getIdentifier()->getName(); + if (Name == "size_t") { + K = LengthModifier::AsSizeT; + return true; + } else if (Name == "ssize_t" /*Not C99, but common in Unix.*/) { + K = LengthModifier::AsSizeT; + return true; + } else if (Name == "ptrdiff_t") { + K = LengthModifier::AsPtrDiff; + return true; + } else if (Name == "intmax_t") { + K = LengthModifier::AsIntMax; + return true; + } else if (Name == "uintmax_t") { + K = LengthModifier::AsIntMax; + return true; + } + } + } + if (const auto *PST = QT->getAs()) { + using Kind = PredefinedSugarType::Kind; + switch (PST->getKind()) { + case Kind::SizeT: + case Kind::SignedSizeT: + K = LengthModifier::AsSizeT; + return true; + case Kind::PtrdiffT: + K = LengthModifier::AsPtrDiff; + return true; + } + llvm_unreachable("unexpected kind"); + } + return false; +} + +// Check whether T and E are compatible size_t/ptrdiff_t typedefs. E must be +// consistent with LE. +// T is the type of the actual expression in the code to be checked, and E is +// the expected type parsed from the format string. 
+static clang::analyze_format_string::ArgType::MatchKind +matchesSizeTPtrdiffT(ASTContext &C, QualType T, QualType E, + LengthModifier::Kind LE) { + using Kind = LengthModifier::Kind; + using MatchKind = clang::analyze_format_string::ArgType::MatchKind; + assert(LE == Kind::AsPtrDiff || LE == Kind::AsSizeT); + + if (!T->isIntegerType()) + return MatchKind::NoMatch; + + if (C.getCorrespondingSignedType(T.getCanonicalType()) != + C.getCorrespondingSignedType(E.getCanonicalType())) + return MatchKind::NoMatch; + + // signed size_t and unsigned ptrdiff_t do not have typedefs in C and C++. + if (LE == Kind::AsSizeT && E->isSignedIntegerType()) + return T->isSignedIntegerType() ? MatchKind::Match + : MatchKind::NoMatchSignedness; + + if (LE == LengthModifier::Kind::AsPtrDiff && E->isUnsignedIntegerType()) + return T->isUnsignedIntegerType() ? MatchKind::Match + : MatchKind::NoMatchSignedness; + + if (Kind Actual = Kind::None; namedTypeToLengthModifierKind(C, T, Actual)) { + if (Actual == LE) + return MatchKind::Match; + else if (Actual == Kind::AsPtrDiff || Actual == Kind::AsSizeT) + return MatchKind::NoMatchSignedness; + } + + return MatchKind::NoMatch; +} + clang::analyze_format_string::ArgType::MatchKind ArgType::matchesType(ASTContext &C, QualType argTy) const { // When using the format attribute in C++, you can receive a function or an @@ -394,6 +474,13 @@ ArgType::matchesType(ASTContext &C, QualType argTy) const { } case SpecificTy: { + if (TK != TypeKind::DontCare) { + return matchesSizeTPtrdiffT(C, argTy, T, + TK == TypeKind::SizeT + ? LengthModifier::Kind::AsSizeT + : LengthModifier::AsPtrDiff); + } + if (const EnumType *ETy = argTy->getAs<EnumType>()) { // If the enum is incomplete we know nothing about the underlying type. // Assume that it's 'int'. 
Do not use the underlying type for a scoped @@ -653,6 +740,18 @@ ArgType::matchesArgType(ASTContext &C, const ArgType &Other) const { if (Left.K == AK::SpecificTy) { if (Right.K == AK::SpecificTy) { + if (Left.TK != TypeKind::DontCare) { + return matchesSizeTPtrdiffT(C, Right.T, Left.T, + Left.TK == TypeKind::SizeT + ? LengthModifier::Kind::AsSizeT + : LengthModifier::AsPtrDiff); + } else if (Right.TK != TypeKind::DontCare) { + return matchesSizeTPtrdiffT(C, Left.T, Right.T, + Right.TK == TypeKind::SizeT + ? LengthModifier::Kind::AsSizeT + : LengthModifier::AsPtrDiff); + } + auto Canon1 = C.getCanonicalType(Left.T); auto Canon2 = C.getCanonicalType(Right.T); if (Canon1 == Canon2) @@ -1198,29 +1297,12 @@ FormatSpecifier::getCorrectedLengthModifier() const { return std::nullopt; } -bool FormatSpecifier::namedTypeToLengthModifier(QualType QT, +bool FormatSpecifier::namedTypeToLengthModifier(ASTContext &Ctx, QualType QT, LengthModifier &LM) { - for (/**/; const auto *TT = QT->getAs(); - QT = TT->getDecl()->getUnderlyingType()) { - const TypedefNameDecl *Typedef = TT->getDecl(); - const IdentifierInfo *Identifier = Typedef->getIdentifier(); - if (Identifier->getName() == "size_t") { - LM.setKind(LengthModifier::AsSizeT); - return true; - } else if (Identifier->getName() == "ssize_t") { - // Not C99, but common in Unix. 
- LM.setKind(LengthModifier::AsSizeT); - return true; - } else if (Identifier->getName() == "intmax_t") { - LM.setKind(LengthModifier::AsIntMax); - return true; - } else if (Identifier->getName() == "uintmax_t") { - LM.setKind(LengthModifier::AsIntMax); - return true; - } else if (Identifier->getName() == "ptrdiff_t") { - LM.setKind(LengthModifier::AsPtrDiff); - return true; - } + if (LengthModifier::Kind Out = LengthModifier::Kind::None; + namedTypeToLengthModifierKind(Ctx, QT, Out)) { + LM.setKind(Out); + return true; } return false; } diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 6d082b31a9caa..2a667934dba42 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -2514,6 +2514,10 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty, mangleSourceNameWithAbiTags(cast(Ty)->getDecl()); break; + case Type::PredefinedSugar: + mangleType(cast(Ty)->desugar()); + break; + case Type::UnresolvedUsing: mangleSourceNameWithAbiTags( cast(Ty)->getDecl()); diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp index 293164ddac8f8..bcd44f0a85eed 100644 --- a/clang/lib/AST/PrintfFormatString.cpp +++ b/clang/lib/AST/PrintfFormatString.cpp @@ -543,7 +543,8 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx, case LengthModifier::AsIntMax: return ArgType(Ctx.getIntMaxType(), "intmax_t"); case LengthModifier::AsSizeT: - return ArgType::makeSizeT(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t")); case LengthModifier::AsInt3264: return Ctx.getTargetInfo().getTriple().isArch64Bit() ? 
ArgType(Ctx.LongLongTy, "__int64") @@ -626,9 +627,11 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx, case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::PtrTo(ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t"))); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); case LengthModifier::AsLongDouble: return ArgType(); // FIXME: Is this a known extension? case LengthModifier::AsAllocate: @@ -917,7 +920,7 @@ bool PrintfSpecifier::fixType(QualType QT, const LangOptions &LangOpt, // Handle size_t, ptrdiff_t, etc. that have dedicated length modifiers in C99. if (LangOpt.C99 || LangOpt.CPlusPlus11) - namedTypeToLengthModifier(QT, LM); + namedTypeToLengthModifier(Ctx, QT, LM); // If fixing the length modifier was enough, we might be done. if (hasValidLengthModifier(Ctx.getTargetInfo(), LangOpt)) { diff --git a/clang/lib/AST/ScanfFormatString.cpp b/clang/lib/AST/ScanfFormatString.cpp index 7ee21c8c61954..1227edd47d13d 100644 --- a/clang/lib/AST/ScanfFormatString.cpp +++ b/clang/lib/AST/ScanfFormatString.cpp @@ -251,9 +251,11 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::PtrTo(ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t"))); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); case LengthModifier::AsLongDouble: // GNU extension. 
return ArgType::PtrTo(Ctx.LongLongTy); @@ -292,10 +294,11 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getUIntMaxType(), "uintmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSizeType(), "size_t")); - case LengthModifier::AsPtrDiff: return ArgType::PtrTo( - ArgType(Ctx.getUnsignedPointerDiffType(), "unsigned ptrdiff_t")); + ArgType::makeSizeT(ArgType(Ctx.getSizeType(), "size_t"))); + case LengthModifier::AsPtrDiff: + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getUnsignedPointerDiffType(), "unsigned ptrdiff_t"))); case LengthModifier::AsLongDouble: // GNU extension. return ArgType::PtrTo(Ctx.UnsignedLongLongTy); @@ -390,9 +393,11 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::PtrTo(ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t"))); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); case LengthModifier::AsLongDouble: return ArgType(); // FIXME: Is this a known extension? case LengthModifier::AsAllocate: @@ -501,7 +506,7 @@ bool ScanfSpecifier::fixType(QualType QT, QualType RawQT, // Handle size_t, ptrdiff_t, etc. that have dedicated length modifiers in C99. if (LangOpt.C99 || LangOpt.CPlusPlus11) - namedTypeToLengthModifier(PT, LM); + namedTypeToLengthModifier(Ctx, PT, LM); // If fixing the length modifier was enough, we are done. 
if (hasValidLengthModifier(Ctx.getTargetInfo(), LangOpt)) { diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index e5a1ab2ff8906..7444a2f90c5dd 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -5613,3 +5613,15 @@ HLSLAttributedResourceType::findHandleTypeOnResource(const Type *RT) { } return nullptr; } + +StringRef PredefinedSugarType::getName(Kind KD) { + switch (KD) { + case Kind::SizeT: + return "__size_t"; + case Kind::SignedSizeT: + return "__signed_size_t"; + case Kind::PtrdiffT: + return "__ptrdiff_t"; + } + llvm_unreachable("unexpected kind"); +} diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 818d2139628e3..deb453fe6ee75 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -248,6 +248,7 @@ bool TypePrinter::canPrefixQualifiers(const Type *T, case Type::BTFTagAttributed: case Type::HLSLAttributedResource: case Type::HLSLInlineSpirv: + case Type::PredefinedSugar: CanPrefixQualifiers = true; break; @@ -1417,6 +1418,15 @@ void TypePrinter::printDependentBitIntBefore(const DependentBitIntType *T, void TypePrinter::printDependentBitIntAfter(const DependentBitIntType *T, raw_ostream &OS) {} +void TypePrinter::printPredefinedSugarBefore(const PredefinedSugarType *T, + raw_ostream &OS) { + OS << T->getIdentifier()->getName(); + spaceBeforePlaceHolder(OS); +} + +void TypePrinter::printPredefinedSugarAfter(const PredefinedSugarType *T, + raw_ostream &OS) {} + /// Appends the given scope to the end of a string. 
void TypePrinter::AppendScope(DeclContext *DC, raw_ostream &OS, DeclarationName NameInScope) { diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index c8c3d6b20c496..668c91798bf54 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -214,7 +214,7 @@ static void appendParameterTypes( for (unsigned I = 0, E = FPT->getNumParams(); I != E; ++I) { prefix.push_back(FPT->getParamType(I)); if (ExtInfos[I].hasPassObjectSize()) - prefix.push_back(CGT.getContext().getSizeType()); + prefix.push_back(CGT.getContext().getCanonicalSizeType()); } addExtParameterInfosForCall(paramInfos, FPT.getTypePtr(), PrefixSize, diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index 117ef3d16e21b..5ee908922b5a3 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -1006,15 +1006,15 @@ RValue CodeGenFunction::EmitCoroutineIntrinsic(const CallExpr *E, } case llvm::Intrinsic::coro_size: { auto &Context = getContext(); - CanQualType SizeTy = Context.getSizeType(); - llvm::IntegerType *T = Builder.getIntNTy(Context.getTypeSize(SizeTy)); + llvm::IntegerType *T = + Builder.getIntNTy(Context.getTypeSize(Context.getSizeType())); llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::coro_size, T); return RValue::get(Builder.CreateCall(F)); } case llvm::Intrinsic::coro_align: { auto &Context = getContext(); - CanQualType SizeTy = Context.getSizeType(); - llvm::IntegerType *T = Builder.getIntNTy(Context.getTypeSize(SizeTy)); + llvm::IntegerType *T = + Builder.getIntNTy(Context.getTypeSize(Context.getSizeType())); llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::coro_align, T); return RValue::get(Builder.CreateCall(F)); } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 446cf8d9e05c6..96da253ce1471 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -4052,7 +4052,8 @@ llvm::DIType 
*CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile *Unit) { return CreateType(cast(Ty), Unit); case Type::HLSLInlineSpirv: return CreateType(cast(Ty), Unit); - + case Type::PredefinedSugar: + return getOrCreateType(cast(Ty)->desugar(), Unit); case Type::CountAttributed: case Type::Auto: case Type::Attributed: diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 8e71a576552d3..8c66176942cb5 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -285,7 +285,7 @@ class ObjCCommonTypesHelper { SmallVector Params; Params.push_back(Ctx.VoidPtrTy); Params.push_back(Ctx.VoidPtrTy); - Params.push_back(Ctx.getSizeType()); + Params.push_back(Ctx.getCanonicalSizeType()); Params.push_back(Ctx.BoolTy); Params.push_back(Ctx.BoolTy); llvm::FunctionType *FTy = Types.GetFunctionType( diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 0fda31c8e5fa1..ab345a598c4e8 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -720,7 +720,7 @@ static bool matchesStlAllocatorFn(const Decl *D, const ASTContext &Ctx) { (MD->getNumParams() != 1 && MD->getNumParams() != 2)) return false; - if (MD->parameters()[0]->getType().getCanonicalType() != Ctx.getSizeType()) + if (!Ctx.hasSameType(MD->parameters()[0]->getType(), Ctx.getSizeType())) return false; if (MD->getNumParams() == 2) { @@ -2491,6 +2491,7 @@ void CodeGenFunction::EmitVariablyModifiedType(QualType type) { case Type::ObjCObjectPointer: case Type::BitInt: case Type::HLSLInlineSpirv: + case Type::PredefinedSugar: llvm_unreachable("type class is never variably-modified!"); case Type::Elaborated: diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index dd5b710d7e1d4..5e523fe887318 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5239,7 +5239,9 @@ bool Sema::BuiltinVAStartARMMicrosoft(CallExpr *Call) { << 2 << Arg1->getType() 
<< ConstCharPtrTy; const QualType SizeTy = Context.getSizeType(); - if (Arg2Ty->getCanonicalTypeInternal().withoutLocalFastQualifiers() != SizeTy) + if (!Context.hasSameType( + Arg2Ty->getCanonicalTypeInternal().withoutLocalFastQualifiers(), + SizeTy)) Diag(Arg2->getBeginLoc(), diag::err_typecheck_convert_incompatible) << Arg2->getType() << SizeTy << 1 /* different class */ << 0 /* qualifier difference */ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 728ada33e2e63..45c7178c6965d 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -4564,6 +4564,9 @@ static void captureVariablyModifiedType(ASTContext &Context, QualType T, case Type::Atomic: T = cast(Ty)->getValueType(); break; + case Type::PredefinedSugar: + T = cast(Ty)->desugar(); + break; } } while (!T.isNull() && T->isVariablyModifiedType()); } diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index fd95f4ec54229..0edfd6015cbd9 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -3461,11 +3461,11 @@ void Sema::DeclareGlobalAllocationFunction(DeclarationName Name, // non-templated allocation function we are trying to declare here. if (FunctionDecl *Func = dyn_cast(*Alloc)) { if (Func->getNumParams() == Params.size()) { - llvm::SmallVector FuncParams; - for (auto *P : Func->parameters()) - FuncParams.push_back( - Context.getCanonicalType(P->getType().getUnqualifiedType())); - if (llvm::ArrayRef(FuncParams) == Params) { + if (std::equal(Func->param_begin(), Func->param_end(), Params.begin(), + Params.end(), [&](ParmVarDecl *D, QualType RT) { + return Context.hasSameUnqualifiedType(D->getType(), + RT); + })) { // Make the function visible to name lookup, even if we found it in // an unimported module. It either is an implicitly-declared global // allocation function, or is suppressing that function. 
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 286c2b486c0f9..c7428d1a02345 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7245,6 +7245,12 @@ QualType TreeTransform::TransformDependentBitIntType( return Result; } +template +QualType TreeTransform::TransformPredefinedSugarType( + TypeLocBuilder &TLB, PredefinedSugarTypeLoc TL) { + llvm_unreachable("This type does not need to be transformed."); +} + /// Simple iterator that traverses the template arguments in a /// container that provides a \c getArgLoc() member function. /// diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 3596d2240167e..10aedb68fcd9d 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -7574,11 +7574,16 @@ void TypeLocReader::VisitPipeTypeLoc(PipeTypeLoc TL) { void TypeLocReader::VisitBitIntTypeLoc(clang::BitIntTypeLoc TL) { TL.setNameLoc(readSourceLocation()); } + void TypeLocReader::VisitDependentBitIntTypeLoc( clang::DependentBitIntTypeLoc TL) { TL.setNameLoc(readSourceLocation()); } +void TypeLocReader::VisitPredefinedSugarTypeLoc(PredefinedSugarTypeLoc TL) { + // Nothing to do. 
+} + void ASTRecordReader::readTypeLoc(TypeLoc TL) { TypeLocReader TLR(*this); for (; !TL.isNull(); TL = TL.getNextTypeLoc()) diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index e868afeb1a145..a6957e54b66f1 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -692,7 +692,6 @@ void TypeLocWriter::VisitAtomicTypeLoc(AtomicTypeLoc TL) { void TypeLocWriter::VisitPipeTypeLoc(PipeTypeLoc TL) { addSourceLocation(TL.getKWLoc()); } - void TypeLocWriter::VisitBitIntTypeLoc(clang::BitIntTypeLoc TL) { addSourceLocation(TL.getNameLoc()); } @@ -701,6 +700,11 @@ void TypeLocWriter::VisitDependentBitIntTypeLoc( addSourceLocation(TL.getNameLoc()); } +void TypeLocWriter::VisitPredefinedSugarTypeLoc( + clang::PredefinedSugarTypeLoc TL) { + // Nothing to do. +} + void ASTWriter::WriteTypeAbbrevs() { using namespace llvm; diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index 30a04977d906d..68efdbaec341b 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -1281,7 +1281,7 @@ SVal MallocChecker::evalMulForBufferSize(CheckerContext &C, const Expr *Blocks, SVal BlockBytesVal = C.getSVal(BlockBytes); ProgramStateRef State = C.getState(); SVal TotalSize = SB.evalBinOp(State, BO_Mul, BlocksVal, BlockBytesVal, - SB.getContext().getSizeType()); + SB.getContext().getCanonicalSizeType()); return TotalSize; } @@ -1311,11 +1311,9 @@ static bool isStandardRealloc(const CallEvent &Call) { const FunctionDecl *FD = dyn_cast(Call.getDecl()); assert(FD); ASTContext &AC = FD->getASTContext(); - - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(1)->getType().getDesugaredType(AC) == - AC.getSizeType(); + return AC.hasSameType(FD->getDeclaredReturnType(), 
AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(0)->getType(), AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(1)->getType(), AC.getSizeType()); } static bool isGRealloc(const CallEvent &Call) { @@ -1323,10 +1321,9 @@ static bool isGRealloc(const CallEvent &Call) { assert(FD); ASTContext &AC = FD->getASTContext(); - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(1)->getType().getDesugaredType(AC) == - AC.UnsignedLongTy; + return AC.hasSameType(FD->getDeclaredReturnType(), AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(0)->getType(), AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(1)->getType(), AC.UnsignedLongTy); } void MallocChecker::checkRealloc(ProgramStateRef State, const CallEvent &Call, @@ -2830,10 +2827,10 @@ MallocChecker::ReallocMemAux(CheckerContext &C, const CallEvent &Call, return nullptr; // Compare the size argument to 0. - DefinedOrUnknownSVal SizeZero = - svalBuilder.evalEQ(State, TotalSize.castAs(), - svalBuilder.makeIntValWithWidth( - svalBuilder.getContext().getSizeType(), 0)); + DefinedOrUnknownSVal SizeZero = svalBuilder.evalEQ( + State, TotalSize.castAs(), + svalBuilder.makeIntValWithWidth( + svalBuilder.getContext().getCanonicalSizeType(), 0)); ProgramStateRef StatePtrIsNull, StatePtrNotNull; std::tie(StatePtrIsNull, StatePtrNotNull) = State->assume(PtrEQ); diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 1c748f9bc1828..52b3d1e95942c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -1666,7 +1666,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( const QualType IntTy = ACtx.IntTy; const QualType UnsignedIntTy = ACtx.UnsignedIntTy; const QualType LongTy = ACtx.LongTy; - const QualType SizeTy = 
ACtx.getSizeType(); + const QualType SizeTyCanonTy = ACtx.getCanonicalSizeType(); const QualType VoidPtrTy = getPointerTy(VoidTy); // void * const QualType IntPtrTy = getPointerTy(IntTy); // int * @@ -1684,14 +1684,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( const QualType ConstWchar_tPtrTy = getPointerTy(getConstTy(WCharTy)); // const wchar_t * const QualType ConstVoidPtrRestrictTy = getRestrictTy(ConstVoidPtrTy); - const QualType SizePtrTy = getPointerTy(SizeTy); + const QualType SizePtrTy = getPointerTy(SizeTyCanonTy); const QualType SizePtrRestrictTy = getRestrictTy(SizePtrTy); const RangeInt IntMax = BVF.getMaxValue(IntTy)->getLimitedValue(); const RangeInt UnsignedIntMax = BVF.getMaxValue(UnsignedIntTy)->getLimitedValue(); const RangeInt LongMax = BVF.getMaxValue(LongTy)->getLimitedValue(); - const RangeInt SizeMax = BVF.getMaxValue(SizeTy)->getLimitedValue(); + const RangeInt SizeMax = BVF.getMaxValue(SizeTyCanonTy)->getLimitedValue(); // Set UCharRangeMax to min of int or uchar maximum value. 
// The C standard states that the arguments of functions like isalpha must @@ -2057,18 +2057,19 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t fread(void *restrict ptr, size_t size, size_t nitems, // FILE *restrict stream); - addToFunctionSummaryMap( - "fread", - Signature(ArgTypes{VoidPtrRestrictTy, SizeTy, SizeTy, FilePtrRestrictTy}, - RetType{SizeTy}), - FreadSummary); + addToFunctionSummaryMap("fread", + Signature(ArgTypes{VoidPtrRestrictTy, SizeTyCanonTy, + SizeTyCanonTy, FilePtrRestrictTy}, + RetType{SizeTyCanonTy}), + FreadSummary); // size_t fwrite(const void *restrict ptr, size_t size, size_t nitems, // FILE *restrict stream); - addToFunctionSummaryMap("fwrite", - Signature(ArgTypes{ConstVoidPtrRestrictTy, SizeTy, - SizeTy, FilePtrRestrictTy}, - RetType{SizeTy}), - FreadSummary); + addToFunctionSummaryMap( + "fwrite", + Signature(ArgTypes{ConstVoidPtrRestrictTy, SizeTyCanonTy, SizeTyCanonTy, + FilePtrRestrictTy}, + RetType{SizeTyCanonTy}), + FreadSummary); std::optional Ssize_tTy = lookupTy("ssize_t"); std::optional Ssize_tMax = getMaxValue(Ssize_tTy); @@ -2083,12 +2084,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // should handle them together with the rest of the POSIX functions. 
// ssize_t read(int fildes, void *buf, size_t nbyte); addToFunctionSummaryMap( - "read", Signature(ArgTypes{IntTy, VoidPtrTy, SizeTy}, RetType{Ssize_tTy}), + "read", + Signature(ArgTypes{IntTy, VoidPtrTy, SizeTyCanonTy}, RetType{Ssize_tTy}), ReadSummary); // ssize_t write(int fildes, const void *buf, size_t nbyte); addToFunctionSummaryMap( "write", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy}, RetType{Ssize_tTy}), + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy}, + RetType{Ssize_tTy}), ReadSummary); auto GetLineSummary = @@ -2618,7 +2621,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *strndup(const char *s, size_t n); addToFunctionSummaryMap( "strndup", - Signature(ArgTypes{ConstCharPtrTy, SizeTy}, RetType{CharPtrTy}), + Signature(ArgTypes{ConstCharPtrTy, SizeTyCanonTy}, RetType{CharPtrTy}), Summary(NoEvalCall) .ArgConstraint(NotNull(ArgNo(0))) .ArgConstraint( @@ -2649,7 +2652,8 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *getcwd(char *buf, size_t size); addToFunctionSummaryMap( - "getcwd", Signature(ArgTypes{CharPtrTy, SizeTy}, RetType{CharPtrTy}), + "getcwd", + Signature(ArgTypes{CharPtrTy, SizeTyCanonTy}, RetType{CharPtrTy}), Summary(NoEvalCall) .Case({NotNull(0), ArgumentCondition(1, WithinRange, Range(1, SizeMax)), @@ -2957,8 +2961,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // FIXME: Improve for errno modeling. addToFunctionSummaryMap( "mmap", - Signature(ArgTypes{VoidPtrTy, SizeTy, IntTy, IntTy, IntTy, Off_tTy}, - RetType{VoidPtrTy}), + Signature( + ArgTypes{VoidPtrTy, SizeTyCanonTy, IntTy, IntTy, IntTy, Off_tTy}, + RetType{VoidPtrTy}), Summary(NoEvalCall) .ArgConstraint(ArgumentCondition(1, WithinRange, Range(1, SizeMax))) .ArgConstraint( @@ -2970,8 +2975,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // FIXME: Improve for errno modeling. 
addToFunctionSummaryMap( "mmap64", - Signature(ArgTypes{VoidPtrTy, SizeTy, IntTy, IntTy, IntTy, Off64_tTy}, - RetType{VoidPtrTy}), + Signature( + ArgTypes{VoidPtrTy, SizeTyCanonTy, IntTy, IntTy, IntTy, Off64_tTy}, + RetType{VoidPtrTy}), Summary(NoEvalCall) .ArgConstraint(ArgumentCondition(1, WithinRange, Range(1, SizeMax))) .ArgConstraint( @@ -3002,8 +3008,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t bufsize); addToFunctionSummaryMap( "readlink", - Signature(ArgTypes{ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTy}, - RetType{Ssize_tTy}), + Signature( + ArgTypes{ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTyCanonTy}, + RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ArgumentCondition(2, WithinRange, Range(1, IntMax)), ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3025,9 +3032,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *restrict buf, size_t bufsize); addToFunctionSummaryMap( "readlinkat", - Signature( - ArgTypes{IntTy, ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTy}, - RetType{Ssize_tTy}), + Signature(ArgTypes{IntTy, ConstCharPtrRestrictTy, CharPtrRestrictTy, + SizeTyCanonTy}, + RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ArgumentCondition(3, WithinRange, Range(1, IntMax)), ReturnValueCondition(LessThanOrEq, ArgNo(3)), @@ -3268,14 +3275,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t length, // int flags, struct sockaddr *restrict address, // socklen_t *restrict address_len); - Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTy, IntTy, + Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTyCanonTy, IntTy, StructSockaddrPtrRestrictTy, Socklen_tPtrRestrictTy}, RetType{Ssize_tTy}), Recvfrom)) addToFunctionSummaryMap( "recvfrom", - Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTy, IntTy, + Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTyCanonTy, IntTy, Irrelevant, Socklen_tPtrRestrictTy}, RetType{Ssize_tTy}), Recvfrom); @@ -3297,14 +3304,14 @@ void 
StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t sendto(int socket, const void *message, size_t length, // int flags, const struct sockaddr *dest_addr, // socklen_t dest_len); - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy, ConstStructSockaddrPtrTy, Socklen_tTy}, RetType{Ssize_tTy}), Sendto)) addToFunctionSummaryMap( "sendto", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy, Irrelevant, - Socklen_tTy}, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy, + Irrelevant, Socklen_tTy}, RetType{Ssize_tTy}), Sendto); @@ -3320,7 +3327,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t recv(int sockfd, void *buf, size_t len, int flags); addToFunctionSummaryMap( "recv", - Signature(ArgTypes{IntTy, VoidPtrTy, SizeTy, IntTy}, + Signature(ArgTypes{IntTy, VoidPtrTy, SizeTyCanonTy, IntTy}, RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3395,7 +3402,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t send(int sockfd, const void *buf, size_t len, int flags); addToFunctionSummaryMap( "send", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy}, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy}, RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3683,7 +3690,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // int pthread_attr_setguardsize(pthread_attr_t *attr, size_t guardsize); addToFunctionSummaryMap( {"pthread_attr_setstacksize", "pthread_attr_setguardsize"}, - Signature(ArgTypes{Pthread_attr_tPtrTy, SizeTy}, RetType{IntTy}), + Signature(ArgTypes{Pthread_attr_tPtrTy, SizeTyCanonTy}, RetType{IntTy}), Summary(NoEvalCall) .ArgConstraint(NotNull(ArgNo(0))) .ArgConstraint( @@ -3888,13 +3895,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( .ArgConstraint(NotNull(ArgNo(1)))); 
addToFunctionSummaryMap( "__buf_size_arg_constraint", - Signature(ArgTypes{ConstVoidPtrTy, SizeTy}, RetType{IntTy}), + Signature(ArgTypes{ConstVoidPtrTy, SizeTyCanonTy}, RetType{IntTy}), Summary(EvalCallAsPure) .ArgConstraint( BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1)))); addToFunctionSummaryMap( "__buf_size_arg_constraint_mul", - Signature(ArgTypes{ConstVoidPtrTy, SizeTy, SizeTy}, RetType{IntTy}), + Signature(ArgTypes{ConstVoidPtrTy, SizeTyCanonTy, SizeTyCanonTy}, + RetType{IntTy}), Summary(EvalCallAsPure) .ArgConstraint(BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1), /*BufSizeMultiplier=*/ArgNo(2)))); diff --git a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp index 1042b43680fd2..c97341f072aba 100644 --- a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp @@ -92,7 +92,7 @@ ProgramStateRef VLASizeChecker::checkVLA(CheckerContext &C, ASTContext &Ctx = C.getASTContext(); SValBuilder &SVB = C.getSValBuilder(); - CanQualType SizeTy = Ctx.getSizeType(); + QualType SizeTy = Ctx.getSizeType(); uint64_t SizeMax = SVB.getBasicValueFactory().getMaxValue(SizeTy)->getZExtValue(); diff --git a/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl b/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl index a4f6e6c44794e..fa8d78f38494a 100644 --- a/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl +++ b/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl @@ -9,7 +9,7 @@ // CHECK: | `-TemplateTypeParm {{.*}} 'element_type' // CHECK: `-BinaryOperator {{.*}} 'bool' lvalue '>=' // CHECK: |-UnaryExprOrTypeTraitExpr {{.*}} 'bool' sizeof 'element_type' -// CHECK: `-IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK: `-IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 StructuredBuffer Buffer; diff --git a/clang/test/AST/ast-dump-array.cpp 
b/clang/test/AST/ast-dump-array.cpp index 15771f227df8a..5a982d34683ff 100644 --- a/clang/test/AST/ast-dump-array.cpp +++ b/clang/test/AST/ast-dump-array.cpp @@ -14,7 +14,7 @@ void testArrayInitExpr() auto l = [a]{ }; // CHECK: |-ArrayInitLoopExpr 0x{{[^ ]*}} 'int[10]' - // CHECK: | `-ArrayInitIndexExpr 0x{{[^ ]*}} <> 'unsigned long' + // CHECK: | `-ArrayInitIndexExpr 0x{{[^ ]*}} <> '__size_t':'unsigned long' } template diff --git a/clang/test/AST/ast-dump-expr-json.c b/clang/test/AST/ast-dump-expr-json.c index e910864eeed65..ecb6191c52200 100644 --- a/clang/test/AST/ast-dump-expr-json.c +++ b/clang/test/AST/ast-dump-expr-json.c @@ -3911,7 +3911,8 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3964,7 +3965,8 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3989,7 +3991,8 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "alignof", diff --git a/clang/test/AST/ast-dump-expr-json.cpp b/clang/test/AST/ast-dump-expr-json.cpp index 5a762acad7917..11026c9d302f0 100644 --- a/clang/test/AST/ast-dump-expr-json.cpp +++ b/clang/test/AST/ast-dump-expr-json.cpp @@ -1545,7 +1545,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: 
"type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "Ts" @@ -1587,7 +1588,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "long" +// CHECK-NEXT: "desugaredQualType": "long", +// CHECK-NEXT: "qualType": "__ptrdiff_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "opcode": "-", @@ -1726,7 +1728,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: }, @@ -1755,7 +1757,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: }, @@ -1785,7 +1787,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1860,7 +1862,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1880,7 +1882,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// 
CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -1937,7 +1940,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1957,7 +1960,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -2333,7 +2337,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ diff --git a/clang/test/AST/ast-dump-expr.c b/clang/test/AST/ast-dump-expr.c index 959d61ec9794b..e7aba39be8f68 100644 --- a/clang/test/AST/ast-dump-expr.c +++ b/clang/test/AST/ast-dump-expr.c @@ -222,15 +222,15 @@ void UnaryOperators(int a, int *b) { // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int' lvalue ParmVar 0x{{[^ ]*}} 'a' 'int' sizeof a; - // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' sizeof + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' sizeof // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int' lvalue ParmVar 0x{{[^ ]*}} 'a' 'int' sizeof(int); - // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' sizeof 'int' + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' sizeof 'int' _Alignof(int); // FIXME: Uses C++ spelling for alignof in C mode. 
- // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' alignof 'int' + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' alignof 'int' } struct S { diff --git a/clang/test/AST/ast-dump-expr.cpp b/clang/test/AST/ast-dump-expr.cpp index 8ccb39f8f3165..6fd429d1500a4 100644 --- a/clang/test/AST/ast-dump-expr.cpp +++ b/clang/test/AST/ast-dump-expr.cpp @@ -115,34 +115,34 @@ void Casting(const S *s) { template void UnaryExpressions(int *p) { sizeof...(Ts); - // CHECK: SizeOfPackExpr 0x{{[^ ]*}} 'unsigned long' 0x{{[^ ]*}} Ts + // CHECK: SizeOfPackExpr 0x{{[^ ]*}} '__size_t':'unsigned long' 0x{{[^ ]*}} Ts noexcept(p - p); // CHECK: CXXNoexceptExpr 0x{{[^ ]*}} 'bool' - // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'long' '-' + // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} '__ptrdiff_t':'long' '-' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' ::new int; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' global Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' global Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' new (int); - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' new int{12}; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' // CHECK-NEXT: InitListExpr 0x{{[^ ]*}} 'int' // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 12 new int[2]; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ 
]*}} 'operator new[]' 'void *(__size_t)' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 2 new int[2]{1, 2}; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(__size_t)' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 2 // CHECK-NEXT: InitListExpr 0x{{[^ ]*}} 'int[2]' @@ -164,7 +164,7 @@ void UnaryExpressions(int *p) { // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' ::delete p; - // CHECK: CXXDeleteExpr 0x{{[^ ]*}} 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *, unsigned long) noexcept' + // CHECK: CXXDeleteExpr 0x{{[^ ]*}} 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *, __size_t) noexcept' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' diff --git a/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c index 10f27e759b5b1..672607fa90670 100644 --- a/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c @@ -57,8 +57,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -97,8 +97,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -144,8 +144,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -191,8 +191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -251,8 +251,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c index 419ba57191039..8eedf8ac8bc58 100644 --- a/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c @@ -57,8 +57,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -97,8 +97,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -144,8 +144,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -191,8 +191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -251,8 +251,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c index c209a0456d7a0..64e19ce0a53bf 100644 --- a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c @@ -65,8 +65,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -94,8 +94,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -123,8 +123,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -152,8 +152,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -189,8 +189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -218,8 +218,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -247,8 +247,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -276,8 +276,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -325,8 +325,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -371,8 +371,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -417,8 +417,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -463,8 +463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -517,8 +517,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -563,8 +563,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -609,8 +609,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -655,8 +655,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -711,8 +711,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -757,8 +757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -803,8 +803,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -849,8 +849,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -903,8 +903,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -949,8 +949,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -995,8 +995,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1041,8 +1041,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1097,8 +1097,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1143,8 +1143,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1189,8 +1189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1289,8 +1289,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1335,8 +1335,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1381,8 +1381,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1427,8 +1427,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1497,8 +1497,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1560,8 +1560,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1623,8 +1623,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1686,8 +1686,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1757,8 +1757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1820,8 +1820,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1883,8 +1883,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1946,8 +1946,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c index b13e096101e63..cf3f4bfcaf225 100644 --- a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c @@ -65,8 +65,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -94,8 +94,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -123,8 +123,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -152,8 +152,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -189,8 +189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -218,8 +218,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -247,8 +247,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -276,8 +276,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -325,8 +325,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -371,8 +371,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -417,8 +417,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -463,8 +463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -517,8 +517,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -563,8 +563,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -609,8 +609,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -655,8 +655,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -711,8 +711,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -757,8 +757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -803,8 +803,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -849,8 +849,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -903,8 +903,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -949,8 +949,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -995,8 +995,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1041,8 +1041,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1097,8 +1097,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1143,8 +1143,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1189,8 +1189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1289,8 +1289,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1335,8 +1335,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1381,8 +1381,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1427,8 +1427,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1497,8 +1497,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1560,8 +1560,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1623,8 +1623,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1686,8 +1686,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1757,8 +1757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1820,8 +1820,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1883,8 +1883,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1946,8 +1946,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c index 14356882b599a..c8da8cd1a5efa 100644 --- a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c @@ -71,8 +71,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -99,8 +99,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -127,8 +127,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -155,8 +155,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -211,8 +211,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -239,8 +239,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -267,8 +267,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -295,8 +295,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -363,8 +363,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -407,8 +407,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -451,8 +451,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -495,8 +495,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -568,8 +568,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -612,8 +612,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -656,8 +656,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -700,8 +700,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -775,8 +775,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -819,8 +819,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -863,8 +863,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -907,8 +907,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -984,8 +984,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1028,8 +1028,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1072,8 +1072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1116,8 +1116,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1191,8 +1191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1279,8 +1279,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1323,8 +1323,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1419,8 +1419,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1463,8 +1463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1507,8 +1507,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1551,8 +1551,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1659,8 +1659,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1719,8 +1719,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1779,8 +1779,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1839,8 +1839,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1952,8 +1952,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2012,8 +2012,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2072,8 +2072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2132,8 +2132,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c index 0f983cfdff1dc..09b649cbb3660 100644 --- a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c @@ -71,8 +71,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -99,8 +99,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -127,8 +127,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -155,8 +155,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -211,8 +211,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -239,8 +239,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -267,8 +267,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -295,8 +295,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -363,8 +363,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -407,8 +407,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -451,8 +451,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -495,8 +495,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -568,8 +568,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -612,8 +612,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -656,8 +656,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -700,8 +700,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -775,8 +775,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -819,8 +819,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -863,8 +863,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -907,8 +907,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -984,8 +984,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1028,8 +1028,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1072,8 +1072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1116,8 +1116,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1191,8 +1191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1279,8 +1279,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1323,8 +1323,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1419,8 +1419,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1463,8 +1463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1507,8 +1507,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1551,8 +1551,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1659,8 +1659,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1719,8 +1719,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1779,8 +1779,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1839,8 +1839,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1952,8 +1952,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2012,8 +2012,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2072,8 +2072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2132,8 +2132,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-stmt-json.cpp b/clang/test/AST/ast-dump-stmt-json.cpp index a473d17da9424..a8f113ce6a3d4 100644 --- a/clang/test/AST/ast-dump-stmt-json.cpp +++ b/clang/test/AST/ast-dump-stmt-json.cpp @@ -963,7 +963,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -994,7 +994,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1126,7 +1126,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1146,7 +1146,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": 
"prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -1337,7 +1338,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -1369,7 +1370,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1444,7 +1445,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "mangledName": "_Znwm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1457,7 +1458,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1503,7 +1505,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "mangledName": "_ZnwmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long, std::align_val_t)" +// CHECK-NEXT: "qualType": "void *(__size_t, std::align_val_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1516,7 +1518,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// 
CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1585,7 +1588,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "mangledName": "_Znam", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1598,7 +1601,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1644,7 +1648,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "mangledName": "_ZnamSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long, std::align_val_t)" +// CHECK-NEXT: "qualType": "void *(__size_t, std::align_val_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1657,7 +1661,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1821,7 +1826,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "mangledName": "_ZdlPvm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1847,7 +1852,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // 
CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1874,7 +1880,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "mangledName": "_ZdlPvmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t, std::align_val_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1900,7 +1906,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -2036,7 +2043,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete[]", // CHECK-NEXT: "mangledName": "_ZdaPvm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -2062,7 +2069,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -2089,7 +2097,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete[]", // CHECK-NEXT: "mangledName": "_ZdaPvmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept" +// 
CHECK-NEXT: "qualType": "void (void *, __size_t, std::align_val_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -2115,7 +2123,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -3881,7 +3890,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3955,7 +3965,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -4085,7 +4096,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -4159,7 +4171,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -4980,7 +4993,8 @@ void TestDependentGenericSelectionExpr(Ty 
T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "long" +// CHECK-NEXT: "desugaredQualType": "long", +// CHECK-NEXT: "qualType": "__ptrdiff_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "value": "10" @@ -6503,7 +6517,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "long" +// CHECK-NEXT: "desugaredQualType": "long" +// CHECK-NEXT: "qualType": "__ptrdiff_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "value": "10" diff --git a/clang/test/AST/ast-dump-stmt.cpp b/clang/test/AST/ast-dump-stmt.cpp index 407584e5b82de..42c5f3b3498a4 100644 --- a/clang/test/AST/ast-dump-stmt.cpp +++ b/clang/test/AST/ast-dump-stmt.cpp @@ -206,7 +206,7 @@ void TestIteration() { // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'int *' '+' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' - // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'long' 10 + // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} '__ptrdiff_t':'long' 10 // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' @@ -274,7 +274,7 @@ void TestIteration() { // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'int *' '+' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' - // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'long' 10 + // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} '__ptrdiff_t':'long' 10 // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' diff --git a/clang/test/AST/ast-dump-traits.cpp b/clang/test/AST/ast-dump-traits.cpp index 3085e5883fd2e..72d2a2ae8603e 100644 --- 
a/clang/test/AST/ast-dump-traits.cpp +++ b/clang/test/AST/ast-dump-traits.cpp @@ -56,7 +56,7 @@ void test_unary_expr_or_type_trait() { // CHECK-NEXT: |-FunctionDecl {{.*}} line:20:6{{( imported)?}} test_array_type_trait 'void ()' // CHECK-NEXT: | `-CompoundStmt {{.*}} // CHECK-NEXT: | `-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-ArrayTypeTraitExpr {{.*}} 'unsigned long' __array_rank +// CHECK-NEXT: | `-ArrayTypeTraitExpr {{.*}} '__size_t':'unsigned long' __array_rank // CHECK-NEXT: |-FunctionDecl {{.*}} line:25:6{{( imported)?}} test_expression_trait 'void ()' // CHECK-NEXT: | `-CompoundStmt {{.*}} // CHECK-NEXT: | `-CStyleCastExpr {{.*}} 'void' @@ -64,8 +64,8 @@ void test_unary_expr_or_type_trait() { // CHECK-NEXT: `-FunctionDecl {{.*}} line:30:6{{( imported)?}} test_unary_expr_or_type_trait 'void ()' // CHECK-NEXT: `-CompoundStmt {{.*}} // CHECK-NEXT: |-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' sizeof 'int' +// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' sizeof 'int' // CHECK-NEXT: |-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' alignof 'int' +// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' alignof 'int' // CHECK-NEXT: `-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' __alignof 'int' +// CHECK-NEXT: `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' __alignof 'int' diff --git a/clang/test/AST/ast-dump-types-errors-json.cpp b/clang/test/AST/ast-dump-types-errors-json.cpp index e15f8eeee20cc..d9f918f6c3d72 100644 --- a/clang/test/AST/ast-dump-types-errors-json.cpp +++ b/clang/test/AST/ast-dump-types-errors-json.cpp @@ -60,7 +60,8 @@ using TestContainsErrors = int[sizeof(undef())]; // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// 
CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", diff --git a/clang/test/Analysis/cfg.cpp b/clang/test/Analysis/cfg.cpp index 44a89df28e3b2..d6cef88dc18a6 100644 --- a/clang/test/Analysis/cfg.cpp +++ b/clang/test/Analysis/cfg.cpp @@ -70,7 +70,7 @@ void F(EmptyE e) { // CHECK-NEXT: Succs (1): B1 // CHECK: [B1] // CHECK-NEXT: 1: __builtin_object_size -// CHECK-NEXT: 2: [B1.1] (ImplicitCastExpr, BuiltinFnToFnPtr, unsigned long (*)(const void *, int) noexcept) +// CHECK-NEXT: 2: [B1.1] (ImplicitCastExpr, BuiltinFnToFnPtr, __size_t (*)(const void *, int) noexcept) // CHECK-NEXT: 3: [B1.2](dummy(), 0) // CHECK-NEXT: 4: (void)[B1.3] (CStyleCastExpr, ToVoid, void) // CHECK-NEXT: Preds (1): B2 diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp index 267980c3b20c8..dfc650223c9e7 100644 --- a/clang/test/Analysis/explain-svals.cpp +++ b/clang/test/Analysis/explain-svals.cpp @@ -46,7 +46,7 @@ void test_1(int param, void *ptr) { void test_2(char *ptr, int ext) { clang_analyzer_explain((void *) "asdf"); // expected-warning-re{{{{^pointer to element of type 'char' with index 0 of string literal "asdf"$}}}} - clang_analyzer_explain(strlen(ptr)); // expected-warning-re{{{{^metadata of type 'unsigned long' tied to pointee of argument 'ptr'$}}}} + clang_analyzer_explain(strlen(ptr)); // expected-warning-re{{{{^metadata of type '__size_t' tied to pointee of argument 'ptr'$}}}} clang_analyzer_explain(conjure()); // expected-warning-re{{{{^symbol of type 'int' conjured at CFG element 'conjure\(\)'$}}}} clang_analyzer_explain(glob); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure\(\)'\) for global variable 'glob'$}}}} clang_analyzer_explain(glob_ptr); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure\(\)'\) for global variable 'glob_ptr'$}}}} diff --git 
a/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c b/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c index 1f0d3627fae34..ba5bc57928b0c 100644 --- a/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c +++ b/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c @@ -20,7 +20,7 @@ // RUN: -triple x86_64-unknown-linux 2>&1 | FileCheck %s // CHECK: Loaded summary for: int isalnum(int) -// CHECK: Loaded summary for: unsigned long fread(void *restrict, size_t, size_t, FILE *restrict) __attribute__((nonnull(1))) +// CHECK: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) __attribute__((nonnull(1))) // CHECK: Loaded summary for: int fileno(FILE *stream) void initializeSummaryMap(void); diff --git a/clang/test/Analysis/std-c-library-functions-lookup.c b/clang/test/Analysis/std-c-library-functions-lookup.c index e47d9bddda91b..8182e5a1f5fde 100644 --- a/clang/test/Analysis/std-c-library-functions-lookup.c +++ b/clang/test/Analysis/std-c-library-functions-lookup.c @@ -6,7 +6,7 @@ // RUN: -analyzer-config eagerly-assume=false \ // RUN: -triple i686-unknown-linux 2>&1 | FileCheck %s -// CHECK: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) typedef typeof(sizeof(int)) size_t; typedef struct FILE FILE; diff --git a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c index b99cc30149c91..887817ba8551e 100644 --- a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c +++ b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c @@ -31,8 +31,8 @@ // Verify that the summaries are loaded when the StdLibraryFunctionsChecker is // enabled. 
// CHECK: Loaded summary for: int getchar(void) -// CHECK-NEXT: Loaded summary for: unsigned long fread(void *restrict, size_t, size_t, FILE *restrict) -// CHECK-NEXT: Loaded summary for: unsigned long fwrite(const void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict) #include "Inputs/system-header-simulator.h" diff --git a/clang/test/Analysis/std-c-library-functions.c b/clang/test/Analysis/std-c-library-functions.c index b03a1a5656517..b5f663493a676 100644 --- a/clang/test/Analysis/std-c-library-functions.c +++ b/clang/test/Analysis/std-c-library-functions.c @@ -59,8 +59,8 @@ // CHECK-NEXT: Loaded summary for: int tolower(int) // CHECK-NEXT: Loaded summary for: int toascii(int) // CHECK-NEXT: Loaded summary for: int getchar(void) -// CHECK-NEXT: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) -// CHECK-NEXT: Loaded summary for: unsigned int fwrite(const void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict) // CHECK-NEXT: Loaded summary for: ssize_t read(int, void *, size_t) // CHECK-NEXT: Loaded summary for: ssize_t write(int, const void *, size_t) // CHECK-NEXT: Loaded summary for: ssize_t getline(char **restrict, size_t *restrict, FILE *restrict) diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp index a53a8d1ed64a8..556407afa2641 100644 --- a/clang/test/CXX/drs/cwg2xx.cpp +++ b/clang/test/CXX/drs/cwg2xx.cpp @@ -1429,7 +1429,7 @@ namespace cwg299 { // cwg299: 2.8 c++11 // cxx98-11-error@#cwg299-q {{ambiguous conversion of array size expression of type 'T' to an integral or enumeration type}} // 
cxx98-11-note@#cwg299-int {{conversion to integral type 'int' declared here}} // cxx98-11-note@#cwg299-ushort {{conversion to integral type 'unsigned short' declared here}} - // since-cxx14-error-re@#cwg299-q {{{{conversion from 'T' to 'unsigned (long long|long|int)' is ambiguous}}}} + // since-cxx14-error-re@#cwg299-q {{conversion from 'T' to '__size_t' (aka 'unsigned {{long long|long|int}}') is ambiguous}} // since-cxx14-note@#cwg299-int {{candidate function}} // since-cxx14-note@#cwg299-ushort {{candidate function}} } // namespace cwg299 diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp index 6942b68690c5d..d439f304b5101 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp @@ -5,11 +5,11 @@ typedef decltype(sizeof(int)) size_t; // FIXME: These diagnostics should say 'size_t' instead of 'unsigned long' int a = 123_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'unsigned long long' or 'const char *', and no matching literal operator template}} int b = 4.2_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'long double' or 'const char *', and no matching literal operator template}} -int c = "foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and 'unsigned}} -int d = L"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const wchar_t *' and 'unsigned}} -int e = u8"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and 'unsigned}} -int f = u"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char16_t *' and 'unsigned}} -int g = U"foo"_x; // expected-error {{no matching literal operator for 
call to 'operator""_x' with arguments of types 'const char32_t *' and 'unsigned}} +int c = "foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and '__size_t' (aka 'unsigned}} +int d = L"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const wchar_t *' and '__size_t' (aka 'unsigned}} +int e = u8"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and '__size_t' (aka 'unsigned}} +int f = u"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char16_t *' and '__size_t' (aka 'unsigned}} +int g = U"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char32_t *' and '__size_t' (aka 'unsigned}} int h = 'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'char'}} int i = L'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'wchar_t'}} int j = u'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'char16_t'}} diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp index afadba282e626..463d7854867a2 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp @@ -13,7 +13,7 @@ float &operator ""_x1 (const char8_t *, size_t); using char8 = double; #endif char8 &i2 = u8"foo"_x1; -double &i3 = L"foo"_x1; // expected-error {{no matching literal operator for call to 'operator""_x1' with arguments of types 'const wchar_t *' and 'unsigned long'}} +double &i3 = L"foo"_x1; // expected-error {{no matching literal operator for call to 'operator""_x1' with arguments of types 'const wchar_t *' and '__size_t' 
(aka 'unsigned long')}} char &operator ""_x1(const wchar_t *, size_t); char &i4 = L"foo"_x1; // ok @@ -46,8 +46,8 @@ template float &operator""_s(); void no_fallback() { "hello"_s; // FIXME: It'd be useful to explain what candidates were found and why they didn't work. - "xyzzy"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and 'unsigned long', and no matching literal operator template}} - "yello"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and 'unsigned long', and no matching literal operator template}} + "xyzzy"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long'), and no matching literal operator template}} + "yello"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long'), and no matching literal operator template}} } double &operator""_s(const char*, size_t); diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp index d571fcb8697eb..17d9c83055a1c 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp @@ -17,7 +17,7 @@ int main() { auto v1 = 1.2_w; // calls operator""_w(1.2L) auto v2 = u"one"_w; // calls operator""_w(u"one", 3) auto v3 = 12_w; // calls operator""_w("12") - "two"_w; // expected-error {{no matching literal operator for call to 'operator""_w' with arguments of types 'const char *' and 'unsigned long'}} + "two"_w; // expected-error {{no matching literal operator for call to 'operator""_w' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long')}} same_type test1; same_type test2; diff --git a/clang/test/FixIt/fixit-format-ios-nopedantic.m 
b/clang/test/FixIt/fixit-format-ios-nopedantic.m index db9ac797c2472..836a4b5372f13 100644 --- a/clang/test/FixIt/fixit-format-ios-nopedantic.m +++ b/clang/test/FixIt/fixit-format-ios-nopedantic.m @@ -1,5 +1,5 @@ // RUN: cp %s %t -// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -Wformat -Werror -fixit %t +// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -Wformat -fixit %t int printf(const char *restrict, ...); typedef unsigned int NSUInteger; diff --git a/clang/test/FixIt/format.m b/clang/test/FixIt/format.m index 950765bad9339..e97ae10c974aa 100644 --- a/clang/test/FixIt/format.m +++ b/clang/test/FixIt/format.m @@ -237,14 +237,14 @@ void testSizeTypes(void) { printf("%zu", 0.f); // expected-warning-re{{format specifies type 'size_t' (aka '{{.+}}') but the argument has type 'float'}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%f" - printf("%zd", 0.f); // expected-warning-re{{format specifies type 'ssize_t' (aka '{{.+}}') but the argument has type 'float'}} + printf("%zd", 0.f); // expected-warning-re{{format specifies type 'signed size_t' (aka '{{.+}}') but the argument has type 'float'}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%f" short x; #if !defined(__ANDROID__) && !defined(__Fuchsia__) - printf("%zn", &x); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'short *'}} + printf("%zn", &x); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'short *'}} #else - printf("%zn", &x); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'short *'}} + printf("%zn", &x); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'short *'}} // expected-warning@-1 {{'%n' specifier not supported on this platform}} #endif // !defined(__ANDROID__) && !defined(__Fuchsia__) // PrintfSpecifier::fixType doesn't handle %n, so a fix-it is not emitted, 
diff --git a/clang/test/Sema/format-strings-fixit-ssize_t.c b/clang/test/Sema/format-strings-fixit-ssize_t.c index 2c83db0b66362..96806517b80f2 100644 --- a/clang/test/Sema/format-strings-fixit-ssize_t.c +++ b/clang/test/Sema/format-strings-fixit-ssize_t.c @@ -11,8 +11,8 @@ int printf(char const *, ...); int scanf(const char *, ...); +typedef long ssize_t; void test(void) { - typedef signed long int ssize_t; printf("%f", (ssize_t) 42); ssize_t s; scanf("%f", &s); diff --git a/clang/test/Sema/format-strings-int-typedefs.c b/clang/test/Sema/format-strings-int-typedefs.c index 341d49c500f43..8f85e68b067df 100644 --- a/clang/test/Sema/format-strings-int-typedefs.c +++ b/clang/test/Sema/format-strings-int-typedefs.c @@ -6,8 +6,8 @@ int scanf(char const *, ...); void test(void) { printf("%jd", 42.0); // expected-warning {{format specifies type 'intmax_t' (aka 'long long')}} printf("%ju", 42.0); // expected-warning {{format specifies type 'uintmax_t' (aka 'unsigned long long')}} - printf("%zu", 42.0); // expected-warning {{format specifies type 'size_t' (aka 'unsigned long')}} - printf("%td", 42.0); // expected-warning {{format specifies type 'ptrdiff_t' (aka 'int')}} + printf("%zu", 42.0); // expected-warning {{format specifies type 'size_t' (aka '__size_t')}} + printf("%td", 42.0); // expected-warning {{format specifies type 'ptrdiff_t' (aka '__ptrdiff_t')}} printf("%lc", 42.0); // expected-warning {{format specifies type 'wint_t' (aka 'int')}} printf("%ls", 42.0); // expected-warning {{format specifies type 'wchar_t *' (aka 'int *')}} printf("%S", 42.0); // expected-warning {{format specifies type 'wchar_t *' (aka 'int *')}} @@ -15,8 +15,8 @@ void test(void) { scanf("%jd", 0); // expected-warning {{format specifies type 'intmax_t *' (aka 'long long *')}} scanf("%ju", 0); // expected-warning {{format specifies type 'uintmax_t *' (aka 'unsigned long long *')}} - scanf("%zu", 0); // expected-warning {{format specifies type 'size_t *' (aka 'unsigned long *')}} - 
scanf("%td", 0); // expected-warning {{format specifies type 'ptrdiff_t *' (aka 'int *')}} + scanf("%zu", 0); // expected-warning {{format specifies type 'size_t *' (aka '__size_t *')}} + scanf("%td", 0); // expected-warning {{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *')}} scanf("%lc", 0); // expected-warning {{format specifies type 'wchar_t *' (aka 'int *')}} scanf("%ls", 0); // expected-warning {{format specifies type 'wchar_t *' (aka 'int *')}} scanf("%S", 0); // expected-warning {{format specifies type 'wchar_t *' (aka 'int *')}} @@ -32,6 +32,6 @@ void test(void) { // The warning still fires, because it checks the underlying type. printf("%jd", (intmax_t)42); // expected-warning {{format specifies type 'intmax_t' (aka 'long long') but the argument has type 'intmax_t' (aka 'void *')}} printf("%ju", (uintmax_t)42); // expected-warning {{format specifies type 'uintmax_t' (aka 'unsigned long long') but the argument has type 'uintmax_t' (aka 'void *')}} - printf("%zu", (size_t)42); // expected-warning {{format specifies type 'size_t' (aka 'unsigned long') but the argument has type 'size_t' (aka 'void *')}} - printf("%td", (ptrdiff_t)42); // expected-warning {{format specifies type 'ptrdiff_t' (aka 'int') but the argument has type 'ptrdiff_t' (aka 'void *')}} + printf("%zu", (size_t)42); // expected-warning {{format specifies type 'size_t' (aka '__size_t') but the argument has type 'size_t' (aka 'void *')}} + printf("%td", (ptrdiff_t)42); // expected-warning {{format specifies type 'ptrdiff_t' (aka '__ptrdiff_t') but the argument has type 'ptrdiff_t' (aka 'void *')}} } diff --git a/clang/test/Sema/format-strings-scanf.c b/clang/test/Sema/format-strings-scanf.c index eb5b8ec36bf7a..0e48a760e457a 100644 --- a/clang/test/Sema/format-strings-scanf.c +++ b/clang/test/Sema/format-strings-scanf.c @@ -210,13 +210,13 @@ void test_size_types(void) { scanf("%zd", &s); // No warning. 
double d2 = 0.; - scanf("%zd", &d2); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'double *'}} + scanf("%zd", &d2); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'double *'}} ssize_t sn = 0; scanf("%zn", &sn); // No warning. double d3 = 0.; - scanf("%zn", &d3); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'double *'}} + scanf("%zn", &d3); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'double *'}} } void test_ptrdiff_t_types(void) { @@ -231,13 +231,13 @@ void test_ptrdiff_t_types(void) { scanf("%td", &p2); // No warning. double d2 = 0.; - scanf("%td", &d2); // expected-warning-re{{format specifies type 'ptrdiff_t *' (aka '{{.+}}') but the argument has type 'double *'}} + scanf("%td", &d2); // expected-warning{{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *') but the argument has type 'double *'}} ptrdiff_t p3 = 0; scanf("%tn", &p3); // No warning. 
double d3 = 0.; - scanf("%tn", &d3); // expected-warning-re{{format specifies type 'ptrdiff_t *' (aka '{{.+}}') but the argument has type 'double *'}} + scanf("%tn", &d3); // expected-warning{{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *') but the argument has type 'double *'}} } void check_conditional_literal(char *s, int *i) { diff --git a/clang/test/Sema/format-strings-size_t.c b/clang/test/Sema/format-strings-size_t.c index 5058a762183d3..08efc1fa25066 100644 --- a/clang/test/Sema/format-strings-size_t.c +++ b/clang/test/Sema/format-strings-size_t.c @@ -4,14 +4,14 @@ int printf(char const *, ...); void test(void) { // size_t - printf("%zu", (double)42); // expected-warning {{format specifies type 'size_t' (aka 'unsigned long') but the argument has type 'double'}} + printf("%zu", (double)42); // expected-warning {{format specifies type 'size_t' (aka '__size_t') but the argument has type 'double'}} // intmax_t / uintmax_t printf("%jd", (double)42); // expected-warning {{format specifies type 'intmax_t' (aka 'long') but the argument has type 'double'}} printf("%ju", (double)42); // expected-warning {{format specifies type 'uintmax_t' (aka 'unsigned long') but the argument has type 'double'}} // ptrdiff_t - printf("%td", (double)42); // expected-warning {{format specifies type 'ptrdiff_t' (aka 'long') but the argument has type 'double'}} + printf("%td", (double)42); // expected-warning {{format specifies type 'ptrdiff_t' (aka '__ptrdiff_t') but the argument has type 'double'}} } void test_writeback(void) { @@ -19,10 +19,9 @@ void test_writeback(void) { printf("%jn", (unsigned long*)0); // no-warning printf("%jn", (int*)0); // expected-warning{{format specifies type 'intmax_t *' (aka 'long *') but the argument has type 'int *'}} - printf("%zn", (long*)0); // no-warning - // FIXME: Warn about %zn with non-ssize_t argument. 
+ printf("%zn", (int*)0); // expected-warning{{format specifies type 'signed size_t *' (aka '__signed_size_t *') but the argument has type 'int *'}} - printf("%tn", (long*)0); // no-warning - printf("%tn", (unsigned long*)0); // no-warning - printf("%tn", (int*)0); // expected-warning{{format specifies type 'ptrdiff_t *' (aka 'long *') but the argument has type 'int *'}} + printf("%tn", (long*)0); // expected-warning{{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *') but the argument has type 'long *'}} + printf("%tn", (unsigned long*)0); // expected-warning{{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *') but the argument has type 'unsigned long *'}} + printf("%tn", (int*)0); // expected-warning{{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *') but the argument has type 'int *'}} } diff --git a/clang/test/Sema/matrix-type-builtins.c b/clang/test/Sema/matrix-type-builtins.c index b92f3ce6a3e8c..77e3b8a4287ed 100644 --- a/clang/test/Sema/matrix-type-builtins.c +++ b/clang/test/Sema/matrix-type-builtins.c @@ -73,13 +73,13 @@ void column_major_load(float *p1, int *p2, _Bool *p3, struct Foo *p4) { 10, // expected-error {{1st argument must be a pointer to a valid matrix element type}} 1ull << 21, // expected-error {{row dimension is outside the allowed range [1, 1048575]}} 1ull << 21, // expected-error {{column dimension is outside the allowed range [1, 1048575]}} - ""); // expected-error {{incompatible pointer to integer conversion casting 'char[1]' to type 'unsigned long'}} + ""); // expected-error {{incompatible pointer to integer conversion casting 'char[1]' to type '__size_t' (aka 'unsigned long')}} sx5x10_t a13 = __builtin_matrix_column_major_load( 10, // expected-error {{1st argument must be a pointer to a valid matrix element type}} - *p4, // expected-error {{casting 'struct Foo' to incompatible type 'unsigned long'}} + *p4, // expected-error {{casting 'struct Foo' to incompatible type '__size_t' (aka 'unsigned long')}} "", // 
expected-error {{column argument must be a constant unsigned integer expression}} - // expected-error@-1 {{incompatible pointer to integer conversion casting 'char[1]' to type 'unsigned long'}} + // expected-error@-1 {{incompatible pointer to integer conversion casting 'char[1]' to type '__size_t' (aka 'unsigned long')}} 10); } @@ -96,7 +96,7 @@ void column_major_store(sx5x10_t *m1, ix3x2_t *m2, float *p1, int *p2, struct Fo __builtin_matrix_column_major_store( "", // expected-error {{1st argument must be a matrix}} 10, // expected-error {{2nd argument must be a pointer to a valid matrix element type}} - *p3); // expected-error {{casting 'struct Foo' to incompatible type 'unsigned long'}} + *p3); // expected-error {{casting 'struct Foo' to incompatible type '__size_t' (aka 'unsigned long')}} __builtin_matrix_column_major_store( *m1, diff --git a/clang/test/Sema/ptrauth-atomic-ops.c b/clang/test/Sema/ptrauth-atomic-ops.c index ccb9a1abcc14d..8872090d83b8d 100644 --- a/clang/test/Sema/ptrauth-atomic-ops.c +++ b/clang/test/Sema/ptrauth-atomic-ops.c @@ -54,7 +54,7 @@ void f() { __c11_atomic_exchange(ATOMIZE(j), ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'typeof (j)' (aka 'int')}} __c11_atomic_fetch_add(ATOMIZE(non_addr_discriminatedauthenticated_ptr), ATOMIZE(j), memory_order_seq_cst); - // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile _Atomic(typeof (j)) *' to parameter of type 'long'}} + // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile _Atomic(typeof (j)) *' to parameter of type '__ptrdiff_t'}} __c11_atomic_fetch_and(ATOMIZE(j), ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to 
parameter of type 'typeof (j)' (aka 'int')}} diff --git a/clang/test/Sema/ptrauth.c b/clang/test/Sema/ptrauth.c index e3932615c2962..b4e5214a7cb50 100644 --- a/clang/test/Sema/ptrauth.c +++ b/clang/test/Sema/ptrauth.c @@ -57,7 +57,7 @@ void test_string_discriminator(const char *str) { __builtin_ptrauth_string_discriminator(str); // expected-error {{argument must be a string literal}} __builtin_ptrauth_string_discriminator(L"wide test"); // expected-error {{argument must be a string literal}} expected-warning {{incompatible pointer types passing 'int[10]' to parameter of type 'const char *'}} - void *mismatch = __builtin_ptrauth_string_discriminator("test string"); // expected-error {{incompatible integer to pointer conversion initializing 'void *' with an expression of type 'unsigned long'}} + void *mismatch = __builtin_ptrauth_string_discriminator("test string"); // expected-error {{incompatible integer to pointer conversion initializing 'void *' with an expression of type '__size_t'}} } diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp index 6f4003f525930..c6919447798da 100644 --- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp +++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp @@ -372,7 +372,7 @@ void test__builtin_trivially_relocate() { __builtin_trivially_relocate((S*)0, 0, 0); //expected-error {{argument to '__builtin_trivially_relocate' must be relocatable}} __builtin_trivially_relocate((int*)0, 0, 0); //expected-error {{first and second arguments to '__builtin_trivially_relocate' must be of the same type}} - __builtin_trivially_relocate((int*)0, (int*)0, (int*)0); // expected-error-re {{cannot initialize a value of type '{{.*}}' with an rvalue of type 'int *'}} + __builtin_trivially_relocate((int*)0, (int*)0, (int*)0); // expected-error-re {{cannot initialize a value of type '__size_t' (aka '{{.*}}') with an rvalue of type 'int *'}} __builtin_trivially_relocate((int*)0, (int*)0, 
0); __builtin_trivially_relocate((R*)0, (R*)0, 0); } diff --git a/clang/test/SemaCXX/enum-scoped.cpp b/clang/test/SemaCXX/enum-scoped.cpp index 0ce47274979d9..2d7b3c9557ebd 100644 --- a/clang/test/SemaCXX/enum-scoped.cpp +++ b/clang/test/SemaCXX/enum-scoped.cpp @@ -35,7 +35,7 @@ int a1[Val2]; int a2[E1::Val1]; #if __cplusplus >= 201703L -// expected-error@-3 {{type 'E1' is not implicitly convertible to 'unsigned long'}} +// expected-error@-3 {{type 'E1' is not implicitly convertible to '__size_t' (aka 'unsigned long')}} #else // expected-error@-5 {{size of array has non-integer type}} #endif @@ -44,7 +44,7 @@ int* p1 = new int[Val2]; int* p2 = new int[E1::Val1]; #if __cplusplus >= 201703L -// expected-error@-3 {{converting 'E1' to incompatible type 'unsigned long'}} +// expected-error@-3 {{converting 'E1' to incompatible type '__size_t'}} #else // expected-error@-5 {{array size expression must have integral or unscoped enumeration type, not 'E1'}} #endif diff --git a/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp b/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp index 0b76fdd92dabd..91c4ffda9d818 100644 --- a/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp +++ b/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp @@ -22,7 +22,7 @@ void test_non_last_argument(int i, int j, ...) { va_list ap; __va_start(&ap, &i, 4); // expected-error@-1{{passing 'int *' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int *' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} + // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} } void test_stack_allocated(int i, ...) { @@ -30,13 +30,13 @@ void test_stack_allocated(int i, ...) 
{ int j; __va_start(&ap, &j, 4); // expected-error@-1{{passing 'int *' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int *' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} + // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} } void test_non_pointer_addressof(int i, ...) { va_list ap; __va_start(&ap, 1, 4); // expected-error@-1{{passing 'int' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} + // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} } diff --git a/clang/test/SemaCXX/new-delete.cpp b/clang/test/SemaCXX/new-delete.cpp index f918501554f80..c05130bb30729 100644 --- a/clang/test/SemaCXX/new-delete.cpp +++ b/clang/test/SemaCXX/new-delete.cpp @@ -109,7 +109,7 @@ void bad_news(int *ip) #elif __cplusplus <= 201103L // expected-error@-4 {{array size expression must have integral or unscoped enumeration type, not 'double'}} #else - // expected-warning@-6 {{implicit conversion from 'double' to 'unsigned int' changes value from 1.1 to 1}} + // expected-warning@-6 {{implicit conversion from 'double' to '__size_t' (aka 'unsigned int') changes value from 1.1 to 1}} #endif (void)new int[1][i]; // expected-note {{read of non-const variable 'i' is not allowed in a constant expression}} diff --git a/clang/test/SemaCXX/static-assert-cxx26.cpp b/clang/test/SemaCXX/static-assert-cxx26.cpp index b53c67ee67932..b2ebd2abb785e 100644 --- 
a/clang/test/SemaCXX/static-assert-cxx26.cpp +++ b/clang/test/SemaCXX/static-assert-cxx26.cpp @@ -19,7 +19,7 @@ struct InvalidSize { const char* data() const; }; static_assert(true, InvalidSize{}); // expected-error {{the message in a static assertion must have a 'size()' member function returning an object convertible to 'std::size_t'}} \ - // expected-error {{value of type 'const char *' is not implicitly convertible to 'unsigned long'}} + // expected-error {{value of type 'const char *' is not implicitly convertible to '__size_t' (aka 'unsigned long')}} struct InvalidData { unsigned long size() const; unsigned long data() const; @@ -371,13 +371,13 @@ struct E { static_assert(true, A{}); // expected-error {{the message in this static assertion is not a constant expression}} // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, B{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} +static_assert(true, B{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, C{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} +static_assert(true, C{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, D{}); // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be 
narrowed to type 'unsigned long'}} +static_assert(true, D{}); // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} static_assert(true, E{}); // expected-error {{the message in this static assertion is not a constant expression}} @@ -391,21 +391,21 @@ static_assert( static_assert( false, // expected-error {{static assertion failed}} - B{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} + B{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); static_assert( false, // expected-error {{static assertion failed}} - C{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} + C{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); static_assert( false, // expected-error {{static assertion failed}} - D{} // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}} + D{} // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // 
expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); diff --git a/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp b/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp index 87dc58861ee81..281ef5fa63d6f 100644 --- a/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp +++ b/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp @@ -75,7 +75,7 @@ template void *operator new(std::type_identity, U); template void operator delete(std::type_identity, U, size_t, std::align_val_t); // expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 2nd parameter; use 'void *' instead}} template void operator delete(std::type_identity, void *, U, std::align_val_t); -// expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 3rd parameter; use 'unsigned long' instead}} +// expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 3rd parameter; use '__size_t' (aka 'unsigned long') instead}} template void operator delete(std::type_identity, void *, size_t, U); // expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 4th parameter; use 'std::align_val_t' instead}} template void *operator new(std::type_identity, typename S::size_ty, std::align_val_t); diff --git a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp index 45fdec606ad1b..56c564f170271 100644 --- a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp +++ b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp @@ -65,12 +65,12 @@ void testOveraligned() { #ifdef NO_ERRORS // expected-no-diagnostics #else -// expected-error-re@-16 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} 
available on}} +// expected-error-re@-16 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-17 {{if you supply your own aligned allocation functions}} // expected-error-re@-18 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-19 {{if you supply your own aligned allocation functions}} -// expected-error-re@-20 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-20 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-21 {{if you supply your own aligned allocation functions}} // expected-error-re@-22 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-23 {{if you supply your own aligned allocation functions}} @@ -83,12 +83,12 @@ void testOveraligned() { // expected-error-re@-28 {{aligned deallocation function of type 'void (void *, std::align_val_t, const std::nothrow_t &) noexcept' is {{only|not}} available on}} // expected-note@-29 {{if you supply your own aligned allocation functions}} -// expected-error-re@-29 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-29 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-30 {{if you supply your own aligned allocation functions}} // expected-error-re@-31 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-32 {{if you supply your own aligned allocation functions}} -// expected-error-re@-33 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' 
is {{only|not}} available on}} +// expected-error-re@-33 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-34 {{if you supply your own aligned allocation functions}} // expected-error-re@-35 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-36 {{if you supply your own aligned allocation functions}} @@ -111,19 +111,19 @@ void testOveralignedCheckOS() { // expected-no-diagnostics #else #if defined(IOS) -// expected-error@-7 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on iOS 11 or newer}} +// expected-error@-7 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on iOS 11 or newer}} // expected-error@-8 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on iOS 11 or newer}}} #elif defined(TVOS) -// expected-error@-10 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on tvOS 11 or newer}}} +// expected-error@-10 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on tvOS 11 or newer}}} // expected-error@-11 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on tvOS 11 or newer}}} #elif defined(WATCHOS) -// expected-error@-13 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on watchOS 4 or newer}}} +// expected-error@-13 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on watchOS 4 or newer}}} // expected-error@-14 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on watchOS 4 or newer}}} #elif defined(MACOS) -// expected-error@-16 {{aligned 
allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on macOS 10.13 or newer}}} +// expected-error@-16 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on macOS 10.13 or newer}}} // expected-error@-17 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on macOS 10.13 or newer}}} #elif defined(ZOS) -// expected-error@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is not available on z/OS}}} +// expected-error@-19 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is not available on z/OS}}} // expected-error@-20 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is not available on z/OS}}} #endif @@ -181,19 +181,19 @@ void testExplicitOperatorNewDeleteOveraligned() { #ifdef NO_ERRORS // expected-no-diagnostics #else -// expected-error-re@-11 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-11 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-12 {{if you supply your own aligned allocation functions}} // expected-error-re@-13 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-14 {{if you supply your own aligned allocation functions}} -// expected-error-re@-15 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-15 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-16 {{if you supply your own aligned allocation functions}} // expected-error-re@-17 {{aligned deallocation function of type 'void (void *, enum 
std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-18 {{if you supply your own aligned allocation functions}} -// expected-error-re@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-19 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-20 {{if you supply your own aligned allocation functions}} // expected-error-re@-21 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} diff --git a/clang/test/SemaHLSL/Language/AssignArray.hlsl b/clang/test/SemaHLSL/Language/AssignArray.hlsl index 1f813e7a350b1..16b60fe40f806 100644 --- a/clang/test/SemaHLSL/Language/AssignArray.hlsl +++ b/clang/test/SemaHLSL/Language/AssignArray.hlsl @@ -13,7 +13,7 @@ export void fn(int8 A) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' // CHECK-NEXT: OpaqueValueExpr {{.*}} 'int8':'vector[2]' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'int8':'vector[2]' lvalue Var {{.*}} 'a' 'int8':'vector[2]' -// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} '__size_t':'unsigned long' int8 b = a; // CHECK-LABEL: VarDecl {{.*}} c 'int8':'vector[2]' cinit @@ -25,7 +25,7 @@ export void fn(int8 A) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' // CHECK-NEXT: OpaqueValueExpr {{.*}} 'vector[2]' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'vector[2]' lvalue ParmVar {{.*}} 'A' 'vector[2]' -// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} '__size_t':'unsigned long' int8 c = A; } diff --git a/clang/test/SemaHLSL/Language/InitListAST.hlsl b/clang/test/SemaHLSL/Language/InitListAST.hlsl index 78bf269769ae6..460ec38bb44af 100644 --- a/clang/test/SemaHLSL/Language/InitListAST.hlsl +++ b/clang/test/SemaHLSL/Language/InitListAST.hlsl @@ -97,12 +97,12 @@ TwoFloats 
case3(int Val) { // CHECK-NEXT: ImplicitCastExpr {{.*}}'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 TwoFloats case4(int2 TwoVals) { TwoFloats TF4 = {TwoVals}; return TF4; @@ -115,11 +115,11 @@ TwoFloats case4(int2 TwoVals) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 TwoInts case5(int2 TwoVals) { TwoInts TI1 = {TwoVals}; return TI1; @@ -209,22 +209,22 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: 
IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} @@ -240,32 +240,32 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: 
ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue 
vectorcomponent @@ -273,32 +273,32 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 
'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 AnimalBits case8(Doggo D1) { AnimalBits A1 = {D1}; return A1; @@ -317,22 +317,22 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral 
{{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -347,32 +347,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: 
IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -380,32 +380,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: 
ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Doggo' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -413,25 +413,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 
'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -446,43 +446,43 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned 
long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 
'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh[4]' // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' @@ -490,22 +490,22 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // 
CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -520,32 +520,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned 
long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -553,32 +553,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned 
long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -586,25 +586,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // 
CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -619,65 +619,65 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: 
IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr 
{{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 
'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -692,32 +692,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// 
CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -725,32 +725,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: 
IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -758,25 +758,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // 
CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -791,43 +791,43 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar 
{{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue 
.RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 Zoo case9(Doggo D1, AnimalBits A1) { Zoo Z1 = {D1, A1, D1, A1, D1, A1}; return Z1; @@ -867,28 +867,28 @@ FourFloats case10(TwoFloats TF1, TwoFloats TF2) { // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 
'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 FourFloats case11(float F) { FourFloats FF1 = {F.xxxx}; return FF1; @@ -1008,52 +1008,52 @@ FourFloats case16() { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' 
// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: 
ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 float case17() { IntAndFloat Structs[] = {1,2,3,4}; float Floats[] = {Structs, Structs}; diff --git a/clang/test/SemaObjC/format-size-spec-nsinteger.m b/clang/test/SemaObjC/format-size-spec-nsinteger.m index 8ecca6ec6a544..f25ce27f345db 100644 --- a/clang/test/SemaObjC/format-size-spec-nsinteger.m +++ b/clang/test/SemaObjC/format-size-spec-nsinteger.m @@ -3,10 +3,6 @@ // RUN: %clang_cc1 -triple thumbv7k-apple-watchos2.0.0 -fsyntax-only -fblocks -verify %s // RUN: %clang_cc1 -triple thumbv7k-apple-watchos2.0.0 -fsyntax-only -fblocks -verify -Wformat-pedantic -DPEDANTIC %s -#if !defined(PEDANTIC) -// expected-no-diagnostics -#endif - #if __LP64__ typedef unsigned long NSUInteger; typedef long NSInteger; @@ -30,12 +26,10 @@ void testSizeSpecifier(void) { NSInteger i = 0; NSUInteger j = 0; NSLog(@"max NSInteger = %zi", i); - NSLog(@"max NSUinteger = %zu", j); - #if defined(PEDANTIC) - // expected-warning@-4 {{values of type 'NSInteger' should not be used as format arguments; add an explicit cast to 'long' instead}} - // expected-warning@-4 {{values of type 'NSUInteger' should not be used as format arguments; add an explicit cast to 'unsigned long' instead}} + // expected-warning@-2 {{values of type 'NSInteger' should not be used as format arguments; add an explicit cast to 'long' instead}} #endif + NSLog(@"max NSUinteger = %zu", j); // expected-warning {{values of type 'NSUInteger' should not be used as format arguments; add an explicit cast to 'unsigned long' instead}} } void testPtrdiffSpecifier(ptrdiff_t x) { @@ -43,10 +37,9 @@ void testPtrdiffSpecifier(ptrdiff_t x) { NSUInteger j = 0; NSLog(@"ptrdiff_t NSUinteger: %tu", j); - NSLog(@"ptrdiff_t NSInteger: %td", i); - NSLog(@"ptrdiff_t %tu, %td", x, x); #if 
__is_target_os(watchos) && defined(PEDANTIC) - // expected-warning@-4 {{values of type 'NSUInteger' should not be used as format arguments; add an explicit cast to 'unsigned long' instead}} - // expected-warning@-4 {{values of type 'NSInteger' should not be used as format arguments; add an explicit cast to 'long' instead}} + // expected-warning@-2 {{values of type 'NSUInteger' should not be used as format arguments; add an explicit cast to 'unsigned long' instead}} #endif + NSLog(@"ptrdiff_t NSInteger: %td", i); // expected-warning {{values of type 'NSInteger' should not be used as format arguments; add an explicit cast to 'long' instead}} + NSLog(@"ptrdiff_t %tu, %td", x, x); // no-warning } diff --git a/clang/test/SemaObjC/matrix-type-builtins.m b/clang/test/SemaObjC/matrix-type-builtins.m index 21b8bf864271d..3916017cf0fe0 100644 --- a/clang/test/SemaObjC/matrix-type-builtins.m +++ b/clang/test/SemaObjC/matrix-type-builtins.m @@ -27,5 +27,5 @@ void test_element_type_mismatch(u4x4 m, MatrixValue *mv) { __builtin_matrix_column_major_store(mv.value, mv.value, mv.value); // expected-error@-1 {{2nd argument must be a pointer to a valid matrix element type}} - // expected-error@-2 {{casting 'double4x4' (aka 'double __attribute__((matrix_type(4, 4)))') to incompatible type 'unsigned long}} + // expected-error@-2 {{casting 'double4x4' (aka 'double __attribute__((matrix_type(4, 4)))') to incompatible type '__size_t' (aka 'unsigned long')}} } diff --git a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl index a44d9dd86b86a..22569fa7b443c 100644 --- a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl @@ -87,7 +87,7 @@ kernel void enqueue_kernel_tests(void) { }, 1024, 4294967296L); #ifdef B32 -// expected-warning@-2{{implicit conversion from 'long' to 'unsigned int' changes value from 4294967296 to 0}} +// expected-warning@-2{{implicit conversion from 'long' to 
'__size_t' (aka 'unsigned int') changes value from 4294967296 to 0}} #endif char c; @@ -97,7 +97,7 @@ kernel void enqueue_kernel_tests(void) { }, c, 1024L); #ifdef WCONV -// expected-warning-re@-2{{implicit conversion changes signedness: 'char' to 'unsigned {{int|long}}'}} +// expected-warning-re@-2{{implicit conversion changes signedness: 'char' to '__size_t' (aka 'unsigned {{int|long}}')}} #endif #define UINT_MAX 4294967295 @@ -107,7 +107,7 @@ kernel void enqueue_kernel_tests(void) { }, sizeof(int), sizeof(int) * UINT_MAX); #ifdef B32 -// expected-warning@-2{{implicit conversion from 'long' to 'unsigned int' changes value from 17179869180 to 4294967292}} +// expected-warning@-2{{implicit conversion from 'long' to '__size_t' (aka 'unsigned int') changes value from 17179869180 to 4294967292}} #endif typedef void (^bl_A_t)(local void *); diff --git a/clang/test/SemaTemplate/type_pack_element.cpp b/clang/test/SemaTemplate/type_pack_element.cpp index 264b4dcdc044d..5ff010c7db29c 100644 --- a/clang/test/SemaTemplate/type_pack_element.cpp +++ b/clang/test/SemaTemplate/type_pack_element.cpp @@ -7,9 +7,9 @@ using test1 = __type_pack_element<0, int>; // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr '0' -// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | |-value: Int 0 -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int' 0 // CHECK-NEXT: |-TemplateArgument type 'int' // CHECK-NEXT: | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' @@ -23,7 +23,7 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: 
|-TemplateArgument expr 'N' -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int' // CHECK-NEXT: `-TemplateArgument type 'Ts...' // CHECK-NEXT: `-PackExpansionType 0x{{[0-9A-Fa-f]+}} 'Ts...' dependent @@ -37,9 +37,9 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr '0' -// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | |-value: Int 0 -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int' 0 // CHECK-NEXT: `-TemplateArgument type 'Ts...' // CHECK-NEXT: `-PackExpansionType 0x{{[0-9A-Fa-f]+}} 'Ts...' 
dependent @@ -53,7 +53,7 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr 'N' -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int' // CHECK-NEXT: `-TemplateArgument type 'int' // CHECK-NEXT: `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 75afa87947be4..9412d9735ef82 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -1672,6 +1672,10 @@ bool CursorVisitor::VisitTypedefTypeLoc(TypedefTypeLoc TL) { return Visit(MakeCursorTypeRef(TL.getTypedefNameDecl(), TL.getNameLoc(), TU)); } +bool CursorVisitor::VisitPredefinedSugarTypeLoc(PredefinedSugarTypeLoc TL) { + return false; +} + bool CursorVisitor::VisitUnresolvedUsingTypeLoc(UnresolvedUsingTypeLoc TL) { return Visit(MakeCursorTypeRef(TL.getDecl(), TL.getNameLoc(), TU)); } diff --git a/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp b/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp index 4fa4982de88fa..c5f152a26a766 100644 --- a/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp @@ -38,10 +38,10 @@ static void print(const Deque& d) { " : __back_spare() == %zu" " : __capacity() == %zu" " : bytes allocated == %zu\n", - d.size(), - d.__front_spare(), - d.__back_spare(), - d.__capacity(), + std::size_t(d.size()), + std::size_t(d.__front_spare()), + std::size_t(d.__back_spare()), + std::size_t(d.__capacity()), malloc_allocator_base::outstanding_bytes); } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp 
b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index e847ede1a4ba6..3226e0accc5ea 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -2555,6 +2555,7 @@ RemoveWrappingTypes(QualType type, ArrayRef mask = {}) { case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: + case clang::Type::PredefinedSugar: type = type->getLocallyUnqualifiedSingleStepDesugaredType(); break; default: @@ -4130,6 +4131,7 @@ TypeSystemClang::GetTypeClass(lldb::opaque_compiler_type_t type) { case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: + case clang::Type::PredefinedSugar: llvm_unreachable("Handled in RemoveWrappingTypes!"); case clang::Type::UnaryTransform: break; @@ -4840,6 +4842,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: + case clang::Type::PredefinedSugar: llvm_unreachable("Handled in RemoveWrappingTypes!"); case clang::Type::UnaryTransform: @@ -5141,6 +5144,7 @@ lldb::Format TypeSystemClang::GetFormat(lldb::opaque_compiler_type_t type) { case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: + case clang::Type::PredefinedSugar: llvm_unreachable("Handled in RemoveWrappingTypes!"); case clang::Type::UnaryTransform: break; From 52a9c493e62c8af8f1299cae0babb92f2693065b Mon Sep 17 00:00:00 2001 From: Liao Chunyu Date: Fri, 18 Jul 2025 10:56:07 +0800 Subject: [PATCH 280/813] Reland "[RISCV] AddEdge between mask producer and user of V0 (#146855)" (#148566) The defmask vector cannot contain instructions that use V0. for `MultiSource/Benchmarks/DOE-ProxyApps-C++/CLAMR/mesh.cpp` Save `%173:vrm2nov0 = PseudoVMERGE_VVM_M2 undef %173:vrm2nov0(tied-def 0), %116:vrm2, %173:vrm2nov0, killed $v0, -1, 5 `to def mask caused crash. 
--- .../RISCV/RISCVVectorMaskDAGMutation.cpp | 22 ++- .../RISCV/rvv/combine-reduce-add-to-vcpop.ll | 18 +- llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll | 38 ++--- .../RISCV/rvv/fixed-vectors-extract-i1.ll | 108 ++++++------ .../rvv/fixed-vectors-shuffle-deinterleave.ll | 26 ++- .../CodeGen/RISCV/rvv/reproducer-pr146855.ll | 72 ++++++++ .../RISCV/rvv/vscale-vw-web-simplification.ll | 156 ++++++++---------- llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll | 14 +- 8 files changed, 264 insertions(+), 190 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp index c1f4d19824e86..3bd2705f021a6 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp @@ -10,6 +10,10 @@ // instructions and masked instructions, so that we can reduce the live range // overlaps of mask registers. // +// If there are multiple masks producers followed by multiple masked +// instructions, then at each masked instructions add dependency edges between +// every producer and masked instruction. +// // The reason why we need to do this: // 1. When tracking register pressure, we don't track physical registers. // 2. We have a RegisterClass for mask register (which is `VMV0`), but we don't @@ -67,11 +71,27 @@ class RISCVVectorMaskDAGMutation : public ScheduleDAGMutation { void apply(ScheduleDAGInstrs *DAG) override { SUnit *NearestUseV0SU = nullptr; + SmallVector DefMask; for (SUnit &SU : DAG->SUnits) { const MachineInstr *MI = SU.getInstr(); - if (MI->findRegisterUseOperand(RISCV::V0, TRI)) + bool UseV0 = MI->findRegisterUseOperand(RISCV::V0, TRI); + if (isSoleUseCopyToV0(SU) && !UseV0) + DefMask.push_back(&SU); + + if (UseV0) { NearestUseV0SU = &SU; + // Copy may not be a real use, so skip it here. 
+ if (DefMask.size() > 1 && !MI->isCopy()) { + for (SUnit *Def : DefMask) + if (DAG->canAddEdge(Def, &SU)) + DAG->addEdge(Def, SDep(&SU, SDep::Artificial)); + } + + if (!DefMask.empty()) + DefMask.erase(DefMask.begin()); + } + if (NearestUseV0SU && NearestUseV0SU != &SU && isSoleUseCopyToV0(SU) && // For LMUL=8 cases, there will be more possibilities to spill. // FIXME: We should use RegPressureTracker to do fine-grained diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll index 0d8aff306252e..2d4fce68f9545 100644 --- a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll @@ -313,12 +313,12 @@ define i32 @test_nxv128i1( %x) { ; CHECK-NEXT: vslidedown.vx v0, v6, a0 ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v7, a1 +; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a0 ; CHECK-NEXT: vslidedown.vx v5, v6, a0 -; CHECK-NEXT: vslidedown.vx v4, v7, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v4 ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmv1r.v v0, v5 ; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t @@ -425,13 +425,15 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v5, a1 -; CHECK-NEXT: vslidedown.vx v5, v7, a1 -; CHECK-NEXT: vslidedown.vx v4, v6, a1 -; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma ; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v4 +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v6, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t -; 
CHECK-NEXT: vmv1r.v v0, v5 +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu ; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: addi a2, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll index 796f8dde58f47..15417da962bd3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/extractelt-i1.ll @@ -139,21 +139,20 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind { ; RV32-NEXT: slli a3, a3, 4 ; RV32-NEXT: sub sp, sp, a3 ; RV32-NEXT: andi sp, sp, -64 -; RV32-NEXT: addi a3, sp, 64 ; RV32-NEXT: vl8r.v v8, (a0) ; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a0, a0, a2 -; RV32-NEXT: vl8r.v v24, (a0) +; RV32-NEXT: vl8r.v v16, (a0) ; RV32-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; RV32-NEXT: vmseq.vi v0, v8, 0 -; RV32-NEXT: vmv.v.i v16, 0 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a2, a3, a2 -; RV32-NEXT: vmseq.vi v8, v24, 0 -; RV32-NEXT: vmerge.vim v24, v16, 1, v0 -; RV32-NEXT: vs8r.v v24, (a3) -; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vmerge.vim v8, v16, 1, v0 +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vim v24, v8, 1, v0 +; RV32-NEXT: vmseq.vi v0, v16, 0 +; RV32-NEXT: addi a0, sp, 64 +; RV32-NEXT: add a1, a0, a1 +; RV32-NEXT: add a2, a0, a2 +; RV32-NEXT: vs8r.v v24, (a0) +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 ; RV32-NEXT: vs8r.v v8, (a2) ; RV32-NEXT: lbu a0, 0(a1) ; RV32-NEXT: addi sp, s0, -80 @@ -179,21 +178,20 @@ define i1 @extractelt_nxv128i1(ptr %x, i64 %idx) nounwind { ; RV64-NEXT: slli a3, a3, 4 ; RV64-NEXT: sub sp, sp, a3 ; RV64-NEXT: andi sp, sp, -64 -; RV64-NEXT: addi a3, sp, 64 ; RV64-NEXT: vl8r.v v8, (a0) ; RV64-NEXT: slli a2, a2, 3 ; RV64-NEXT: add a0, a0, a2 -; RV64-NEXT: vl8r.v v24, (a0) +; RV64-NEXT: vl8r.v v16, (a0) ; RV64-NEXT: vsetvli a0, zero, e8, m8, ta, ma ; RV64-NEXT: vmseq.vi v0, v8, 0 -; RV64-NEXT: vmv.v.i v16, 
0 -; RV64-NEXT: add a1, a3, a1 -; RV64-NEXT: add a2, a3, a2 -; RV64-NEXT: vmseq.vi v8, v24, 0 -; RV64-NEXT: vmerge.vim v24, v16, 1, v0 -; RV64-NEXT: vs8r.v v24, (a3) -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vmerge.vim v8, v16, 1, v0 +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmerge.vim v24, v8, 1, v0 +; RV64-NEXT: vmseq.vi v0, v16, 0 +; RV64-NEXT: addi a0, sp, 64 +; RV64-NEXT: add a1, a0, a1 +; RV64-NEXT: add a2, a0, a2 +; RV64-NEXT: vs8r.v v24, (a0) +; RV64-NEXT: vmerge.vim v8, v8, 1, v0 ; RV64-NEXT: vs8r.v v8, (a2) ; RV64-NEXT: lbu a0, 0(a1) ; RV64-NEXT: addi sp, s0, -80 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll index 2587411566a3f..fb070b24a4f34 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-extract-i1.ll @@ -324,24 +324,23 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill ; RV32-NEXT: addi s0, sp, 384 ; RV32-NEXT: andi sp, sp, -128 -; RV32-NEXT: zext.b a1, a1 -; RV32-NEXT: mv a2, sp -; RV32-NEXT: li a3, 128 -; RV32-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV32-NEXT: li a2, 128 +; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; RV32-NEXT: vle8.v v8, (a0) ; RV32-NEXT: addi a0, a0, 128 ; RV32-NEXT: vle8.v v16, (a0) -; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: vmseq.vi v0, v8, 0 -; RV32-NEXT: vmv.v.i v24, 0 -; RV32-NEXT: vmseq.vi v8, v16, 0 -; RV32-NEXT: vmerge.vim v16, v24, 1, v0 -; RV32-NEXT: vse8.v v16, (a2) -; RV32-NEXT: vmv1r.v v0, v8 -; RV32-NEXT: vmerge.vim v8, v24, 1, v0 -; RV32-NEXT: addi a0, sp, 128 -; RV32-NEXT: vse8.v v8, (a0) -; RV32-NEXT: lbu a0, 0(a1) +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vmerge.vim v24, v8, 1, v0 +; RV32-NEXT: vmseq.vi v0, v16, 0 +; RV32-NEXT: zext.b a0, a1 +; RV32-NEXT: mv a1, sp +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: vse8.v v24, (a1) +; RV32-NEXT: vmerge.vim v8, v8, 1, v0 +; RV32-NEXT: addi a1, sp, 128 +; 
RV32-NEXT: vse8.v v8, (a1) +; RV32-NEXT: lbu a0, 0(a0) ; RV32-NEXT: addi sp, s0, -384 ; RV32-NEXT: lw ra, 380(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 376(sp) # 4-byte Folded Reload @@ -355,24 +354,23 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV64-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; RV64-NEXT: addi s0, sp, 384 ; RV64-NEXT: andi sp, sp, -128 -; RV64-NEXT: zext.b a1, a1 -; RV64-NEXT: mv a2, sp -; RV64-NEXT: li a3, 128 -; RV64-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV64-NEXT: li a2, 128 +; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; RV64-NEXT: vle8.v v8, (a0) ; RV64-NEXT: addi a0, a0, 128 ; RV64-NEXT: vle8.v v16, (a0) -; RV64-NEXT: add a1, a2, a1 ; RV64-NEXT: vmseq.vi v0, v8, 0 -; RV64-NEXT: vmv.v.i v24, 0 -; RV64-NEXT: vmseq.vi v8, v16, 0 -; RV64-NEXT: vmerge.vim v16, v24, 1, v0 -; RV64-NEXT: vse8.v v16, (a2) -; RV64-NEXT: vmv1r.v v0, v8 -; RV64-NEXT: vmerge.vim v8, v24, 1, v0 -; RV64-NEXT: addi a0, sp, 128 -; RV64-NEXT: vse8.v v8, (a0) -; RV64-NEXT: lbu a0, 0(a1) +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vmerge.vim v24, v8, 1, v0 +; RV64-NEXT: vmseq.vi v0, v16, 0 +; RV64-NEXT: zext.b a0, a1 +; RV64-NEXT: mv a1, sp +; RV64-NEXT: add a0, a1, a0 +; RV64-NEXT: vse8.v v24, (a1) +; RV64-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64-NEXT: addi a1, sp, 128 +; RV64-NEXT: vse8.v v8, (a1) +; RV64-NEXT: lbu a0, 0(a0) ; RV64-NEXT: addi sp, s0, -384 ; RV64-NEXT: ld ra, 376(sp) # 8-byte Folded Reload ; RV64-NEXT: ld s0, 368(sp) # 8-byte Folded Reload @@ -386,24 +384,23 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV32ZBS-NEXT: sw s0, 376(sp) # 4-byte Folded Spill ; RV32ZBS-NEXT: addi s0, sp, 384 ; RV32ZBS-NEXT: andi sp, sp, -128 -; RV32ZBS-NEXT: zext.b a1, a1 -; RV32ZBS-NEXT: mv a2, sp -; RV32ZBS-NEXT: li a3, 128 -; RV32ZBS-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV32ZBS-NEXT: li a2, 128 +; RV32ZBS-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; RV32ZBS-NEXT: vle8.v v8, (a0) ; RV32ZBS-NEXT: addi a0, a0, 128 ; RV32ZBS-NEXT: vle8.v v16, 
(a0) -; RV32ZBS-NEXT: add a1, a2, a1 ; RV32ZBS-NEXT: vmseq.vi v0, v8, 0 -; RV32ZBS-NEXT: vmv.v.i v24, 0 -; RV32ZBS-NEXT: vmseq.vi v8, v16, 0 -; RV32ZBS-NEXT: vmerge.vim v16, v24, 1, v0 -; RV32ZBS-NEXT: vse8.v v16, (a2) -; RV32ZBS-NEXT: vmv1r.v v0, v8 -; RV32ZBS-NEXT: vmerge.vim v8, v24, 1, v0 -; RV32ZBS-NEXT: addi a0, sp, 128 -; RV32ZBS-NEXT: vse8.v v8, (a0) -; RV32ZBS-NEXT: lbu a0, 0(a1) +; RV32ZBS-NEXT: vmv.v.i v8, 0 +; RV32ZBS-NEXT: vmerge.vim v24, v8, 1, v0 +; RV32ZBS-NEXT: vmseq.vi v0, v16, 0 +; RV32ZBS-NEXT: zext.b a0, a1 +; RV32ZBS-NEXT: mv a1, sp +; RV32ZBS-NEXT: add a0, a1, a0 +; RV32ZBS-NEXT: vse8.v v24, (a1) +; RV32ZBS-NEXT: vmerge.vim v8, v8, 1, v0 +; RV32ZBS-NEXT: addi a1, sp, 128 +; RV32ZBS-NEXT: vse8.v v8, (a1) +; RV32ZBS-NEXT: lbu a0, 0(a0) ; RV32ZBS-NEXT: addi sp, s0, -384 ; RV32ZBS-NEXT: lw ra, 380(sp) # 4-byte Folded Reload ; RV32ZBS-NEXT: lw s0, 376(sp) # 4-byte Folded Reload @@ -417,24 +414,23 @@ define i1 @extractelt_v256i1(ptr %x, i64 %idx) nounwind { ; RV64ZBS-NEXT: sd s0, 368(sp) # 8-byte Folded Spill ; RV64ZBS-NEXT: addi s0, sp, 384 ; RV64ZBS-NEXT: andi sp, sp, -128 -; RV64ZBS-NEXT: zext.b a1, a1 -; RV64ZBS-NEXT: mv a2, sp -; RV64ZBS-NEXT: li a3, 128 -; RV64ZBS-NEXT: vsetvli zero, a3, e8, m8, ta, ma +; RV64ZBS-NEXT: li a2, 128 +; RV64ZBS-NEXT: vsetvli zero, a2, e8, m8, ta, ma ; RV64ZBS-NEXT: vle8.v v8, (a0) ; RV64ZBS-NEXT: addi a0, a0, 128 ; RV64ZBS-NEXT: vle8.v v16, (a0) -; RV64ZBS-NEXT: add a1, a2, a1 ; RV64ZBS-NEXT: vmseq.vi v0, v8, 0 -; RV64ZBS-NEXT: vmv.v.i v24, 0 -; RV64ZBS-NEXT: vmseq.vi v8, v16, 0 -; RV64ZBS-NEXT: vmerge.vim v16, v24, 1, v0 -; RV64ZBS-NEXT: vse8.v v16, (a2) -; RV64ZBS-NEXT: vmv1r.v v0, v8 -; RV64ZBS-NEXT: vmerge.vim v8, v24, 1, v0 -; RV64ZBS-NEXT: addi a0, sp, 128 -; RV64ZBS-NEXT: vse8.v v8, (a0) -; RV64ZBS-NEXT: lbu a0, 0(a1) +; RV64ZBS-NEXT: vmv.v.i v8, 0 +; RV64ZBS-NEXT: vmerge.vim v24, v8, 1, v0 +; RV64ZBS-NEXT: vmseq.vi v0, v16, 0 +; RV64ZBS-NEXT: zext.b a0, a1 +; RV64ZBS-NEXT: mv a1, sp +; RV64ZBS-NEXT: add 
a0, a1, a0 +; RV64ZBS-NEXT: vse8.v v24, (a1) +; RV64ZBS-NEXT: vmerge.vim v8, v8, 1, v0 +; RV64ZBS-NEXT: addi a1, sp, 128 +; RV64ZBS-NEXT: vse8.v v8, (a1) +; RV64ZBS-NEXT: lbu a0, 0(a0) ; RV64ZBS-NEXT: addi sp, s0, -384 ; RV64ZBS-NEXT: ld ra, 376(sp) # 8-byte Folded Reload ; RV64ZBS-NEXT: ld s0, 368(sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll index c11319ff335fd..67584ba8a82cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-deinterleave.ll @@ -143,16 +143,15 @@ define void @deinterleave6_0_i8(ptr %in, ptr %out) { ; CHECK-LABEL: deinterleave6_0_i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.v.i v0, 2 -; CHECK-NEXT: vmv.v.i v8, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 8 +; CHECK-NEXT: vslidedown.vi v9, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v9, v9, 5, v0.t -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vrgather.vi v9, v10, 4, v0.t -; CHECK-NEXT: vse8.v v9, (a1) +; CHECK-NEXT: vslidedown.vi v8, v8, 5, v0.t +; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vrgather.vi v8, v9, 4, v0.t +; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -188,16 +187,15 @@ define void @deinterleave7_0_i8(ptr %in, ptr %out) { ; CHECK-LABEL: deinterleave7_0_i8: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vle8.v v9, (a0) +; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vmv.v.i v0, 2 -; CHECK-NEXT: vmv.v.i v8, 4 ; CHECK-NEXT: vsetivli zero, 8, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vi v10, v9, 8 +; CHECK-NEXT: vslidedown.vi v9, v8, 8 ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: 
vslidedown.vi v9, v9, 6, v0.t -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vrgather.vi v9, v10, 6, v0.t -; CHECK-NEXT: vse8.v v9, (a1) +; CHECK-NEXT: vslidedown.vi v8, v8, 6, v0.t +; CHECK-NEXT: vmv.v.i v0, 4 +; CHECK-NEXT: vrgather.vi v8, v9, 6, v0.t +; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll b/llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll new file mode 100644 index 0000000000000..cca00bf58063d --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/reproducer-pr146855.ll @@ -0,0 +1,72 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +define i32 @_ZN4Mesh12rezone_countESt6vectorIiSaIiEERiS3_( %wide.load, %0, %1, %2, %3) #0 { +; CHECK-LABEL: _ZN4Mesh12rezone_countESt6vectorIiSaIiEERiS3_: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vmv.v.i v14, 0 +; CHECK-NEXT: .LBB0_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: vmv2r.v v16, v10 +; CHECK-NEXT: vle32.v v16, (a0), v0.t +; CHECK-NEXT: vand.vi v16, v16, 1 +; CHECK-NEXT: vmsne.vi v9, v16, 0 +; CHECK-NEXT: vmand.mm v0, v8, v9 +; CHECK-NEXT: vmerge.vim v12, v12, -1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vor.vi v14, v14, 1, v0.t +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: j .LBB0_1 +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ 1, %vector.body ] + %vec.phi = phi [ zeroinitializer, %entry ], [ %predphi88, %vector.body 
] + %vec.phi81 = phi [ zeroinitializer, %entry ], [ %predphi93, %vector.body ] + %wide.load1 = load , ptr null, align 4 + %4 = icmp slt %wide.load, zeroinitializer + %5 = icmp sgt %wide.load, zeroinitializer + %wide.masked.load82 = tail call @llvm.masked.load.nxv4i32.p0(ptr null, i32 1, zeroinitializer, zeroinitializer) + %6 = icmp eq zeroinitializer, zeroinitializer + %7 = getelementptr i32, ptr null, i64 %index + %wide.masked.load83 = tail call @llvm.masked.load.nxv4i32.p0(ptr %7, i32 1, %0, zeroinitializer) + %8 = select %0, %0, zeroinitializer + %9 = trunc %wide.masked.load83 to + %narrow = select %0, %9, zeroinitializer + %10 = sext %narrow to + %predphi88 = or %vec.phi, %10 + %11 = zext %0 to + %predphi93 = or %vec.phi81, %11 + %index.next = add i64 0, 1 + br i1 false, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %12 = tail call i32 @llvm.vector.reduce.add.nxv4i32( %vec.phi) + %13 = tail call i32 @llvm.vector.reduce.add.nxv4i32( %vec.phi81) + ret i32 %13 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.masked.load.nxv4i32.p0(ptr captures(none), i32 immarg, , ) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i32 @llvm.vector.reduce.add.nxv4i32() #2 + +; uselistorder directives +uselistorder ptr @llvm.masked.load.nxv4i32.p0, { 1, 0 } +uselistorder ptr @llvm.vector.reduce.add.nxv4i32, { 1, 0 } + +attributes #0 = { "target-features"="+v" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll index 206838917d004..ad2ed47e67e64 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/vscale-vw-web-simplification.ll @@ -153,20 +153,19 @@ define @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu ; NO_FOLDING-NEXT: vlm.v v8, (a0) -; NO_FOLDING-NEXT: vlm.v v9, (a1) -; NO_FOLDING-NEXT: vlm.v v10, (a2) -; NO_FOLDING-NEXT: vmv.v.i v11, 0 +; NO_FOLDING-NEXT: vmv.v.i v10, 0 ; NO_FOLDING-NEXT: vmv.v.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; NO_FOLDING-NEXT: vmerge.vim v11, v10, -1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a1) +; NO_FOLDING-NEXT: vlm.v v9, (a2) +; NO_FOLDING-NEXT: vmerge.vim v12, v10, -1, v0 ; NO_FOLDING-NEXT: vmv.v.v v0, v9 -; NO_FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 -; NO_FOLDING-NEXT: vmv.v.v v0, v10 -; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 -; NO_FOLDING-NEXT: vmul.vv v9, v12, v9 -; NO_FOLDING-NEXT: vsub.vv v11, v12, v10 +; NO_FOLDING-NEXT: vmerge.vim v9, v10, -1, v0 +; NO_FOLDING-NEXT: vmul.vv v10, v11, v12 +; NO_FOLDING-NEXT: vsub.vv v11, v11, v9 ; NO_FOLDING-NEXT: vmv.v.v v0, v8 -; NO_FOLDING-NEXT: vadd.vi v10, v10, -1, v0.t -; NO_FOLDING-NEXT: vor.vv v8, v9, v10 +; NO_FOLDING-NEXT: vadd.vi v9, v9, -1, v0.t +; NO_FOLDING-NEXT: vor.vv v8, v10, v9 ; NO_FOLDING-NEXT: vor.vv v8, v8, v11 ; NO_FOLDING-NEXT: ret ; @@ -174,20 +173,19 @@ define @vwop_vscale_sext_i1i32_multiple_users(ptr %x, ptr %y, ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu ; FOLDING-NEXT: vlm.v v8, (a0) -; FOLDING-NEXT: vlm.v v9, (a1) -; FOLDING-NEXT: vlm.v v10, (a2) -; FOLDING-NEXT: vmv.v.i v11, 0 +; FOLDING-NEXT: vmv.v.i v10, 0 ; FOLDING-NEXT: vmv.v.v v0, v8 -; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; FOLDING-NEXT: vmerge.vim v11, v10, -1, v0 +; FOLDING-NEXT: vlm.v v0, (a1) +; FOLDING-NEXT: vlm.v v9, (a2) +; FOLDING-NEXT: vmerge.vim v12, v10, -1, v0 ; FOLDING-NEXT: vmv.v.v v0, v9 -; FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 -; FOLDING-NEXT: vmv.v.v v0, v10 -; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 -; 
FOLDING-NEXT: vmul.vv v9, v12, v9 -; FOLDING-NEXT: vsub.vv v11, v12, v10 +; FOLDING-NEXT: vmerge.vim v9, v10, -1, v0 +; FOLDING-NEXT: vmul.vv v10, v11, v12 +; FOLDING-NEXT: vsub.vv v11, v11, v9 ; FOLDING-NEXT: vmv.v.v v0, v8 -; FOLDING-NEXT: vadd.vi v10, v10, -1, v0.t -; FOLDING-NEXT: vor.vv v8, v9, v10 +; FOLDING-NEXT: vadd.vi v9, v9, -1, v0.t +; FOLDING-NEXT: vor.vv v8, v10, v9 ; FOLDING-NEXT: vor.vv v8, v8, v11 ; FOLDING-NEXT: ret %a = load , ptr %x @@ -209,20 +207,19 @@ define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; NO_FOLDING-NEXT: vlm.v v8, (a0) -; NO_FOLDING-NEXT: vlm.v v9, (a1) -; NO_FOLDING-NEXT: vlm.v v10, (a2) -; NO_FOLDING-NEXT: vmv.v.i v11, 0 +; NO_FOLDING-NEXT: vmv.v.i v10, 0 ; NO_FOLDING-NEXT: vmv1r.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; NO_FOLDING-NEXT: vmerge.vim v11, v10, -1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a1) +; NO_FOLDING-NEXT: vlm.v v9, (a2) +; NO_FOLDING-NEXT: vmerge.vim v12, v10, -1, v0 ; NO_FOLDING-NEXT: vmv1r.v v0, v9 -; NO_FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 -; NO_FOLDING-NEXT: vmv1r.v v0, v10 -; NO_FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 -; NO_FOLDING-NEXT: vmul.vv v9, v12, v9 -; NO_FOLDING-NEXT: vsub.vv v11, v12, v10 +; NO_FOLDING-NEXT: vmerge.vim v9, v10, -1, v0 +; NO_FOLDING-NEXT: vmul.vv v10, v11, v12 +; NO_FOLDING-NEXT: vsub.vv v11, v11, v9 ; NO_FOLDING-NEXT: vmv1r.v v0, v8 -; NO_FOLDING-NEXT: vadd.vi v10, v10, -1, v0.t -; NO_FOLDING-NEXT: vor.vv v8, v9, v10 +; NO_FOLDING-NEXT: vadd.vi v9, v9, -1, v0.t +; NO_FOLDING-NEXT: vor.vv v8, v10, v9 ; NO_FOLDING-NEXT: vor.vv v8, v8, v11 ; NO_FOLDING-NEXT: ret ; @@ -230,20 +227,19 @@ define @vwop_vscale_sext_i1i8_multiple_users(ptr %x, ptr %y, p ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; FOLDING-NEXT: vlm.v v8, (a0) -; FOLDING-NEXT: vlm.v v9, (a1) -; FOLDING-NEXT: vlm.v v10, (a2) -; FOLDING-NEXT: vmv.v.i v11, 0 +; FOLDING-NEXT: vmv.v.i v10, 0 
; FOLDING-NEXT: vmv1r.v v0, v8 -; FOLDING-NEXT: vmerge.vim v12, v11, -1, v0 +; FOLDING-NEXT: vmerge.vim v11, v10, -1, v0 +; FOLDING-NEXT: vlm.v v0, (a1) +; FOLDING-NEXT: vlm.v v9, (a2) +; FOLDING-NEXT: vmerge.vim v12, v10, -1, v0 ; FOLDING-NEXT: vmv1r.v v0, v9 -; FOLDING-NEXT: vmerge.vim v9, v11, -1, v0 -; FOLDING-NEXT: vmv1r.v v0, v10 -; FOLDING-NEXT: vmerge.vim v10, v11, -1, v0 -; FOLDING-NEXT: vmul.vv v9, v12, v9 -; FOLDING-NEXT: vsub.vv v11, v12, v10 +; FOLDING-NEXT: vmerge.vim v9, v10, -1, v0 +; FOLDING-NEXT: vmul.vv v10, v11, v12 +; FOLDING-NEXT: vsub.vv v11, v11, v9 ; FOLDING-NEXT: vmv1r.v v0, v8 -; FOLDING-NEXT: vadd.vi v10, v10, -1, v0.t -; FOLDING-NEXT: vor.vv v8, v9, v10 +; FOLDING-NEXT: vadd.vi v9, v9, -1, v0.t +; FOLDING-NEXT: vor.vv v8, v10, v9 ; FOLDING-NEXT: vor.vv v8, v8, v11 ; FOLDING-NEXT: ret %a = load , ptr %x @@ -444,16 +440,14 @@ define @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu ; NO_FOLDING-NEXT: vlm.v v0, (a0) -; NO_FOLDING-NEXT: vlm.v v8, (a2) -; NO_FOLDING-NEXT: vlm.v v9, (a1) -; NO_FOLDING-NEXT: vmv.v.i v10, 0 -; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; NO_FOLDING-NEXT: vmv.v.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; NO_FOLDING-NEXT: vadd.vv v10, v11, v8 -; NO_FOLDING-NEXT: vsub.vv v8, v11, v8 -; NO_FOLDING-NEXT: vmv.v.v v0, v9 -; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; NO_FOLDING-NEXT: vmv.v.i v8, 0 +; NO_FOLDING-NEXT: vmerge.vim v9, v8, 1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a2) +; NO_FOLDING-NEXT: vmerge.vim v8, v8, 1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a1) +; NO_FOLDING-NEXT: vadd.vv v10, v9, v8 +; NO_FOLDING-NEXT: vsub.vv v8, v9, v8 +; NO_FOLDING-NEXT: vor.vv v10, v10, v9, v0.t ; NO_FOLDING-NEXT: vor.vv v8, v10, v8 ; NO_FOLDING-NEXT: ret ; @@ -461,16 +455,14 @@ define @vwop_vscale_zext_i1i32_multiple_users(ptr %x, ptr %y, ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetvli a3, zero, e32, m1, ta, mu ; FOLDING-NEXT: vlm.v v0, (a0) 
-; FOLDING-NEXT: vlm.v v8, (a2) -; FOLDING-NEXT: vlm.v v9, (a1) -; FOLDING-NEXT: vmv.v.i v10, 0 -; FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; FOLDING-NEXT: vmv.v.v v0, v8 -; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; FOLDING-NEXT: vadd.vv v10, v11, v8 -; FOLDING-NEXT: vsub.vv v8, v11, v8 -; FOLDING-NEXT: vmv.v.v v0, v9 -; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; FOLDING-NEXT: vmv.v.i v8, 0 +; FOLDING-NEXT: vmerge.vim v9, v8, 1, v0 +; FOLDING-NEXT: vlm.v v0, (a2) +; FOLDING-NEXT: vmerge.vim v8, v8, 1, v0 +; FOLDING-NEXT: vlm.v v0, (a1) +; FOLDING-NEXT: vadd.vv v10, v9, v8 +; FOLDING-NEXT: vsub.vv v8, v9, v8 +; FOLDING-NEXT: vor.vv v10, v10, v9, v0.t ; FOLDING-NEXT: vor.vv v8, v10, v8 ; FOLDING-NEXT: ret %a = load , ptr %x @@ -492,16 +484,14 @@ define @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, p ; NO_FOLDING: # %bb.0: ; NO_FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; NO_FOLDING-NEXT: vlm.v v0, (a0) -; NO_FOLDING-NEXT: vlm.v v8, (a2) -; NO_FOLDING-NEXT: vlm.v v9, (a1) -; NO_FOLDING-NEXT: vmv.v.i v10, 0 -; NO_FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; NO_FOLDING-NEXT: vmv1r.v v0, v8 -; NO_FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; NO_FOLDING-NEXT: vadd.vv v10, v11, v8 -; NO_FOLDING-NEXT: vsub.vv v8, v11, v8 -; NO_FOLDING-NEXT: vmv1r.v v0, v9 -; NO_FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; NO_FOLDING-NEXT: vmv.v.i v8, 0 +; NO_FOLDING-NEXT: vmerge.vim v9, v8, 1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a2) +; NO_FOLDING-NEXT: vmerge.vim v8, v8, 1, v0 +; NO_FOLDING-NEXT: vlm.v v0, (a1) +; NO_FOLDING-NEXT: vadd.vv v10, v9, v8 +; NO_FOLDING-NEXT: vsub.vv v8, v9, v8 +; NO_FOLDING-NEXT: vor.vv v10, v10, v9, v0.t ; NO_FOLDING-NEXT: vor.vv v8, v10, v8 ; NO_FOLDING-NEXT: ret ; @@ -509,16 +499,14 @@ define @vwop_vscale_zext_i1i8_multiple_users(ptr %x, ptr %y, p ; FOLDING: # %bb.0: ; FOLDING-NEXT: vsetvli a3, zero, e8, mf4, ta, mu ; FOLDING-NEXT: vlm.v v0, (a0) -; FOLDING-NEXT: vlm.v v8, (a2) -; FOLDING-NEXT: vlm.v v9, (a1) -; FOLDING-NEXT: vmv.v.i v10, 0 -; 
FOLDING-NEXT: vmerge.vim v11, v10, 1, v0 -; FOLDING-NEXT: vmv1r.v v0, v8 -; FOLDING-NEXT: vmerge.vim v8, v10, 1, v0 -; FOLDING-NEXT: vadd.vv v10, v11, v8 -; FOLDING-NEXT: vsub.vv v8, v11, v8 -; FOLDING-NEXT: vmv1r.v v0, v9 -; FOLDING-NEXT: vor.vv v10, v10, v11, v0.t +; FOLDING-NEXT: vmv.v.i v8, 0 +; FOLDING-NEXT: vmerge.vim v9, v8, 1, v0 +; FOLDING-NEXT: vlm.v v0, (a2) +; FOLDING-NEXT: vmerge.vim v8, v8, 1, v0 +; FOLDING-NEXT: vlm.v v0, (a1) +; FOLDING-NEXT: vadd.vv v10, v9, v8 +; FOLDING-NEXT: vsub.vv v8, v9, v8 +; FOLDING-NEXT: vor.vv v10, v10, v9, v0.t ; FOLDING-NEXT: vor.vv v8, v10, v8 ; FOLDING-NEXT: ret %a = load , ptr %x diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll index 9cdec6a9ff2e9..30044ad580143 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vselect-fp.ll @@ -494,17 +494,17 @@ define @vfmerge_nzv_nxv8f64( %va, @vselect_combine_regression( %va, %vb) { ; CHECK-LABEL: vselect_combine_regression: ; CHECK: # %bb.0: -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vsetvli a2, zero, e64, m8, ta, mu -; CHECK-NEXT: vmseq.vi v24, v16, 0 +; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, mu +; CHECK-NEXT: vmv8r.v v24, v16 ; CHECK-NEXT: vmseq.vi v0, v8, 0 ; CHECK-NEXT: vmv.v.i v16, 0 -; CHECK-NEXT: slli a1, a1, 3 ; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: add a1, a0, a1 ; CHECK-NEXT: vle64.v v8, (a0), v0.t -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vle64.v v16, (a1), v0.t +; CHECK-NEXT: vmseq.vi v0, v24, 0 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: vle64.v v16, (a0), v0.t ; CHECK-NEXT: ret %cond = icmp eq %va, zeroinitializer %sel = select %cond, %vb, zeroinitializer From df56b1a2cf06d1954a9cd1a290a264375f47440d Mon Sep 17 00:00:00 2001 From: Kelvin Li Date: Thu, 17 Jul 2025 23:52:48 -0400 Subject: [PATCH 281/813] [flang] handle allocation of zero-sized objects (#149165) This PR handles the allocation of zero-sized 
objects for different implementations. One byte is allocated for the zero-sized objects. --- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 10 ++++++ flang/test/Fir/alloc-32.fir | 4 ++- flang/test/Fir/alloc.fir | 48 ++++++++++++++++++------- flang/test/Fir/arrexp.fir | 4 ++- flang/test/Fir/convert-to-llvm.fir | 32 ++++++++++++----- flang/test/Lower/forall/character-1.f90 | 4 ++- 6 files changed, 79 insertions(+), 23 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 5ca53ee48955e..d879382555c39 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -1123,6 +1123,16 @@ struct AllocMemOpConversion : public fir::FIROpConversion { for (mlir::Value opnd : adaptor.getOperands()) size = rewriter.create( loc, ity, size, integerCast(loc, rewriter, ity, opnd)); + + // As the return value of malloc(0) is implementation defined, allocate one + // byte to ensure the allocation status being true. This behavior aligns to + // what the runtime has. 
+ mlir::Value zero = genConstantIndex(loc, ity, rewriter, 0); + mlir::Value one = genConstantIndex(loc, ity, rewriter, 1); + mlir::Value cmp = rewriter.create( + loc, mlir::LLVM::ICmpPredicate::sgt, size, zero); + size = rewriter.create(loc, cmp, size, one); + auto mallocTyWidth = lowerTy().getIndexTypeBitwidth(); auto mallocTy = mlir::IntegerType::get(rewriter.getContext(), mallocTyWidth); diff --git a/flang/test/Fir/alloc-32.fir b/flang/test/Fir/alloc-32.fir index 3eefc3225fac7..a3cbf200c24fc 100644 --- a/flang/test/Fir/alloc-32.fir +++ b/flang/test/Fir/alloc-32.fir @@ -20,7 +20,9 @@ func.func @allocmem_scalar_nonchar() -> !fir.heap { // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]] -// CHECK: %[[trunc:.*]] = trunc i64 %[[mul2]] to i32 +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[sz:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: %[[trunc:.*]] = trunc i64 %[[sz]] to i32 // CHECK: call ptr @malloc(i32 %[[trunc]]) func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { %1 = fir.allocmem !fir.char<1,?>(%l : i32) diff --git a/flang/test/Fir/alloc.fir b/flang/test/Fir/alloc.fir index 5b4930bb9cb34..8da8b828c18b9 100644 --- a/flang/test/Fir/alloc.fir +++ b/flang/test/Fir/alloc.fir @@ -87,7 +87,9 @@ func.func @alloca_scalar_dynchar_kind(%l : i32) -> !fir.ref> { // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 1, %[[mul1]] -// CHECK: call ptr @malloc(i64 %[[mul2]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { %1 = fir.allocmem !fir.char<1,?>(%l : i32) return %1 : !fir.heap> @@ -97,7 +99,9 @@ func.func @allocmem_scalar_dynchar(%l : i32) -> !fir.heap> { // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 
%[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 2, %[[mul1]] -// CHECK: call ptr @malloc(i64 %[[mul2]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_scalar_dynchar_kind(%l : i32) -> !fir.heap>{ %1 = fir.allocmem !fir.char<2,?>(%l : i32) return %1 : !fir.heap> @@ -152,7 +156,9 @@ func.func @allocmem_array_of_char() -> !fir.heap> // CHECK-SAME: i32 %[[len:.*]]) // CHECK: %[[mul1:.*]] = sext i32 %[[len]] to i64 // CHECK: %[[mul2:.*]] = mul i64 9, %[[mul1]] -// CHECK: call ptr @malloc(i64 %[[mul2]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_array_of_dynchar(%l: i32) -> !fir.heap>> { %1 = fir.allocmem !fir.array<3x3x!fir.char<1,?>>(%l : i32) return %1 : !fir.heap>> @@ -180,7 +186,9 @@ func.func @alloca_dynarray_of_nonchar2(%e: index) -> !fir.ref !fir.heap> { %1 = fir.allocmem !fir.array<3x?xi32>, %e return %1 : !fir.heap> @@ -190,7 +198,9 @@ func.func @allocmem_dynarray_of_nonchar(%e: index) -> !fir.heap !fir.heap> { %1 = fir.allocmem !fir.array, %e, %e return %1 : !fir.heap> @@ -218,7 +228,9 @@ func.func @alloca_dynarray_of_char2(%e : index) -> !fir.ref !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x!fir.char<2,10>>, %e return %1 : !fir.heap>> @@ -228,7 +240,9 @@ func.func @allocmem_dynarray_of_char(%e : index) -> !fir.heap !fir.heap>> { %1 = fir.allocmem !fir.array>, %e, %e return %1 : !fir.heap>> @@ -261,7 +275,9 @@ func.func @alloca_dynarray_of_dynchar2(%l: i32, %e : index) -> !fir.ref !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x!fir.char<2,?>>(%l : i32), %e return %1 : !fir.heap>> @@ -273,7 +289,9 @@ func.func @allocmem_dynarray_of_dynchar(%l: i32, %e : index) -> !fir.heap !fir.heap>> { %1 = fir.allocmem !fir.array>(%l : i32), %e, %e return %1 : !fir.heap>> @@ -312,7 +330,9 @@ func.func 
@alloca_array_with_holes_dynchar(%arg0: index, %arg1: index) -> !fir.r // CHECK-SAME: i64 %[[e1:.*]], i64 %[[e2:.*]]) // CHECK: %[[a:.*]] = mul i64 240, %[[e1]] // CHECK: %[[b:.*]] = mul i64 %3, %[[e2]] -// CHECK: call ptr @malloc(i64 %[[b]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[b]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[b]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_array_with_holes_nonchar(%0 : index, %1 : index) -> !fir.heap> { %a = fir.allocmem !fir.array<4x?x3x?x5xi32>, %0, %1 return %a : !fir.heap> @@ -321,7 +341,9 @@ func.func @allocmem_array_with_holes_nonchar(%0 : index, %1 : index) -> !fir.hea // CHECK-LABEL: define ptr @allocmem_array_with_holes_char( // CHECK-SAME: i64 %[[e:.*]]) // CHECK: %[[mul:.*]] = mul i64 240, %[[e]] -// CHECK: call ptr @malloc(i64 %[[mul]]) +// CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul]], 0 +// CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul]], i64 1 +// CHECK: call ptr @malloc(i64 %[[size]]) func.func @allocmem_array_with_holes_char(%e: index) -> !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x4x!fir.char<2,10>>, %e return %1 : !fir.heap>> @@ -331,7 +353,9 @@ func.func @allocmem_array_with_holes_char(%e: index) -> !fir.heap !fir.heap>> { %1 = fir.allocmem !fir.array<3x?x4x!fir.char<2,?>>(%arg0 : index), %arg1 return %1 : !fir.heap>> diff --git a/flang/test/Fir/arrexp.fir b/flang/test/Fir/arrexp.fir index 6c7f71f6f1f9c..e8ec8ac79e0c2 100644 --- a/flang/test/Fir/arrexp.fir +++ b/flang/test/Fir/arrexp.fir @@ -146,7 +146,9 @@ func.func @f6(%arg0: !fir.box>, %arg1: f32) { // CHECK: %[[EXT_GEP:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 7, i64 0, i32 1 // CHECK: %[[EXTENT:.*]] = load i64, ptr %[[EXT_GEP]] // CHECK: %[[SIZE:.*]] = mul i64 4, %[[EXTENT]] - // CHECK: %[[MALLOC:.*]] = call ptr @malloc(i64 %[[SIZE]]) + // CHECK: %[[CMP:.*]] = icmp sgt i64 %[[SIZE]], 0 + // CHECK: %[[SZ:.*]] = select i1 %[[CMP]], i64 %[[SIZE]], i64 1 + // CHECK: %[[MALLOC:.*]] = call ptr @malloc(i64 
%[[SZ]]) %1 = fir.slice %c2, %c10, %c1 : (index, index, index) -> !fir.slice<1> %2 = fir.array_load %arg0 [%1] : (!fir.box>, !fir.slice<1>) -> !fir.array %3 = fir.slice %c1, %c9, %c1 : (index, index, index) -> !fir.slice<1> diff --git a/flang/test/Fir/convert-to-llvm.fir b/flang/test/Fir/convert-to-llvm.fir index 0e2bfe48a807d..50a98466f0d4b 100644 --- a/flang/test/Fir/convert-to-llvm.fir +++ b/flang/test/Fir/convert-to-llvm.fir @@ -216,10 +216,14 @@ func.func @test_alloc_and_freemem_one() { } // CHECK-LABEL: llvm.func @test_alloc_and_freemem_one() { -// CHECK: %[[N:.*]] = llvm.mlir.constant(4 : i64) : i64 -// CHECK-NEXT: llvm.call @malloc(%[[N]]) -// CHECK: llvm.call @free(%{{.*}}) -// CHECK-NEXT: llvm.return +// CHECK-DAG: %[[N:.*]] = llvm.mlir.constant(4 : i64) : i64 +// CHECK-DAG: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK-DAG: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK-NEXT: %[[CMP:.*]] = llvm.icmp "sgt" %[[N]], %[[ZERO]] : i64 +// CHECK-NEXT: %[[SZ:.*]] = llvm.select %[[CMP]], %[[N]], %[[ONE]] : i1, i64 +// CHECK-NEXT: llvm.call @malloc(%[[SZ]]) +// CHECK: llvm.call @free(%{{.*}}) +// CHECK-NEXT: llvm.return // ----- // Verify that fir.allocmem is transformed to a call to malloc @@ -233,8 +237,12 @@ func.func @test_alloc_and_freemem_several() { } // CHECK-LABEL: llvm.func @test_alloc_and_freemem_several() { -// CHECK: %[[N:.*]] = llvm.mlir.constant(400 : i64) : i64 -// CHECK: [[MALLOC:%.*]] = llvm.call @malloc(%[[N]]) +// CHECK-DAG: %[[N:.*]] = llvm.mlir.constant(400 : i64) : i64 +// CHECK-DAG: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK-DAG: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK-NEXT: %[[CMP:.*]] = llvm.icmp "sgt" %[[N]], %[[ZERO]] : i64 +// CHECK-NEXT: %[[SZ:.*]] = llvm.select %[[CMP]], %[[N]], %[[ONE]] : i1, i64 +// CHECK: [[MALLOC:%.*]] = llvm.call @malloc(%[[SZ]]) // CHECK: llvm.call @free([[MALLOC]]) // CHECK: llvm.return @@ -250,7 +258,11 @@ func.func @test_with_shape(%ncols: index, 
%nrows: index) { // CHECK: %[[FOUR:.*]] = llvm.mlir.constant(4 : i64) : i64 // CHECK: %[[DIM1_SIZE:.*]] = llvm.mul %[[FOUR]], %[[NCOLS]] : i64 // CHECK: %[[TOTAL_SIZE:.*]] = llvm.mul %[[DIM1_SIZE]], %[[NROWS]] : i64 -// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[TOTAL_SIZE]]) +// CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[CMP:.*]] = llvm.icmp "sgt" %[[TOTAL_SIZE]], %[[ZERO]] : i64 +// CHECK: %[[SZ:.*]] = llvm.select %[[CMP]], %[[TOTAL_SIZE]], %[[ONE]] : i1, i64 +// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[SZ]]) // CHECK: llvm.call @free(%[[MEM]]) : (!llvm.ptr) -> () // CHECK: llvm.return // CHECK: } @@ -266,7 +278,11 @@ func.func @test_string_with_shape(%len: index, %nelems: index) { // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[LEN_SIZE:.*]] = llvm.mul %[[ONE]], %[[LEN]] : i64 // CHECK: %[[TOTAL_SIZE:.*]] = llvm.mul %[[LEN_SIZE]], %[[NELEMS]] : i64 -// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[TOTAL_SIZE]]) +// CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : i64) : i64 +// CHECK: %[[ONEA:.*]] = llvm.mlir.constant(1 : i64) : i64 +// CHECK: %[[CMP:.*]] = llvm.icmp "sgt" %[[TOTAL_SIZE]], %[[ZERO]] : i64 +// CHECK: %[[SZ:.*]] = llvm.select %[[CMP]], %[[TOTAL_SIZE]], %[[ONEA]] : i1, i64 +// CHECK: %[[MEM:.*]] = llvm.call @malloc(%[[SZ]]) // CHECK: llvm.call @free(%[[MEM]]) : (!llvm.ptr) -> () // CHECK: llvm.return // CHECK: } diff --git a/flang/test/Lower/forall/character-1.f90 b/flang/test/Lower/forall/character-1.f90 index 1e4bb73350871..d1e12a8dbdfec 100644 --- a/flang/test/Lower/forall/character-1.f90 +++ b/flang/test/Lower/forall/character-1.f90 @@ -29,7 +29,9 @@ end program test ! CHECK: %[[esval:.*]] = load i64, ptr %[[elesize]] ! CHECK: %[[mul:.*]] = mul i64 1, %[[esval]] ! CHECK: %[[mul2:.*]] = mul i64 %[[mul]], %[[extval]] -! CHECK: %[[buff:.*]] = call ptr @malloc(i64 %[[mul2]]) +! CHECK: %[[cmp:.*]] = icmp sgt i64 %[[mul2]], 0 +! 
CHECK: %[[size:.*]] = select i1 %[[cmp]], i64 %[[mul2]], i64 1 +! CHECK: %[[buff:.*]] = call ptr @malloc(i64 %[[size]]) ! CHECK: %[[to:.*]] = getelementptr i8, ptr %[[buff]], i64 % ! CHECK: call void @llvm.memmove.p0.p0.i64(ptr %[[to]], ptr %{{.*}}, i64 %{{.*}}, i1 false) ! CHECK: call void @free(ptr %[[buff]]) From 4c85bf2fe8042c855c9dd5be4b02191e9d071ffd Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Jul 2025 21:04:01 -0700 Subject: [PATCH 282/813] Revert "[Clang] Make the SizeType, SignedSizeType and PtrdiffType be named sugar types instead of built-in types (#143653)" This reverts commit c27e283cfbca2bd22f34592430e98ee76ed60ad8. A builbot failure has been reported: https://lab.llvm.org/buildbot/#/builders/186/builds/10819/steps/10/logs/stdio I'm also getting a large number of warnings related to %zu and %zx. --- .../clangd/unittests/FindTargetTests.cpp | 2 +- .../clangd/unittests/HoverTests.cpp | 4 +- clang/docs/ReleaseNotes.rst | 1 - clang/include/clang/AST/ASTContext.h | 13 +- clang/include/clang/AST/FormatString.h | 3 +- clang/include/clang/AST/RecursiveASTVisitor.h | 4 - clang/include/clang/AST/Type.h | 56 ---- clang/include/clang/AST/TypeLoc.h | 10 - clang/include/clang/AST/TypeProperties.td | 9 - clang/include/clang/Basic/TypeNodes.td | 1 - .../clang/Serialization/TypeBitCodes.def | 1 - clang/lib/AST/ASTContext.cpp | 76 ++--- clang/lib/AST/ASTImporter.cpp | 5 - clang/lib/AST/ASTStructuralEquivalence.cpp | 7 - clang/lib/AST/FormatString.cpp | 126 ++------- clang/lib/AST/ItaniumMangle.cpp | 4 - clang/lib/AST/PrintfFormatString.cpp | 11 +- clang/lib/AST/ScanfFormatString.cpp | 21 +- clang/lib/AST/Type.cpp | 12 - clang/lib/AST/TypePrinter.cpp | 10 - clang/lib/CodeGen/CGCall.cpp | 2 +- clang/lib/CodeGen/CGCoroutine.cpp | 8 +- clang/lib/CodeGen/CGDebugInfo.cpp | 3 +- clang/lib/CodeGen/CGObjCMac.cpp | 2 +- clang/lib/CodeGen/CodeGenFunction.cpp | 3 +- clang/lib/Sema/SemaChecking.cpp | 4 +- clang/lib/Sema/SemaExpr.cpp | 3 - clang/lib/Sema/SemaExprCXX.cpp 
| 10 +- clang/lib/Sema/TreeTransform.h | 6 - clang/lib/Serialization/ASTReader.cpp | 5 - clang/lib/Serialization/ASTWriter.cpp | 6 +- .../StaticAnalyzer/Checkers/MallocChecker.cpp | 25 +- .../Checkers/StdLibraryFunctionsChecker.cpp | 80 +++--- .../Checkers/VLASizeChecker.cpp | 2 +- ...d_resource_element_compatible_concept.hlsl | 2 +- clang/test/AST/ast-dump-array.cpp | 2 +- clang/test/AST/ast-dump-expr-json.c | 9 +- clang/test/AST/ast-dump-expr-json.cpp | 24 +- clang/test/AST/ast-dump-expr.c | 6 +- clang/test/AST/ast-dump-expr.cpp | 16 +- ...dump-openmp-distribute-parallel-for-simd.c | 20 +- .../ast-dump-openmp-distribute-parallel-for.c | 20 +- ...arget-teams-distribute-parallel-for-simd.c | 160 +++++------ ...nmp-target-teams-distribute-parallel-for.c | 160 +++++------ ...penmp-teams-distribute-parallel-for-simd.c | 160 +++++------ ...ump-openmp-teams-distribute-parallel-for.c | 160 +++++------ clang/test/AST/ast-dump-stmt-json.cpp | 71 ++--- clang/test/AST/ast-dump-stmt.cpp | 4 +- clang/test/AST/ast-dump-traits.cpp | 8 +- clang/test/AST/ast-dump-types-errors-json.cpp | 3 +- clang/test/Analysis/cfg.cpp | 2 +- clang/test/Analysis/explain-svals.cpp | 2 +- .../std-c-library-functions-arg-weakdeps.c | 2 +- .../Analysis/std-c-library-functions-lookup.c | 2 +- ...td-c-library-functions-vs-stream-checker.c | 4 +- clang/test/Analysis/std-c-library-functions.c | 4 +- clang/test/CXX/drs/cwg2xx.cpp | 2 +- clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp | 10 +- clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp | 6 +- clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp | 2 +- .../test/FixIt/fixit-format-ios-nopedantic.m | 2 +- clang/test/FixIt/format.m | 6 +- .../test/Sema/format-strings-fixit-ssize_t.c | 2 +- clang/test/Sema/format-strings-int-typedefs.c | 12 +- clang/test/Sema/format-strings-scanf.c | 8 +- clang/test/Sema/format-strings-size_t.c | 13 +- clang/test/Sema/matrix-type-builtins.c | 8 +- clang/test/Sema/ptrauth-atomic-ops.c | 2 +- clang/test/Sema/ptrauth.c | 2 +- 
.../SemaCXX/cxx2c-trivially-relocatable.cpp | 2 +- clang/test/SemaCXX/enum-scoped.cpp | 4 +- .../SemaCXX/microsoft-varargs-diagnostics.cpp | 6 +- clang/test/SemaCXX/new-delete.cpp | 2 +- clang/test/SemaCXX/static-assert-cxx26.cpp | 14 +- ...are-new-delete-basic-free-declarations.cpp | 2 +- .../unavailable_aligned_allocation.cpp | 24 +- clang/test/SemaHLSL/Language/AssignArray.hlsl | 4 +- clang/test/SemaHLSL/Language/InitListAST.hlsl | 264 +++++++++--------- .../SemaObjC/format-size-spec-nsinteger.m | 17 +- clang/test/SemaObjC/matrix-type-builtins.m | 2 +- .../SemaOpenCL/cl20-device-side-enqueue.cl | 6 +- clang/test/SemaTemplate/type_pack_element.cpp | 12 +- clang/tools/libclang/CIndex.cpp | 4 - .../deque/spare_block_handling.pass.cpp | 8 +- .../TypeSystem/Clang/TypeSystemClang.cpp | 4 - 85 files changed, 756 insertions(+), 1070 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp index 4d77f9d690ca0..602f61d9ecb41 100644 --- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp +++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp @@ -838,7 +838,7 @@ TEST_F(TargetDeclTest, OverloadExpr) { )cpp"; // Sized deallocation is enabled by default in C++14 onwards. 
EXPECT_DECLS("CXXDeleteExpr", - "void operator delete(void *, __size_t) noexcept"); + "void operator delete(void *, unsigned long) noexcept"); } TEST_F(TargetDeclTest, DependentExprs) { diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 4a21dafed5e95..775278ccf694b 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -2794,7 +2794,7 @@ TEST(Hover, All) { })cpp", [](HoverInfo &HI) { HI.Name = "expression"; - HI.Type = {"__size_t", "unsigned long"}; + HI.Type = "unsigned long"; HI.Value = "1"; }}, { @@ -2804,7 +2804,7 @@ TEST(Hover, All) { })cpp", [](HoverInfo &HI) { HI.Name = "expression"; - HI.Type = {"__size_t", "unsigned long"}; + HI.Type = "unsigned long"; HI.Value = "1"; }}, { diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 288f2fb9d81ca..fcd3887ec7a09 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -46,7 +46,6 @@ Potentially Breaking Changes ``endbr64`` instruction at the labels named as possible branch destinations, so it is not safe to use a register-controlled branch instruction to branch to one. (In line with gcc.) -- Added a sugar type `PredefinedSugarType` to improve diagnostic messages. (#GH143653) C/C++ Language Potentially Breaking Changes ------------------------------------------- diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 5b456794e7b13..66ec3395571ea 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -277,11 +277,6 @@ class ASTContext : public RefCountedBase { mutable llvm::ContextualFoldingSet ArrayParameterTypes; - /// Store the unique Type corresponding to each Kind. - mutable std::array - PredefinedSugarTypes{}; - /// The set of nested name specifiers. /// /// This set is managed by the NestedNameSpecifier class. 
@@ -1572,8 +1567,6 @@ class ASTContext : public RefCountedBase { /// and bit count. QualType getDependentBitIntType(bool Unsigned, Expr *BitsExpr) const; - QualType getPredefinedSugarType(PredefinedSugarType::Kind KD) const; - /// Gets the struct used to keep track of the extended descriptor for /// pointer to blocks. QualType getBlockDescriptorExtendedType() const; @@ -2006,13 +1999,11 @@ class ASTContext : public RefCountedBase { /// . /// /// The sizeof operator requires this (C99 6.5.3.4p4). - QualType getSizeType() const; - - CanQualType getCanonicalSizeType() const; + CanQualType getSizeType() const; /// Return the unique signed counterpart of /// the integer type corresponding to size_t. - QualType getSignedSizeType() const; + CanQualType getSignedSizeType() const; /// Return the unique type for "intmax_t" (C99 7.18.1.5), defined in /// . diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h index a284f2c44d633..3560766433fe2 100644 --- a/clang/include/clang/AST/FormatString.h +++ b/clang/include/clang/AST/FormatString.h @@ -489,8 +489,7 @@ class FormatSpecifier { /// For a TypedefType QT, if it is a named integer type such as size_t, /// assign the appropriate value to LM and return true. 
- static bool namedTypeToLengthModifier(ASTContext &Ctx, QualType QT, - LengthModifier &LM); + static bool namedTypeToLengthModifier(QualType QT, LengthModifier &LM); }; } // end analyze_format_string namespace diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 62991d986e675..519a811775c01 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -1208,8 +1208,6 @@ DEF_TRAVERSE_TYPE(BitIntType, {}) DEF_TRAVERSE_TYPE(DependentBitIntType, { TRY_TO(TraverseStmt(T->getNumBitsExpr())); }) -DEF_TRAVERSE_TYPE(PredefinedSugarType, {}) - #undef DEF_TRAVERSE_TYPE // ----------------- TypeLoc traversal ----------------- @@ -1526,8 +1524,6 @@ DEF_TRAVERSE_TYPELOC(DependentBitIntType, { TRY_TO(TraverseStmt(TL.getTypePtr()->getNumBitsExpr())); }) -DEF_TRAVERSE_TYPELOC(PredefinedSugarType, {}) - #undef DEF_TRAVERSE_TYPELOC // ----------------- Decl traversal ----------------- diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 764e9d508a25a..21b97102db95a 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2258,30 +2258,6 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned NumExpansions; }; - enum class PredefinedSugarKind { - /// The "size_t" type. - SizeT, - - /// The signed integer type corresponding to "size_t". - SignedSizeT, - - /// The "ptrdiff_t" type. - PtrdiffT, - - // Indicates how many items the enum has. 
- Last = PtrdiffT - }; - - class PresefinedSugarTypeBitfields { - friend class PredefinedSugarType; - - LLVM_PREFERRED_TYPE(TypeBitfields) - unsigned : NumTypeBits; - - LLVM_PREFERRED_TYPE(PredefinedSugarKind) - unsigned Kind : 8; - }; - class CountAttributedTypeBitfields { friend class CountAttributedType; @@ -2321,7 +2297,6 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { DependentTemplateSpecializationTypeBits; PackExpansionTypeBitfields PackExpansionTypeBits; CountAttributedTypeBitfields CountAttributedTypeBits; - PresefinedSugarTypeBitfields PredefinedSugarTypeBits; }; private: @@ -8063,37 +8038,6 @@ class DependentBitIntType final : public Type, public llvm::FoldingSetNode { } }; -class PredefinedSugarType final : public Type { -public: - friend class ASTContext; - using Kind = PredefinedSugarKind; - -private: - PredefinedSugarType(Kind KD, const IdentifierInfo *IdentName, - QualType CanonicalType) - : Type(PredefinedSugar, CanonicalType, TypeDependence::None), - Name(IdentName) { - PredefinedSugarTypeBits.Kind = llvm::to_underlying(KD); - } - - static StringRef getName(Kind KD); - - const IdentifierInfo *Name; - -public: - bool isSugared() const { return true; } - - QualType desugar() const { return getCanonicalTypeInternal(); } - - Kind getKind() const { return Kind(PredefinedSugarTypeBits.Kind); } - - const IdentifierInfo *getIdentifier() const { return Name; } - - static bool classof(const Type *T) { - return T->getTypeClass() == PredefinedSugar; - } -}; - /// A qualifier set is used to build a set of qualifiers. class QualifierCollector : public Qualifiers { public: diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index be0bc896de3ea..cf06e27758996 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -2783,16 +2783,6 @@ class ObjCProtocolLoc { } }; -struct PredefinedSugarTypeLocInfo {}; // Nothing. 
- -class PredefinedSugarTypeLoc final - : public ConcreteTypeLoc { -public: - void initializeLocal(ASTContext &Context, SourceLocation loc) {} - SourceRange getLocalSourceRange() const { return {}; } -}; - } // namespace clang #endif // LLVM_CLANG_AST_TYPELOC_H diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 3114d1180319a..a6157649060b1 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -1028,12 +1028,3 @@ let Class = DependentBitIntType in { return ctx.getDependentBitIntType(isUnsigned, numBitsExpr); }]>; } - -let Class = PredefinedSugarType in { - def : Property<"kind", UInt32> { - let Read = [{ static_cast(node->getKind()) }]; - } - def : Creator<[{ - return ctx.getPredefinedSugarType(static_cast(kind)); - }]>; -} diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td index 971ce541d4831..567b8a5ca5a4d 100644 --- a/clang/include/clang/Basic/TypeNodes.td +++ b/clang/include/clang/Basic/TypeNodes.td @@ -117,4 +117,3 @@ def PipeType : TypeNode; def AtomicType : TypeNode; def BitIntType : TypeNode; def DependentBitIntType : TypeNode, AlwaysDependent; -def PredefinedSugarType : TypeNode, NeverCanonical; diff --git a/clang/include/clang/Serialization/TypeBitCodes.def b/clang/include/clang/Serialization/TypeBitCodes.def index 613eb6af2005a..b8cde2e370960 100644 --- a/clang/include/clang/Serialization/TypeBitCodes.def +++ b/clang/include/clang/Serialization/TypeBitCodes.def @@ -69,6 +69,5 @@ TYPE_BIT_CODE(CountAttributed, COUNT_ATTRIBUTED, 57) TYPE_BIT_CODE(ArrayParameter, ARRAY_PARAMETER, 58) TYPE_BIT_CODE(HLSLAttributedResource, HLSLRESOURCE_ATTRIBUTED, 59) TYPE_BIT_CODE(HLSLInlineSpirv, HLSL_INLINE_SPIRV, 60) -TYPE_BIT_CODE(PredefinedSugar, PREDEFINED_SUGAR, 61) #undef TYPE_BIT_CODE diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 6b6275faa215a..232a4b6557b92 100644 --- 
a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2597,9 +2597,6 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { } break; - case Type::PredefinedSugar: - return getTypeInfo(cast(T)->desugar().getTypePtr()); - case Type::Pipe: Width = Target->getPointerWidth(LangAS::opencl_global); Align = Target->getPointerAlign(LangAS::opencl_global); @@ -5219,39 +5216,6 @@ QualType ASTContext::getDependentBitIntType(bool IsUnsigned, return QualType(New, 0); } -QualType -ASTContext::getPredefinedSugarType(PredefinedSugarType::Kind KD) const { - using Kind = PredefinedSugarType::Kind; - - if (auto *Target = PredefinedSugarTypes[llvm::to_underlying(KD)]; - Target != nullptr) - return QualType(Target, 0); - - auto getCanonicalType = [](const ASTContext &Ctx, Kind KDI) -> QualType { - switch (KDI) { - // size_t (C99TC3 6.5.3.4), signed size_t (C++23 5.13.2) and - // ptrdiff_t (C99TC3 6.5.6) Although these types are not built-in, they - // are part of the core language and are widely used. Using - // PredefinedSugarType makes these types as named sugar types rather than - // standard integer types, enabling better hints and diagnostics. 
- case Kind::SizeT: - return Ctx.getFromTargetType(Ctx.Target->getSizeType()); - case Kind::SignedSizeT: - return Ctx.getFromTargetType(Ctx.Target->getSignedSizeType()); - case Kind::PtrdiffT: - return Ctx.getFromTargetType(Ctx.Target->getPtrDiffType(LangAS::Default)); - } - llvm_unreachable("unexpected kind"); - }; - - auto *New = new (*this, alignof(PredefinedSugarType)) - PredefinedSugarType(KD, &Idents.get(PredefinedSugarType::getName(KD)), - getCanonicalType(*this, static_cast(KD))); - Types.push_back(New); - PredefinedSugarTypes[llvm::to_underlying(KD)] = New; - return QualType(New, 0); -} - #ifndef NDEBUG static bool NeedsInjectedClassNameType(const RecordDecl *D) { if (!isa(D)) return false; @@ -6832,31 +6796,14 @@ QualType ASTContext::getTagDeclType(const TagDecl *Decl) const { /// getSizeType - Return the unique type for "size_t" (C99 7.17), the result /// of the sizeof operator (C99 6.5.3.4p4). The value is target dependent and /// needs to agree with the definition in . -QualType ASTContext::getSizeType() const { - return getPredefinedSugarType(PredefinedSugarType::Kind::SizeT); -} - -CanQualType ASTContext::getCanonicalSizeType() const { +CanQualType ASTContext::getSizeType() const { return getFromTargetType(Target->getSizeType()); } /// Return the unique signed counterpart of the integer type /// corresponding to size_t. -QualType ASTContext::getSignedSizeType() const { - return getPredefinedSugarType(PredefinedSugarType::Kind::SignedSizeT); -} - -/// getPointerDiffType - Return the unique type for "ptrdiff_t" (C99 7.17) -/// defined in . Pointer - pointer requires this (C99 6.5.6p9). -QualType ASTContext::getPointerDiffType() const { - return getPredefinedSugarType(PredefinedSugarType::Kind::PtrdiffT); -} - -/// Return the unique unsigned counterpart of "ptrdiff_t" -/// integer type. The standard (C11 7.21.6.1p7) refers to this type -/// in the definition of %tu format specifier. 
-QualType ASTContext::getUnsignedPointerDiffType() const { - return getFromTargetType(Target->getUnsignedPtrDiffType(LangAS::Default)); +CanQualType ASTContext::getSignedSizeType() const { + return getFromTargetType(Target->getSignedSizeType()); } /// getIntMaxType - Return the unique type for "intmax_t" (C99 7.18.1.5). @@ -6891,6 +6838,19 @@ QualType ASTContext::getUIntPtrType() const { return getCorrespondingUnsignedType(getIntPtrType()); } +/// getPointerDiffType - Return the unique type for "ptrdiff_t" (C99 7.17) +/// defined in . Pointer - pointer requires this (C99 6.5.6p9). +QualType ASTContext::getPointerDiffType() const { + return getFromTargetType(Target->getPtrDiffType(LangAS::Default)); +} + +/// Return the unique unsigned counterpart of "ptrdiff_t" +/// integer type. The standard (C11 7.21.6.1p7) refers to this type +/// in the definition of %tu format specifier. +QualType ASTContext::getUnsignedPointerDiffType() const { + return getFromTargetType(Target->getUnsignedPtrDiffType(LangAS::Default)); +} + /// Return the unique type for "pid_t" defined in /// . We need this to compute the correct type for vfork(). 
QualType ASTContext::getProcessIDType() const { @@ -14543,10 +14503,6 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X, DX->isCountInBytes(), DX->isOrNull(), CDX); } - case Type::PredefinedSugar: - assert(cast(X)->getKind() != - cast(Y)->getKind()); - return QualType(); } llvm_unreachable("Unhandled Type Class"); } diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index b9bdabe0b8c06..b5f6c5a8c6abe 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -2080,11 +2080,6 @@ ExpectedType clang::ASTNodeImporter::VisitDependentBitIntType( *ToNumBitsExprOrErr); } -ExpectedType clang::ASTNodeImporter::VisitPredefinedSugarType( - const clang::PredefinedSugarType *T) { - return Importer.getToContext().getPredefinedSugarType(T->getKind()); -} - ExpectedType clang::ASTNodeImporter::VisitDependentSizedMatrixType( const clang::DependentSizedMatrixType *T) { Error Err = Error::success(); diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 0f2762d5c0f14..289c6d7737de7 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -1477,13 +1477,6 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, return false; break; } - case Type::PredefinedSugar: { - const auto *TP1 = cast(T1); - const auto *TP2 = cast(T2); - if (TP1->getKind() != TP2->getKind()) - return false; - break; - } } // end switch return true; diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp index 0bb737fa6a8af..5d3b56fc4e713 100644 --- a/clang/lib/AST/FormatString.cpp +++ b/clang/lib/AST/FormatString.cpp @@ -320,86 +320,6 @@ bool clang::analyze_format_string::ParseUTF8InvalidSpecifier( // Methods on ArgType. 
//===----------------------------------------------------------------------===// -static bool namedTypeToLengthModifierKind(ASTContext &Ctx, QualType QT, - LengthModifier::Kind &K) { - if (!Ctx.getLangOpts().C99 && !Ctx.getLangOpts().CPlusPlus) - return false; - for (/**/; const auto *TT = QT->getAs(); QT = TT->desugar()) { - const auto *TD = TT->getDecl(); - const auto *DC = TT->getDecl()->getDeclContext(); - if (DC->isTranslationUnit() || DC->isStdNamespace()) { - StringRef Name = TD->getIdentifier()->getName(); - if (Name == "size_t") { - K = LengthModifier::AsSizeT; - return true; - } else if (Name == "ssize_t" /*Not C99, but common in Unix.*/) { - K = LengthModifier::AsSizeT; - return true; - } else if (Name == "ptrdiff_t") { - K = LengthModifier::AsPtrDiff; - return true; - } else if (Name == "intmax_t") { - K = LengthModifier::AsIntMax; - return true; - } else if (Name == "uintmax_t") { - K = LengthModifier::AsIntMax; - return true; - } - } - } - if (const auto *PST = QT->getAs()) { - using Kind = PredefinedSugarType::Kind; - switch (PST->getKind()) { - case Kind::SizeT: - case Kind::SignedSizeT: - K = LengthModifier::AsSizeT; - return true; - case Kind::PtrdiffT: - K = LengthModifier::AsPtrDiff; - return true; - } - llvm_unreachable("unexpected kind"); - } - return false; -} - -// Check whether T and E are compatible size_t/ptrdiff_t typedefs. E must be -// consistent with LE. -// T is the type of the actual expression in the code to be checked, and E is -// the expected type parsed from the format string. 
-static clang::analyze_format_string::ArgType::MatchKind -matchesSizeTPtrdiffT(ASTContext &C, QualType T, QualType E, - LengthModifier::Kind LE) { - using Kind = LengthModifier::Kind; - using MatchKind = clang::analyze_format_string::ArgType::MatchKind; - assert(LE == Kind::AsPtrDiff || LE == Kind::AsSizeT); - - if (!T->isIntegerType()) - return MatchKind::NoMatch; - - if (C.getCorrespondingSignedType(T.getCanonicalType()) != - C.getCorrespondingSignedType(E.getCanonicalType())) - return MatchKind::NoMatch; - - // signed size_t and unsigned ptrdiff_t does not have typedefs in C and C++. - if (LE == Kind::AsSizeT && E->isSignedIntegerType()) - return T->isSignedIntegerType() ? MatchKind::Match - : MatchKind::NoMatchSignedness; - - if (LE == LengthModifier::Kind::AsPtrDiff && E->isUnsignedIntegerType()) - return T->isUnsignedIntegerType() ? MatchKind::Match - : MatchKind::NoMatchSignedness; - - if (Kind Actual = Kind::None; namedTypeToLengthModifierKind(C, T, Actual)) { - if (Actual == LE) - return MatchKind::Match; - else if (Actual == Kind::AsPtrDiff || Actual == Kind::AsSizeT) - return MatchKind::NoMatchSignedness; - } - - return MatchKind::NoMatch; -} - clang::analyze_format_string::ArgType::MatchKind ArgType::matchesType(ASTContext &C, QualType argTy) const { // When using the format attribute in C++, you can receive a function or an @@ -474,13 +394,6 @@ ArgType::matchesType(ASTContext &C, QualType argTy) const { } case SpecificTy: { - if (TK != TypeKind::DontCare) { - return matchesSizeTPtrdiffT(C, argTy, T, - TK == TypeKind::SizeT - ? LengthModifier::Kind::AsSizeT - : LengthModifier::AsPtrDiff); - } - if (const EnumType *ETy = argTy->getAs()) { // If the enum is incomplete we know nothing about the underlying type. // Assume that it's 'int'. 
Do not use the underlying type for a scoped @@ -740,18 +653,6 @@ ArgType::matchesArgType(ASTContext &C, const ArgType &Other) const { if (Left.K == AK::SpecificTy) { if (Right.K == AK::SpecificTy) { - if (Left.TK != TypeKind::DontCare) { - return matchesSizeTPtrdiffT(C, Right.T, Left.T, - Left.TK == TypeKind::SizeT - ? LengthModifier::Kind::AsSizeT - : LengthModifier::AsPtrDiff); - } else if (Right.TK != TypeKind::DontCare) { - return matchesSizeTPtrdiffT(C, Left.T, Right.T, - Right.TK == TypeKind::SizeT - ? LengthModifier::Kind::AsSizeT - : LengthModifier::AsPtrDiff); - } - auto Canon1 = C.getCanonicalType(Left.T); auto Canon2 = C.getCanonicalType(Right.T); if (Canon1 == Canon2) @@ -1297,12 +1198,29 @@ FormatSpecifier::getCorrectedLengthModifier() const { return std::nullopt; } -bool FormatSpecifier::namedTypeToLengthModifier(ASTContext &Ctx, QualType QT, +bool FormatSpecifier::namedTypeToLengthModifier(QualType QT, LengthModifier &LM) { - if (LengthModifier::Kind Out = LengthModifier::Kind::None; - namedTypeToLengthModifierKind(Ctx, QT, Out)) { - LM.setKind(Out); - return true; + for (/**/; const auto *TT = QT->getAs(); + QT = TT->getDecl()->getUnderlyingType()) { + const TypedefNameDecl *Typedef = TT->getDecl(); + const IdentifierInfo *Identifier = Typedef->getIdentifier(); + if (Identifier->getName() == "size_t") { + LM.setKind(LengthModifier::AsSizeT); + return true; + } else if (Identifier->getName() == "ssize_t") { + // Not C99, but common in Unix. 
+ LM.setKind(LengthModifier::AsSizeT); + return true; + } else if (Identifier->getName() == "intmax_t") { + LM.setKind(LengthModifier::AsIntMax); + return true; + } else if (Identifier->getName() == "uintmax_t") { + LM.setKind(LengthModifier::AsIntMax); + return true; + } else if (Identifier->getName() == "ptrdiff_t") { + LM.setKind(LengthModifier::AsPtrDiff); + return true; + } } return false; } diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 2a667934dba42..6d082b31a9caa 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -2514,10 +2514,6 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty, mangleSourceNameWithAbiTags(cast(Ty)->getDecl()); break; - case Type::PredefinedSugar: - mangleType(cast(Ty)->desugar()); - break; - case Type::UnresolvedUsing: mangleSourceNameWithAbiTags( cast(Ty)->getDecl()); diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp index bcd44f0a85eed..293164ddac8f8 100644 --- a/clang/lib/AST/PrintfFormatString.cpp +++ b/clang/lib/AST/PrintfFormatString.cpp @@ -543,8 +543,7 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx, case LengthModifier::AsIntMax: return ArgType(Ctx.getIntMaxType(), "intmax_t"); case LengthModifier::AsSizeT: - return ArgType::makeSizeT( - ArgType(Ctx.getSignedSizeType(), "signed size_t")); + return ArgType::makeSizeT(ArgType(Ctx.getSignedSizeType(), "ssize_t")); case LengthModifier::AsInt3264: return Ctx.getTargetInfo().getTriple().isArch64Bit() ? 
ArgType(Ctx.LongLongTy, "__int64") @@ -627,11 +626,9 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx, case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType::makeSizeT( - ArgType(Ctx.getSignedSizeType(), "signed size_t"))); + return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType::makePtrdiffT( - ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); + return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); case LengthModifier::AsLongDouble: return ArgType(); // FIXME: Is this a known extension? case LengthModifier::AsAllocate: @@ -920,7 +917,7 @@ bool PrintfSpecifier::fixType(QualType QT, const LangOptions &LangOpt, // Handle size_t, ptrdiff_t, etc. that have dedicated length modifiers in C99. if (LangOpt.C99 || LangOpt.CPlusPlus11) - namedTypeToLengthModifier(Ctx, QT, LM); + namedTypeToLengthModifier(QT, LM); // If fixing the length modifier was enough, we might be done. if (hasValidLengthModifier(Ctx.getTargetInfo(), LangOpt)) { diff --git a/clang/lib/AST/ScanfFormatString.cpp b/clang/lib/AST/ScanfFormatString.cpp index 1227edd47d13d..7ee21c8c61954 100644 --- a/clang/lib/AST/ScanfFormatString.cpp +++ b/clang/lib/AST/ScanfFormatString.cpp @@ -251,11 +251,9 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType::makeSizeT( - ArgType(Ctx.getSignedSizeType(), "signed size_t"))); + return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType::makePtrdiffT( - ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); + return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); case LengthModifier::AsLongDouble: // GNU extension. 
return ArgType::PtrTo(Ctx.LongLongTy); @@ -294,11 +292,10 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getUIntMaxType(), "uintmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo( - ArgType::makeSizeT(ArgType(Ctx.getSizeType(), "size_t"))); + return ArgType::PtrTo(ArgType(Ctx.getSizeType(), "size_t")); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType::makePtrdiffT( - ArgType(Ctx.getUnsignedPointerDiffType(), "unsigned ptrdiff_t"))); + return ArgType::PtrTo( + ArgType(Ctx.getUnsignedPointerDiffType(), "unsigned ptrdiff_t")); case LengthModifier::AsLongDouble: // GNU extension. return ArgType::PtrTo(Ctx.UnsignedLongLongTy); @@ -393,11 +390,9 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType::makeSizeT( - ArgType(Ctx.getSignedSizeType(), "signed size_t"))); + return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType::makePtrdiffT( - ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); + return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); case LengthModifier::AsLongDouble: return ArgType(); // FIXME: Is this a known extension? case LengthModifier::AsAllocate: @@ -506,7 +501,7 @@ bool ScanfSpecifier::fixType(QualType QT, QualType RawQT, // Handle size_t, ptrdiff_t, etc. that have dedicated length modifiers in C99. if (LangOpt.C99 || LangOpt.CPlusPlus11) - namedTypeToLengthModifier(Ctx, PT, LM); + namedTypeToLengthModifier(PT, LM); // If fixing the length modifier was enough, we are done. 
if (hasValidLengthModifier(Ctx.getTargetInfo(), LangOpt)) { diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 7444a2f90c5dd..e5a1ab2ff8906 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -5613,15 +5613,3 @@ HLSLAttributedResourceType::findHandleTypeOnResource(const Type *RT) { } return nullptr; } - -StringRef PredefinedSugarType::getName(Kind KD) { - switch (KD) { - case Kind::SizeT: - return "__size_t"; - case Kind::SignedSizeT: - return "__signed_size_t"; - case Kind::PtrdiffT: - return "__ptrdiff_t"; - } - llvm_unreachable("unexpected kind"); -} diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index deb453fe6ee75..818d2139628e3 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -248,7 +248,6 @@ bool TypePrinter::canPrefixQualifiers(const Type *T, case Type::BTFTagAttributed: case Type::HLSLAttributedResource: case Type::HLSLInlineSpirv: - case Type::PredefinedSugar: CanPrefixQualifiers = true; break; @@ -1418,15 +1417,6 @@ void TypePrinter::printDependentBitIntBefore(const DependentBitIntType *T, void TypePrinter::printDependentBitIntAfter(const DependentBitIntType *T, raw_ostream &OS) {} -void TypePrinter::printPredefinedSugarBefore(const PredefinedSugarType *T, - raw_ostream &OS) { - OS << T->getIdentifier()->getName(); - spaceBeforePlaceHolder(OS); -} - -void TypePrinter::printPredefinedSugarAfter(const PredefinedSugarType *T, - raw_ostream &OS) {} - /// Appends the given scope to the end of a string. 
void TypePrinter::AppendScope(DeclContext *DC, raw_ostream &OS, DeclarationName NameInScope) { diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 668c91798bf54..c8c3d6b20c496 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -214,7 +214,7 @@ static void appendParameterTypes( for (unsigned I = 0, E = FPT->getNumParams(); I != E; ++I) { prefix.push_back(FPT->getParamType(I)); if (ExtInfos[I].hasPassObjectSize()) - prefix.push_back(CGT.getContext().getCanonicalSizeType()); + prefix.push_back(CGT.getContext().getSizeType()); } addExtParameterInfosForCall(paramInfos, FPT.getTypePtr(), PrefixSize, diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index 5ee908922b5a3..117ef3d16e21b 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -1006,15 +1006,15 @@ RValue CodeGenFunction::EmitCoroutineIntrinsic(const CallExpr *E, } case llvm::Intrinsic::coro_size: { auto &Context = getContext(); - llvm::IntegerType *T = - Builder.getIntNTy(Context.getTypeSize(Context.getSizeType())); + CanQualType SizeTy = Context.getSizeType(); + llvm::IntegerType *T = Builder.getIntNTy(Context.getTypeSize(SizeTy)); llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::coro_size, T); return RValue::get(Builder.CreateCall(F)); } case llvm::Intrinsic::coro_align: { auto &Context = getContext(); - llvm::IntegerType *T = - Builder.getIntNTy(Context.getTypeSize(Context.getSizeType())); + CanQualType SizeTy = Context.getSizeType(); + llvm::IntegerType *T = Builder.getIntNTy(Context.getTypeSize(SizeTy)); llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::coro_align, T); return RValue::get(Builder.CreateCall(F)); } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 96da253ce1471..446cf8d9e05c6 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -4052,8 +4052,7 @@ llvm::DIType 
*CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile *Unit) { return CreateType(cast(Ty), Unit); case Type::HLSLInlineSpirv: return CreateType(cast(Ty), Unit); - case Type::PredefinedSugar: - return getOrCreateType(cast(Ty)->desugar(), Unit); + case Type::CountAttributed: case Type::Auto: case Type::Attributed: diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 8c66176942cb5..8e71a576552d3 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -285,7 +285,7 @@ class ObjCCommonTypesHelper { SmallVector Params; Params.push_back(Ctx.VoidPtrTy); Params.push_back(Ctx.VoidPtrTy); - Params.push_back(Ctx.getCanonicalSizeType()); + Params.push_back(Ctx.getSizeType()); Params.push_back(Ctx.BoolTy); Params.push_back(Ctx.BoolTy); llvm::FunctionType *FTy = Types.GetFunctionType( diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index ab345a598c4e8..0fda31c8e5fa1 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -720,7 +720,7 @@ static bool matchesStlAllocatorFn(const Decl *D, const ASTContext &Ctx) { (MD->getNumParams() != 1 && MD->getNumParams() != 2)) return false; - if (!Ctx.hasSameType(MD->parameters()[0]->getType(), Ctx.getSizeType())) + if (MD->parameters()[0]->getType().getCanonicalType() != Ctx.getSizeType()) return false; if (MD->getNumParams() == 2) { @@ -2491,7 +2491,6 @@ void CodeGenFunction::EmitVariablyModifiedType(QualType type) { case Type::ObjCObjectPointer: case Type::BitInt: case Type::HLSLInlineSpirv: - case Type::PredefinedSugar: llvm_unreachable("type class is never variably-modified!"); case Type::Elaborated: diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 5e523fe887318..dd5b710d7e1d4 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5239,9 +5239,7 @@ bool Sema::BuiltinVAStartARMMicrosoft(CallExpr *Call) { << 2 << Arg1->getType() 
<< ConstCharPtrTy; const QualType SizeTy = Context.getSizeType(); - if (!Context.hasSameType( - Arg2Ty->getCanonicalTypeInternal().withoutLocalFastQualifiers(), - SizeTy)) + if (Arg2Ty->getCanonicalTypeInternal().withoutLocalFastQualifiers() != SizeTy) Diag(Arg2->getBeginLoc(), diag::err_typecheck_convert_incompatible) << Arg2->getType() << SizeTy << 1 /* different class */ << 0 /* qualifier difference */ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 45c7178c6965d..728ada33e2e63 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -4564,9 +4564,6 @@ static void captureVariablyModifiedType(ASTContext &Context, QualType T, case Type::Atomic: T = cast(Ty)->getValueType(); break; - case Type::PredefinedSugar: - T = cast(Ty)->desugar(); - break; } } while (!T.isNull() && T->isVariablyModifiedType()); } diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 0edfd6015cbd9..fd95f4ec54229 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -3461,11 +3461,11 @@ void Sema::DeclareGlobalAllocationFunction(DeclarationName Name, // non-templated allocation function we are trying to declare here. if (FunctionDecl *Func = dyn_cast(*Alloc)) { if (Func->getNumParams() == Params.size()) { - if (std::equal(Func->param_begin(), Func->param_end(), Params.begin(), - Params.end(), [&](ParmVarDecl *D, QualType RT) { - return Context.hasSameUnqualifiedType(D->getType(), - RT); - })) { + llvm::SmallVector FuncParams; + for (auto *P : Func->parameters()) + FuncParams.push_back( + Context.getCanonicalType(P->getType().getUnqualifiedType())); + if (llvm::ArrayRef(FuncParams) == Params) { // Make the function visible to name lookup, even if we found it in // an unimported module. It either is an implicitly-declared global // allocation function, or is suppressing that function. 
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index c7428d1a02345..286c2b486c0f9 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7245,12 +7245,6 @@ QualType TreeTransform::TransformDependentBitIntType( return Result; } -template -QualType TreeTransform::TransformPredefinedSugarType( - TypeLocBuilder &TLB, PredefinedSugarTypeLoc TL) { - llvm_unreachable("This type does not need to be transformed."); -} - /// Simple iterator that traverses the template arguments in a /// container that provides a \c getArgLoc() member function. /// diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 10aedb68fcd9d..3596d2240167e 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -7574,16 +7574,11 @@ void TypeLocReader::VisitPipeTypeLoc(PipeTypeLoc TL) { void TypeLocReader::VisitBitIntTypeLoc(clang::BitIntTypeLoc TL) { TL.setNameLoc(readSourceLocation()); } - void TypeLocReader::VisitDependentBitIntTypeLoc( clang::DependentBitIntTypeLoc TL) { TL.setNameLoc(readSourceLocation()); } -void TypeLocReader::VisitPredefinedSugarTypeLoc(PredefinedSugarTypeLoc TL) { - // Nothing to do. 
-} - void ASTRecordReader::readTypeLoc(TypeLoc TL) { TypeLocReader TLR(*this); for (; !TL.isNull(); TL = TL.getNextTypeLoc()) diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index a6957e54b66f1..e868afeb1a145 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -692,6 +692,7 @@ void TypeLocWriter::VisitAtomicTypeLoc(AtomicTypeLoc TL) { void TypeLocWriter::VisitPipeTypeLoc(PipeTypeLoc TL) { addSourceLocation(TL.getKWLoc()); } + void TypeLocWriter::VisitBitIntTypeLoc(clang::BitIntTypeLoc TL) { addSourceLocation(TL.getNameLoc()); } @@ -700,11 +701,6 @@ void TypeLocWriter::VisitDependentBitIntTypeLoc( addSourceLocation(TL.getNameLoc()); } -void TypeLocWriter::VisitPredefinedSugarTypeLoc( - clang::PredefinedSugarTypeLoc TL) { - // Nothing to do. -} - void ASTWriter::WriteTypeAbbrevs() { using namespace llvm; diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index 68efdbaec341b..30a04977d906d 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -1281,7 +1281,7 @@ SVal MallocChecker::evalMulForBufferSize(CheckerContext &C, const Expr *Blocks, SVal BlockBytesVal = C.getSVal(BlockBytes); ProgramStateRef State = C.getState(); SVal TotalSize = SB.evalBinOp(State, BO_Mul, BlocksVal, BlockBytesVal, - SB.getContext().getCanonicalSizeType()); + SB.getContext().getSizeType()); return TotalSize; } @@ -1311,9 +1311,11 @@ static bool isStandardRealloc(const CallEvent &Call) { const FunctionDecl *FD = dyn_cast(Call.getDecl()); assert(FD); ASTContext &AC = FD->getASTContext(); - return AC.hasSameType(FD->getDeclaredReturnType(), AC.VoidPtrTy) && - AC.hasSameType(FD->getParamDecl(0)->getType(), AC.VoidPtrTy) && - AC.hasSameType(FD->getParamDecl(1)->getType(), AC.getSizeType()); + + return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && + 
FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && + FD->getParamDecl(1)->getType().getDesugaredType(AC) == + AC.getSizeType(); } static bool isGRealloc(const CallEvent &Call) { @@ -1321,9 +1323,10 @@ static bool isGRealloc(const CallEvent &Call) { assert(FD); ASTContext &AC = FD->getASTContext(); - return AC.hasSameType(FD->getDeclaredReturnType(), AC.VoidPtrTy) && - AC.hasSameType(FD->getParamDecl(0)->getType(), AC.VoidPtrTy) && - AC.hasSameType(FD->getParamDecl(1)->getType(), AC.UnsignedLongTy); + return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && + FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && + FD->getParamDecl(1)->getType().getDesugaredType(AC) == + AC.UnsignedLongTy; } void MallocChecker::checkRealloc(ProgramStateRef State, const CallEvent &Call, @@ -2827,10 +2830,10 @@ MallocChecker::ReallocMemAux(CheckerContext &C, const CallEvent &Call, return nullptr; // Compare the size argument to 0. - DefinedOrUnknownSVal SizeZero = svalBuilder.evalEQ( - State, TotalSize.castAs(), - svalBuilder.makeIntValWithWidth( - svalBuilder.getContext().getCanonicalSizeType(), 0)); + DefinedOrUnknownSVal SizeZero = + svalBuilder.evalEQ(State, TotalSize.castAs(), + svalBuilder.makeIntValWithWidth( + svalBuilder.getContext().getSizeType(), 0)); ProgramStateRef StatePtrIsNull, StatePtrNotNull; std::tie(StatePtrIsNull, StatePtrNotNull) = State->assume(PtrEQ); diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 52b3d1e95942c..1c748f9bc1828 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -1666,7 +1666,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( const QualType IntTy = ACtx.IntTy; const QualType UnsignedIntTy = ACtx.UnsignedIntTy; const QualType LongTy = ACtx.LongTy; - const QualType SizeTyCanonTy = 
ACtx.getCanonicalSizeType(); + const QualType SizeTy = ACtx.getSizeType(); const QualType VoidPtrTy = getPointerTy(VoidTy); // void * const QualType IntPtrTy = getPointerTy(IntTy); // int * @@ -1684,14 +1684,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( const QualType ConstWchar_tPtrTy = getPointerTy(getConstTy(WCharTy)); // const wchar_t * const QualType ConstVoidPtrRestrictTy = getRestrictTy(ConstVoidPtrTy); - const QualType SizePtrTy = getPointerTy(SizeTyCanonTy); + const QualType SizePtrTy = getPointerTy(SizeTy); const QualType SizePtrRestrictTy = getRestrictTy(SizePtrTy); const RangeInt IntMax = BVF.getMaxValue(IntTy)->getLimitedValue(); const RangeInt UnsignedIntMax = BVF.getMaxValue(UnsignedIntTy)->getLimitedValue(); const RangeInt LongMax = BVF.getMaxValue(LongTy)->getLimitedValue(); - const RangeInt SizeMax = BVF.getMaxValue(SizeTyCanonTy)->getLimitedValue(); + const RangeInt SizeMax = BVF.getMaxValue(SizeTy)->getLimitedValue(); // Set UCharRangeMax to min of int or uchar maximum value. 
// The C standard states that the arguments of functions like isalpha must @@ -2057,19 +2057,18 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t fread(void *restrict ptr, size_t size, size_t nitems, // FILE *restrict stream); - addToFunctionSummaryMap("fread", - Signature(ArgTypes{VoidPtrRestrictTy, SizeTyCanonTy, - SizeTyCanonTy, FilePtrRestrictTy}, - RetType{SizeTyCanonTy}), - FreadSummary); - // size_t fwrite(const void *restrict ptr, size_t size, size_t nitems, - // FILE *restrict stream); addToFunctionSummaryMap( - "fwrite", - Signature(ArgTypes{ConstVoidPtrRestrictTy, SizeTyCanonTy, SizeTyCanonTy, - FilePtrRestrictTy}, - RetType{SizeTyCanonTy}), + "fread", + Signature(ArgTypes{VoidPtrRestrictTy, SizeTy, SizeTy, FilePtrRestrictTy}, + RetType{SizeTy}), FreadSummary); + // size_t fwrite(const void *restrict ptr, size_t size, size_t nitems, + // FILE *restrict stream); + addToFunctionSummaryMap("fwrite", + Signature(ArgTypes{ConstVoidPtrRestrictTy, SizeTy, + SizeTy, FilePtrRestrictTy}, + RetType{SizeTy}), + FreadSummary); std::optional Ssize_tTy = lookupTy("ssize_t"); std::optional Ssize_tMax = getMaxValue(Ssize_tTy); @@ -2084,14 +2083,12 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // should handle them together with the rest of the POSIX functions. 
// ssize_t read(int fildes, void *buf, size_t nbyte); addToFunctionSummaryMap( - "read", - Signature(ArgTypes{IntTy, VoidPtrTy, SizeTyCanonTy}, RetType{Ssize_tTy}), + "read", Signature(ArgTypes{IntTy, VoidPtrTy, SizeTy}, RetType{Ssize_tTy}), ReadSummary); // ssize_t write(int fildes, const void *buf, size_t nbyte); addToFunctionSummaryMap( "write", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy}, - RetType{Ssize_tTy}), + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy}, RetType{Ssize_tTy}), ReadSummary); auto GetLineSummary = @@ -2621,7 +2618,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *strndup(const char *s, size_t n); addToFunctionSummaryMap( "strndup", - Signature(ArgTypes{ConstCharPtrTy, SizeTyCanonTy}, RetType{CharPtrTy}), + Signature(ArgTypes{ConstCharPtrTy, SizeTy}, RetType{CharPtrTy}), Summary(NoEvalCall) .ArgConstraint(NotNull(ArgNo(0))) .ArgConstraint( @@ -2652,8 +2649,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *getcwd(char *buf, size_t size); addToFunctionSummaryMap( - "getcwd", - Signature(ArgTypes{CharPtrTy, SizeTyCanonTy}, RetType{CharPtrTy}), + "getcwd", Signature(ArgTypes{CharPtrTy, SizeTy}, RetType{CharPtrTy}), Summary(NoEvalCall) .Case({NotNull(0), ArgumentCondition(1, WithinRange, Range(1, SizeMax)), @@ -2961,9 +2957,8 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // FIXME: Improve for errno modeling. addToFunctionSummaryMap( "mmap", - Signature( - ArgTypes{VoidPtrTy, SizeTyCanonTy, IntTy, IntTy, IntTy, Off_tTy}, - RetType{VoidPtrTy}), + Signature(ArgTypes{VoidPtrTy, SizeTy, IntTy, IntTy, IntTy, Off_tTy}, + RetType{VoidPtrTy}), Summary(NoEvalCall) .ArgConstraint(ArgumentCondition(1, WithinRange, Range(1, SizeMax))) .ArgConstraint( @@ -2975,9 +2970,8 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // FIXME: Improve for errno modeling. 
addToFunctionSummaryMap( "mmap64", - Signature( - ArgTypes{VoidPtrTy, SizeTyCanonTy, IntTy, IntTy, IntTy, Off64_tTy}, - RetType{VoidPtrTy}), + Signature(ArgTypes{VoidPtrTy, SizeTy, IntTy, IntTy, IntTy, Off64_tTy}, + RetType{VoidPtrTy}), Summary(NoEvalCall) .ArgConstraint(ArgumentCondition(1, WithinRange, Range(1, SizeMax))) .ArgConstraint( @@ -3008,9 +3002,8 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t bufsize); addToFunctionSummaryMap( "readlink", - Signature( - ArgTypes{ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTyCanonTy}, - RetType{Ssize_tTy}), + Signature(ArgTypes{ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTy}, + RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ArgumentCondition(2, WithinRange, Range(1, IntMax)), ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3032,9 +3025,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *restrict buf, size_t bufsize); addToFunctionSummaryMap( "readlinkat", - Signature(ArgTypes{IntTy, ConstCharPtrRestrictTy, CharPtrRestrictTy, - SizeTyCanonTy}, - RetType{Ssize_tTy}), + Signature( + ArgTypes{IntTy, ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTy}, + RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ArgumentCondition(3, WithinRange, Range(1, IntMax)), ReturnValueCondition(LessThanOrEq, ArgNo(3)), @@ -3275,14 +3268,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t length, // int flags, struct sockaddr *restrict address, // socklen_t *restrict address_len); - Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTyCanonTy, IntTy, + Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTy, IntTy, StructSockaddrPtrRestrictTy, Socklen_tPtrRestrictTy}, RetType{Ssize_tTy}), Recvfrom)) addToFunctionSummaryMap( "recvfrom", - Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTyCanonTy, IntTy, + Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTy, IntTy, Irrelevant, Socklen_tPtrRestrictTy}, RetType{Ssize_tTy}), Recvfrom); @@ -3304,14 +3297,14 @@ void 
StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t sendto(int socket, const void *message, size_t length, // int flags, const struct sockaddr *dest_addr, // socklen_t dest_len); - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy, ConstStructSockaddrPtrTy, Socklen_tTy}, RetType{Ssize_tTy}), Sendto)) addToFunctionSummaryMap( "sendto", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy, - Irrelevant, Socklen_tTy}, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy, Irrelevant, + Socklen_tTy}, RetType{Ssize_tTy}), Sendto); @@ -3327,7 +3320,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t recv(int sockfd, void *buf, size_t len, int flags); addToFunctionSummaryMap( "recv", - Signature(ArgTypes{IntTy, VoidPtrTy, SizeTyCanonTy, IntTy}, + Signature(ArgTypes{IntTy, VoidPtrTy, SizeTy, IntTy}, RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3402,7 +3395,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t send(int sockfd, const void *buf, size_t len, int flags); addToFunctionSummaryMap( "send", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy}, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy}, RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3690,7 +3683,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // int pthread_attr_setguardsize(pthread_attr_t *attr, size_t guardsize); addToFunctionSummaryMap( {"pthread_attr_setstacksize", "pthread_attr_setguardsize"}, - Signature(ArgTypes{Pthread_attr_tPtrTy, SizeTyCanonTy}, RetType{IntTy}), + Signature(ArgTypes{Pthread_attr_tPtrTy, SizeTy}, RetType{IntTy}), Summary(NoEvalCall) .ArgConstraint(NotNull(ArgNo(0))) .ArgConstraint( @@ -3895,14 +3888,13 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( .ArgConstraint(NotNull(ArgNo(1)))); 
addToFunctionSummaryMap( "__buf_size_arg_constraint", - Signature(ArgTypes{ConstVoidPtrTy, SizeTyCanonTy}, RetType{IntTy}), + Signature(ArgTypes{ConstVoidPtrTy, SizeTy}, RetType{IntTy}), Summary(EvalCallAsPure) .ArgConstraint( BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1)))); addToFunctionSummaryMap( "__buf_size_arg_constraint_mul", - Signature(ArgTypes{ConstVoidPtrTy, SizeTyCanonTy, SizeTyCanonTy}, - RetType{IntTy}), + Signature(ArgTypes{ConstVoidPtrTy, SizeTy, SizeTy}, RetType{IntTy}), Summary(EvalCallAsPure) .ArgConstraint(BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1), /*BufSizeMultiplier=*/ArgNo(2)))); diff --git a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp index c97341f072aba..1042b43680fd2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp @@ -92,7 +92,7 @@ ProgramStateRef VLASizeChecker::checkVLA(CheckerContext &C, ASTContext &Ctx = C.getASTContext(); SValBuilder &SVB = C.getSValBuilder(); - QualType SizeTy = Ctx.getSizeType(); + CanQualType SizeTy = Ctx.getSizeType(); uint64_t SizeMax = SVB.getBasicValueFactory().getMaxValue(SizeTy)->getZExtValue(); diff --git a/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl b/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl index fa8d78f38494a..a4f6e6c44794e 100644 --- a/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl +++ b/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl @@ -9,7 +9,7 @@ // CHECK: | `-TemplateTypeParm {{.*}} 'element_type' // CHECK: `-BinaryOperator {{.*}} 'bool' lvalue '>=' // CHECK: |-UnaryExprOrTypeTraitExpr {{.*}} 'bool' sizeof 'element_type' -// CHECK: `-IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK: `-IntegerLiteral {{.*}} 'unsigned long' 1 StructuredBuffer Buffer; diff --git a/clang/test/AST/ast-dump-array.cpp 
b/clang/test/AST/ast-dump-array.cpp index 5a982d34683ff..15771f227df8a 100644 --- a/clang/test/AST/ast-dump-array.cpp +++ b/clang/test/AST/ast-dump-array.cpp @@ -14,7 +14,7 @@ void testArrayInitExpr() auto l = [a]{ }; // CHECK: |-ArrayInitLoopExpr 0x{{[^ ]*}} 'int[10]' - // CHECK: | `-ArrayInitIndexExpr 0x{{[^ ]*}} <> '__size_t':'unsigned long' + // CHECK: | `-ArrayInitIndexExpr 0x{{[^ ]*}} <> 'unsigned long' } template diff --git a/clang/test/AST/ast-dump-expr-json.c b/clang/test/AST/ast-dump-expr-json.c index ecb6191c52200..e910864eeed65 100644 --- a/clang/test/AST/ast-dump-expr-json.c +++ b/clang/test/AST/ast-dump-expr-json.c @@ -3911,8 +3911,7 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3965,8 +3964,7 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3991,8 +3989,7 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "alignof", diff --git a/clang/test/AST/ast-dump-expr-json.cpp b/clang/test/AST/ast-dump-expr-json.cpp index 11026c9d302f0..5a762acad7917 100644 --- a/clang/test/AST/ast-dump-expr-json.cpp +++ b/clang/test/AST/ast-dump-expr-json.cpp @@ -1545,8 +1545,7 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: 
"type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "Ts" @@ -1588,8 +1587,7 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "long", -// CHECK-NEXT: "qualType": "__ptrdiff_t" +// CHECK-NEXT: "qualType": "long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "opcode": "-", @@ -1728,7 +1726,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: }, @@ -1757,7 +1755,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: }, @@ -1787,7 +1785,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1862,7 +1860,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1882,8 +1880,7 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: 
"qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -1940,7 +1937,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1960,8 +1957,7 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -2337,7 +2333,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ diff --git a/clang/test/AST/ast-dump-expr.c b/clang/test/AST/ast-dump-expr.c index e7aba39be8f68..959d61ec9794b 100644 --- a/clang/test/AST/ast-dump-expr.c +++ b/clang/test/AST/ast-dump-expr.c @@ -222,15 +222,15 @@ void UnaryOperators(int a, int *b) { // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int' lvalue ParmVar 0x{{[^ ]*}} 'a' 'int' sizeof a; - // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' sizeof + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' sizeof // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int' lvalue ParmVar 0x{{[^ ]*}} 'a' 'int' sizeof(int); - // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' sizeof 'int' + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' sizeof 'int' _Alignof(int); // FIXME: Uses C++ spelling for alignof in C mode. 
- // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' alignof 'int' + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' alignof 'int' } struct S { diff --git a/clang/test/AST/ast-dump-expr.cpp b/clang/test/AST/ast-dump-expr.cpp index 6fd429d1500a4..8ccb39f8f3165 100644 --- a/clang/test/AST/ast-dump-expr.cpp +++ b/clang/test/AST/ast-dump-expr.cpp @@ -115,34 +115,34 @@ void Casting(const S *s) { template void UnaryExpressions(int *p) { sizeof...(Ts); - // CHECK: SizeOfPackExpr 0x{{[^ ]*}} '__size_t':'unsigned long' 0x{{[^ ]*}} Ts + // CHECK: SizeOfPackExpr 0x{{[^ ]*}} 'unsigned long' 0x{{[^ ]*}} Ts noexcept(p - p); // CHECK: CXXNoexceptExpr 0x{{[^ ]*}} 'bool' - // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} '__ptrdiff_t':'long' '-' + // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'long' '-' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' ::new int; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' global Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' global Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' new (int); - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' new int{12}; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' // CHECK-NEXT: InitListExpr 0x{{[^ ]*}} 'int' // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 12 new int[2]; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(__size_t)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 
'operator new[]' 'void *(unsigned long)' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 2 new int[2]{1, 2}; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(__size_t)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(unsigned long)' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 2 // CHECK-NEXT: InitListExpr 0x{{[^ ]*}} 'int[2]' @@ -164,7 +164,7 @@ void UnaryExpressions(int *p) { // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' ::delete p; - // CHECK: CXXDeleteExpr 0x{{[^ ]*}} 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *, __size_t) noexcept' + // CHECK: CXXDeleteExpr 0x{{[^ ]*}} 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *, unsigned long) noexcept' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' diff --git a/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c index 672607fa90670..10f27e759b5b1 100644 --- a/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c @@ -57,8 +57,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -97,8 +97,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -144,8 +144,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -191,8 +191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -251,8 +251,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c index 8eedf8ac8bc58..419ba57191039 100644 --- a/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c @@ -57,8 +57,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -97,8 +97,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -144,8 +144,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -191,8 +191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -251,8 +251,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c index 64e19ce0a53bf..c209a0456d7a0 100644 --- a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c @@ -65,8 +65,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -94,8 +94,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -123,8 +123,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -152,8 +152,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -189,8 +189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -218,8 +218,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -247,8 +247,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -276,8 +276,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -325,8 +325,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -371,8 +371,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -417,8 +417,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -463,8 +463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -517,8 +517,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -563,8 +563,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -609,8 +609,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -655,8 +655,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -711,8 +711,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -757,8 +757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -803,8 +803,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -849,8 +849,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -903,8 +903,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -949,8 +949,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -995,8 +995,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1041,8 +1041,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1097,8 +1097,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1143,8 +1143,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1189,8 +1189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1289,8 +1289,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1335,8 +1335,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1381,8 +1381,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1427,8 +1427,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1497,8 +1497,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1560,8 +1560,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1623,8 +1623,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1686,8 +1686,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1757,8 +1757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1820,8 +1820,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1883,8 +1883,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1946,8 +1946,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c index cf3f4bfcaf225..b13e096101e63 100644 --- a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c @@ -65,8 +65,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -94,8 +94,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -123,8 +123,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -152,8 +152,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -189,8 +189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -218,8 +218,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -247,8 +247,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -276,8 +276,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -325,8 +325,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -371,8 +371,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -417,8 +417,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -463,8 +463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -517,8 +517,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -563,8 +563,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -609,8 +609,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -655,8 +655,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -711,8 +711,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -757,8 +757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -803,8 +803,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -849,8 +849,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -903,8 +903,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -949,8 +949,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -995,8 +995,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1041,8 +1041,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1097,8 +1097,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1143,8 +1143,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1189,8 +1189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1289,8 +1289,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1335,8 +1335,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1381,8 +1381,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1427,8 +1427,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1497,8 +1497,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1560,8 +1560,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1623,8 +1623,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1686,8 +1686,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1757,8 +1757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1820,8 +1820,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1883,8 +1883,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1946,8 +1946,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c index c8da8cd1a5efa..14356882b599a 100644 --- a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c @@ -71,8 +71,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -99,8 +99,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -127,8 +127,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -155,8 +155,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -211,8 +211,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -239,8 +239,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -267,8 +267,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -295,8 +295,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -363,8 +363,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -407,8 +407,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -451,8 +451,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -495,8 +495,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -568,8 +568,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -612,8 +612,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -656,8 +656,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -700,8 +700,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -775,8 +775,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -819,8 +819,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -863,8 +863,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -907,8 +907,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -984,8 +984,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1028,8 +1028,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1072,8 +1072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1116,8 +1116,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1191,8 +1191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1279,8 +1279,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1323,8 +1323,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1419,8 +1419,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1463,8 +1463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1507,8 +1507,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1551,8 +1551,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1659,8 +1659,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1719,8 +1719,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1779,8 +1779,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1839,8 +1839,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1952,8 +1952,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2012,8 +2012,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2072,8 +2072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2132,8 +2132,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c index 09b649cbb3660..0f983cfdff1dc 100644 --- a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c @@ -71,8 +71,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -99,8 +99,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -127,8 +127,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -155,8 +155,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -211,8 +211,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -239,8 +239,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -267,8 +267,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -295,8 +295,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -363,8 +363,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -407,8 +407,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -451,8 +451,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -495,8 +495,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -568,8 +568,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -612,8 +612,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -656,8 +656,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -700,8 +700,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -775,8 +775,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -819,8 +819,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -863,8 +863,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -907,8 +907,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -984,8 +984,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1028,8 +1028,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1072,8 +1072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1116,8 +1116,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1191,8 +1191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1279,8 +1279,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1323,8 +1323,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1419,8 +1419,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1463,8 +1463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1507,8 +1507,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1551,8 +1551,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1659,8 +1659,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1719,8 +1719,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1779,8 +1779,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1839,8 +1839,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1952,8 +1952,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2012,8 +2012,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2072,8 +2072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2132,8 +2132,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-stmt-json.cpp b/clang/test/AST/ast-dump-stmt-json.cpp index a8f113ce6a3d4..a473d17da9424 100644 --- a/clang/test/AST/ast-dump-stmt-json.cpp +++ b/clang/test/AST/ast-dump-stmt-json.cpp @@ -963,7 +963,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -994,7 +994,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1126,7 +1126,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1146,8 +1146,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// 
CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -1338,7 +1337,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -1370,7 +1369,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1445,7 +1444,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "mangledName": "_Znwm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1458,8 +1457,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1505,7 +1503,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "mangledName": "_ZnwmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t, std::align_val_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long, std::align_val_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1518,8 +1516,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": 
true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1588,7 +1585,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "mangledName": "_Znam", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1601,8 +1598,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1648,7 +1644,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "mangledName": "_ZnamSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(__size_t, std::align_val_t)" +// CHECK-NEXT: "qualType": "void *(unsigned long, std::align_val_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1661,8 +1657,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1826,7 +1821,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "mangledName": "_ZdlPvm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1852,8 +1847,7 
@@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1880,7 +1874,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "mangledName": "_ZdlPvmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, __size_t, std::align_val_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1906,8 +1900,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -2043,7 +2036,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete[]", // CHECK-NEXT: "mangledName": "_ZdaPvm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -2069,8 +2062,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -2097,7 +2089,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete[]", // CHECK-NEXT: "mangledName": "_ZdaPvmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: 
"qualType": "void (void *, __size_t, std::align_val_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -2123,8 +2115,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -3890,8 +3881,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3965,8 +3955,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -4096,8 +4085,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -4171,8 +4159,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": 
"IntegralCast", @@ -4993,8 +4980,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "long", -// CHECK-NEXT: "qualType": "__ptrdiff_t" +// CHECK-NEXT: "qualType": "long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "value": "10" @@ -6517,8 +6503,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "long" -// CHECK-NEXT: "qualType": "__ptrdiff_t" +// CHECK-NEXT: "qualType": "long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "value": "10" diff --git a/clang/test/AST/ast-dump-stmt.cpp b/clang/test/AST/ast-dump-stmt.cpp index 42c5f3b3498a4..407584e5b82de 100644 --- a/clang/test/AST/ast-dump-stmt.cpp +++ b/clang/test/AST/ast-dump-stmt.cpp @@ -206,7 +206,7 @@ void TestIteration() { // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'int *' '+' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' - // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} '__ptrdiff_t':'long' 10 + // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'long' 10 // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' @@ -274,7 +274,7 @@ void TestIteration() { // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'int *' '+' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' - // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} '__ptrdiff_t':'long' 10 + // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'long' 10 // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' diff --git a/clang/test/AST/ast-dump-traits.cpp 
b/clang/test/AST/ast-dump-traits.cpp index 72d2a2ae8603e..3085e5883fd2e 100644 --- a/clang/test/AST/ast-dump-traits.cpp +++ b/clang/test/AST/ast-dump-traits.cpp @@ -56,7 +56,7 @@ void test_unary_expr_or_type_trait() { // CHECK-NEXT: |-FunctionDecl {{.*}} line:20:6{{( imported)?}} test_array_type_trait 'void ()' // CHECK-NEXT: | `-CompoundStmt {{.*}} // CHECK-NEXT: | `-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-ArrayTypeTraitExpr {{.*}} '__size_t':'unsigned long' __array_rank +// CHECK-NEXT: | `-ArrayTypeTraitExpr {{.*}} 'unsigned long' __array_rank // CHECK-NEXT: |-FunctionDecl {{.*}} line:25:6{{( imported)?}} test_expression_trait 'void ()' // CHECK-NEXT: | `-CompoundStmt {{.*}} // CHECK-NEXT: | `-CStyleCastExpr {{.*}} 'void' @@ -64,8 +64,8 @@ void test_unary_expr_or_type_trait() { // CHECK-NEXT: `-FunctionDecl {{.*}} line:30:6{{( imported)?}} test_unary_expr_or_type_trait 'void ()' // CHECK-NEXT: `-CompoundStmt {{.*}} // CHECK-NEXT: |-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' sizeof 'int' +// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' sizeof 'int' // CHECK-NEXT: |-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' alignof 'int' +// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' alignof 'int' // CHECK-NEXT: `-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' __alignof 'int' +// CHECK-NEXT: `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' __alignof 'int' diff --git a/clang/test/AST/ast-dump-types-errors-json.cpp b/clang/test/AST/ast-dump-types-errors-json.cpp index d9f918f6c3d72..e15f8eeee20cc 100644 --- a/clang/test/AST/ast-dump-types-errors-json.cpp +++ b/clang/test/AST/ast-dump-types-errors-json.cpp @@ -60,8 +60,7 @@ using TestContainsErrors = int[sizeof(undef())]; // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: 
"desugaredQualType": "unsigned long", -// CHECK-NEXT: "qualType": "__size_t" +// CHECK-NEXT: "qualType": "unsigned long" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", diff --git a/clang/test/Analysis/cfg.cpp b/clang/test/Analysis/cfg.cpp index d6cef88dc18a6..44a89df28e3b2 100644 --- a/clang/test/Analysis/cfg.cpp +++ b/clang/test/Analysis/cfg.cpp @@ -70,7 +70,7 @@ void F(EmptyE e) { // CHECK-NEXT: Succs (1): B1 // CHECK: [B1] // CHECK-NEXT: 1: __builtin_object_size -// CHECK-NEXT: 2: [B1.1] (ImplicitCastExpr, BuiltinFnToFnPtr, __size_t (*)(const void *, int) noexcept) +// CHECK-NEXT: 2: [B1.1] (ImplicitCastExpr, BuiltinFnToFnPtr, unsigned long (*)(const void *, int) noexcept) // CHECK-NEXT: 3: [B1.2](dummy(), 0) // CHECK-NEXT: 4: (void)[B1.3] (CStyleCastExpr, ToVoid, void) // CHECK-NEXT: Preds (1): B2 diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp index dfc650223c9e7..267980c3b20c8 100644 --- a/clang/test/Analysis/explain-svals.cpp +++ b/clang/test/Analysis/explain-svals.cpp @@ -46,7 +46,7 @@ void test_1(int param, void *ptr) { void test_2(char *ptr, int ext) { clang_analyzer_explain((void *) "asdf"); // expected-warning-re{{{{^pointer to element of type 'char' with index 0 of string literal "asdf"$}}}} - clang_analyzer_explain(strlen(ptr)); // expected-warning-re{{{{^metadata of type '__size_t' tied to pointee of argument 'ptr'$}}}} + clang_analyzer_explain(strlen(ptr)); // expected-warning-re{{{{^metadata of type 'unsigned long' tied to pointee of argument 'ptr'$}}}} clang_analyzer_explain(conjure()); // expected-warning-re{{{{^symbol of type 'int' conjured at CFG element 'conjure\(\)'$}}}} clang_analyzer_explain(glob); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure\(\)'\) for global variable 'glob'$}}}} clang_analyzer_explain(glob_ptr); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at 
CFG element 'conjure\(\)'\) for global variable 'glob_ptr'$}}}} diff --git a/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c b/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c index ba5bc57928b0c..1f0d3627fae34 100644 --- a/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c +++ b/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c @@ -20,7 +20,7 @@ // RUN: -triple x86_64-unknown-linux 2>&1 | FileCheck %s // CHECK: Loaded summary for: int isalnum(int) -// CHECK: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) __attribute__((nonnull(1))) +// CHECK: Loaded summary for: unsigned long fread(void *restrict, size_t, size_t, FILE *restrict) __attribute__((nonnull(1))) // CHECK: Loaded summary for: int fileno(FILE *stream) void initializeSummaryMap(void); diff --git a/clang/test/Analysis/std-c-library-functions-lookup.c b/clang/test/Analysis/std-c-library-functions-lookup.c index 8182e5a1f5fde..e47d9bddda91b 100644 --- a/clang/test/Analysis/std-c-library-functions-lookup.c +++ b/clang/test/Analysis/std-c-library-functions-lookup.c @@ -6,7 +6,7 @@ // RUN: -analyzer-config eagerly-assume=false \ // RUN: -triple i686-unknown-linux 2>&1 | FileCheck %s -// CHECK: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) typedef typeof(sizeof(int)) size_t; typedef struct FILE FILE; diff --git a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c index 887817ba8551e..b99cc30149c91 100644 --- a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c +++ b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c @@ -31,8 +31,8 @@ // Verify that the summaries are loaded when the StdLibraryFunctionsChecker is // enabled. 
// CHECK: Loaded summary for: int getchar(void) -// CHECK-NEXT: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) -// CHECK-NEXT: Loaded summary for: __size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: unsigned long fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: unsigned long fwrite(const void *restrict, size_t, size_t, FILE *restrict) #include "Inputs/system-header-simulator.h" diff --git a/clang/test/Analysis/std-c-library-functions.c b/clang/test/Analysis/std-c-library-functions.c index b5f663493a676..b03a1a5656517 100644 --- a/clang/test/Analysis/std-c-library-functions.c +++ b/clang/test/Analysis/std-c-library-functions.c @@ -59,8 +59,8 @@ // CHECK-NEXT: Loaded summary for: int tolower(int) // CHECK-NEXT: Loaded summary for: int toascii(int) // CHECK-NEXT: Loaded summary for: int getchar(void) -// CHECK-NEXT: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) -// CHECK-NEXT: Loaded summary for: __size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: unsigned int fwrite(const void *restrict, size_t, size_t, FILE *restrict) // CHECK-NEXT: Loaded summary for: ssize_t read(int, void *, size_t) // CHECK-NEXT: Loaded summary for: ssize_t write(int, const void *, size_t) // CHECK-NEXT: Loaded summary for: ssize_t getline(char **restrict, size_t *restrict, FILE *restrict) diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp index 556407afa2641..a53a8d1ed64a8 100644 --- a/clang/test/CXX/drs/cwg2xx.cpp +++ b/clang/test/CXX/drs/cwg2xx.cpp @@ -1429,7 +1429,7 @@ namespace cwg299 { // cwg299: 2.8 c++11 // cxx98-11-error@#cwg299-q {{ambiguous conversion of array size expression of type 'T' to an integral or enumeration type}} // 
cxx98-11-note@#cwg299-int {{conversion to integral type 'int' declared here}} // cxx98-11-note@#cwg299-ushort {{conversion to integral type 'unsigned short' declared here}} - // since-cxx14-error-re@#cwg299-q {{conversion from 'T' to '__size_t' (aka 'unsigned {{long long|long|int}}') is ambiguous}} + // since-cxx14-error-re@#cwg299-q {{{{conversion from 'T' to 'unsigned (long long|long|int)' is ambiguous}}}} // since-cxx14-note@#cwg299-int {{candidate function}} // since-cxx14-note@#cwg299-ushort {{candidate function}} } // namespace cwg299 diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp index d439f304b5101..6942b68690c5d 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp @@ -5,11 +5,11 @@ typedef decltype(sizeof(int)) size_t; // FIXME: These diagnostics should say 'size_t' instead of 'unsigned long' int a = 123_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'unsigned long long' or 'const char *', and no matching literal operator template}} int b = 4.2_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'long double' or 'const char *', and no matching literal operator template}} -int c = "foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and '__size_t' (aka 'unsigned}} -int d = L"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const wchar_t *' and '__size_t' (aka 'unsigned}} -int e = u8"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and '__size_t' (aka 'unsigned}} -int f = u"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char16_t *' and '__size_t' (aka 'unsigned}} -int g = 
U"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char32_t *' and '__size_t' (aka 'unsigned}} +int c = "foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and 'unsigned}} +int d = L"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const wchar_t *' and 'unsigned}} +int e = u8"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and 'unsigned}} +int f = u"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char16_t *' and 'unsigned}} +int g = U"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char32_t *' and 'unsigned}} int h = 'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'char'}} int i = L'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'wchar_t'}} int j = u'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'char16_t'}} diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp index 463d7854867a2..afadba282e626 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp @@ -13,7 +13,7 @@ float &operator ""_x1 (const char8_t *, size_t); using char8 = double; #endif char8 &i2 = u8"foo"_x1; -double &i3 = L"foo"_x1; // expected-error {{no matching literal operator for call to 'operator""_x1' with arguments of types 'const wchar_t *' and '__size_t' (aka 'unsigned long')}} +double &i3 = L"foo"_x1; // expected-error {{no matching literal operator for call to 'operator""_x1' with arguments of types 'const wchar_t *' 
and 'unsigned long'}} char &operator ""_x1(const wchar_t *, size_t); char &i4 = L"foo"_x1; // ok @@ -46,8 +46,8 @@ template float &operator""_s(); void no_fallback() { "hello"_s; // FIXME: It'd be useful to explain what candidates were found and why they didn't work. - "xyzzy"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long'), and no matching literal operator template}} - "yello"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long'), and no matching literal operator template}} + "xyzzy"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and 'unsigned long', and no matching literal operator template}} + "yello"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and 'unsigned long', and no matching literal operator template}} } double &operator""_s(const char*, size_t); diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp index 17d9c83055a1c..d571fcb8697eb 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp @@ -17,7 +17,7 @@ int main() { auto v1 = 1.2_w; // calls operator""_w(1.2L) auto v2 = u"one"_w; // calls operator""_w(u"one", 3) auto v3 = 12_w; // calls operator""_w("12") - "two"_w; // expected-error {{no matching literal operator for call to 'operator""_w' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long')}} + "two"_w; // expected-error {{no matching literal operator for call to 'operator""_w' with arguments of types 'const char *' and 'unsigned long'}} same_type test1; same_type test2; diff --git a/clang/test/FixIt/fixit-format-ios-nopedantic.m 
b/clang/test/FixIt/fixit-format-ios-nopedantic.m index 836a4b5372f13..db9ac797c2472 100644 --- a/clang/test/FixIt/fixit-format-ios-nopedantic.m +++ b/clang/test/FixIt/fixit-format-ios-nopedantic.m @@ -1,5 +1,5 @@ // RUN: cp %s %t -// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -Wformat -fixit %t +// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -Wformat -Werror -fixit %t int printf(const char *restrict, ...); typedef unsigned int NSUInteger; diff --git a/clang/test/FixIt/format.m b/clang/test/FixIt/format.m index e97ae10c974aa..950765bad9339 100644 --- a/clang/test/FixIt/format.m +++ b/clang/test/FixIt/format.m @@ -237,14 +237,14 @@ void testSizeTypes(void) { printf("%zu", 0.f); // expected-warning-re{{format specifies type 'size_t' (aka '{{.+}}') but the argument has type 'float'}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%f" - printf("%zd", 0.f); // expected-warning-re{{format specifies type 'signed size_t' (aka '{{.+}}') but the argument has type 'float'}} + printf("%zd", 0.f); // expected-warning-re{{format specifies type 'ssize_t' (aka '{{.+}}') but the argument has type 'float'}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%f" short x; #if !defined(__ANDROID__) && !defined(__Fuchsia__) - printf("%zn", &x); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'short *'}} + printf("%zn", &x); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'short *'}} #else - printf("%zn", &x); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'short *'}} + printf("%zn", &x); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'short *'}} // expected-warning@-1 {{'%n' specifier not supported on this platform}} #endif // !defined(__ANDROID__) && !defined(__Fuchsia__) // PrintfSpecifier::fixType doesn't handle %n, so a fix-it is not emitted, 
diff --git a/clang/test/Sema/format-strings-fixit-ssize_t.c b/clang/test/Sema/format-strings-fixit-ssize_t.c index 96806517b80f2..2c83db0b66362 100644 --- a/clang/test/Sema/format-strings-fixit-ssize_t.c +++ b/clang/test/Sema/format-strings-fixit-ssize_t.c @@ -11,8 +11,8 @@ int printf(char const *, ...); int scanf(const char *, ...); -typedef long ssize_t; void test(void) { + typedef signed long int ssize_t; printf("%f", (ssize_t) 42); ssize_t s; scanf("%f", &s); diff --git a/clang/test/Sema/format-strings-int-typedefs.c b/clang/test/Sema/format-strings-int-typedefs.c index 8f85e68b067df..341d49c500f43 100644 --- a/clang/test/Sema/format-strings-int-typedefs.c +++ b/clang/test/Sema/format-strings-int-typedefs.c @@ -6,8 +6,8 @@ int scanf(char const *, ...); void test(void) { printf("%jd", 42.0); // expected-warning {{format specifies type 'intmax_t' (aka 'long long')}} printf("%ju", 42.0); // expected-warning {{format specifies type 'uintmax_t' (aka 'unsigned long long')}} - printf("%zu", 42.0); // expected-warning {{format specifies type 'size_t' (aka '__size_t')}} - printf("%td", 42.0); // expected-warning {{format specifies type 'ptrdiff_t' (aka '__ptrdiff_t')}} + printf("%zu", 42.0); // expected-warning {{format specifies type 'size_t' (aka 'unsigned long')}} + printf("%td", 42.0); // expected-warning {{format specifies type 'ptrdiff_t' (aka 'int')}} printf("%lc", 42.0); // expected-warning {{format specifies type 'wint_t' (aka 'int')}} printf("%ls", 42.0); // expected-warning {{format specifies type 'wchar_t *' (aka 'int *')}} printf("%S", 42.0); // expected-warning {{format specifies type 'wchar_t *' (aka 'int *')}} @@ -15,8 +15,8 @@ void test(void) { scanf("%jd", 0); // expected-warning {{format specifies type 'intmax_t *' (aka 'long long *')}} scanf("%ju", 0); // expected-warning {{format specifies type 'uintmax_t *' (aka 'unsigned long long *')}} - scanf("%zu", 0); // expected-warning {{format specifies type 'size_t *' (aka '__size_t *')}} - scanf("%td", 
0); // expected-warning {{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *')}} + scanf("%zu", 0); // expected-warning {{format specifies type 'size_t *' (aka 'unsigned long *')}} + scanf("%td", 0); // expected-warning {{format specifies type 'ptrdiff_t *' (aka 'int *')}} scanf("%lc", 0); // expected-warning {{format specifies type 'wchar_t *' (aka 'int *')}} scanf("%ls", 0); // expected-warning {{format specifies type 'wchar_t *' (aka 'int *')}} scanf("%S", 0); // expected-warning {{format specifies type 'wchar_t *' (aka 'int *')}} @@ -32,6 +32,6 @@ void test(void) { // The warning still fires, because it checks the underlying type. printf("%jd", (intmax_t)42); // expected-warning {{format specifies type 'intmax_t' (aka 'long long') but the argument has type 'intmax_t' (aka 'void *')}} printf("%ju", (uintmax_t)42); // expected-warning {{format specifies type 'uintmax_t' (aka 'unsigned long long') but the argument has type 'uintmax_t' (aka 'void *')}} - printf("%zu", (size_t)42); // expected-warning {{format specifies type 'size_t' (aka '__size_t') but the argument has type 'size_t' (aka 'void *')}} - printf("%td", (ptrdiff_t)42); // expected-warning {{format specifies type 'ptrdiff_t' (aka '__ptrdiff_t') but the argument has type 'ptrdiff_t' (aka 'void *')}} + printf("%zu", (size_t)42); // expected-warning {{format specifies type 'size_t' (aka 'unsigned long') but the argument has type 'size_t' (aka 'void *')}} + printf("%td", (ptrdiff_t)42); // expected-warning {{format specifies type 'ptrdiff_t' (aka 'int') but the argument has type 'ptrdiff_t' (aka 'void *')}} } diff --git a/clang/test/Sema/format-strings-scanf.c b/clang/test/Sema/format-strings-scanf.c index 0e48a760e457a..eb5b8ec36bf7a 100644 --- a/clang/test/Sema/format-strings-scanf.c +++ b/clang/test/Sema/format-strings-scanf.c @@ -210,13 +210,13 @@ void test_size_types(void) { scanf("%zd", &s); // No warning. 
double d2 = 0.; - scanf("%zd", &d2); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'double *'}} + scanf("%zd", &d2); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'double *'}} ssize_t sn = 0; scanf("%zn", &sn); // No warning. double d3 = 0.; - scanf("%zn", &d3); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'double *'}} + scanf("%zn", &d3); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'double *'}} } void test_ptrdiff_t_types(void) { @@ -231,13 +231,13 @@ void test_ptrdiff_t_types(void) { scanf("%td", &p2); // No warning. double d2 = 0.; - scanf("%td", &d2); // expected-warning{{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *') but the argument has type 'double *'}} + scanf("%td", &d2); // expected-warning-re{{format specifies type 'ptrdiff_t *' (aka '{{.+}}') but the argument has type 'double *'}} ptrdiff_t p3 = 0; scanf("%tn", &p3); // No warning. 
double d3 = 0.; - scanf("%tn", &d3); // expected-warning{{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *') but the argument has type 'double *'}} + scanf("%tn", &d3); // expected-warning-re{{format specifies type 'ptrdiff_t *' (aka '{{.+}}') but the argument has type 'double *'}} } void check_conditional_literal(char *s, int *i) { diff --git a/clang/test/Sema/format-strings-size_t.c b/clang/test/Sema/format-strings-size_t.c index 08efc1fa25066..5058a762183d3 100644 --- a/clang/test/Sema/format-strings-size_t.c +++ b/clang/test/Sema/format-strings-size_t.c @@ -4,14 +4,14 @@ int printf(char const *, ...); void test(void) { // size_t - printf("%zu", (double)42); // expected-warning {{format specifies type 'size_t' (aka '__size_t') but the argument has type 'double'}} + printf("%zu", (double)42); // expected-warning {{format specifies type 'size_t' (aka 'unsigned long') but the argument has type 'double'}} // intmax_t / uintmax_t printf("%jd", (double)42); // expected-warning {{format specifies type 'intmax_t' (aka 'long') but the argument has type 'double'}} printf("%ju", (double)42); // expected-warning {{format specifies type 'uintmax_t' (aka 'unsigned long') but the argument has type 'double'}} // ptrdiff_t - printf("%td", (double)42); // expected-warning {{format specifies type 'ptrdiff_t' (aka '__ptrdiff_t') but the argument has type 'double'}} + printf("%td", (double)42); // expected-warning {{format specifies type 'ptrdiff_t' (aka 'long') but the argument has type 'double'}} } void test_writeback(void) { @@ -19,9 +19,10 @@ void test_writeback(void) { printf("%jn", (unsigned long*)0); // no-warning printf("%jn", (int*)0); // expected-warning{{format specifies type 'intmax_t *' (aka 'long *') but the argument has type 'int *'}} - printf("%zn", (int*)0); // expected-warning{{format specifies type 'signed size_t *' (aka '__signed_size_t *') but the argument has type 'int *'}} + printf("%zn", (long*)0); // no-warning + // FIXME: Warn about %zn with 
non-ssize_t argument. - printf("%tn", (long*)0); // expected-warning{{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *') but the argument has type 'long *'}} - printf("%tn", (unsigned long*)0); // expected-warning{{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *') but the argument has type 'unsigned long *'}} - printf("%tn", (int*)0); // expected-warning{{format specifies type 'ptrdiff_t *' (aka '__ptrdiff_t *') but the argument has type 'int *'}} + printf("%tn", (long*)0); // no-warning + printf("%tn", (unsigned long*)0); // no-warning + printf("%tn", (int*)0); // expected-warning{{format specifies type 'ptrdiff_t *' (aka 'long *') but the argument has type 'int *'}} } diff --git a/clang/test/Sema/matrix-type-builtins.c b/clang/test/Sema/matrix-type-builtins.c index 77e3b8a4287ed..b92f3ce6a3e8c 100644 --- a/clang/test/Sema/matrix-type-builtins.c +++ b/clang/test/Sema/matrix-type-builtins.c @@ -73,13 +73,13 @@ void column_major_load(float *p1, int *p2, _Bool *p3, struct Foo *p4) { 10, // expected-error {{1st argument must be a pointer to a valid matrix element type}} 1ull << 21, // expected-error {{row dimension is outside the allowed range [1, 1048575]}} 1ull << 21, // expected-error {{column dimension is outside the allowed range [1, 1048575]}} - ""); // expected-error {{incompatible pointer to integer conversion casting 'char[1]' to type '__size_t' (aka 'unsigned long')}} + ""); // expected-error {{incompatible pointer to integer conversion casting 'char[1]' to type 'unsigned long'}} sx5x10_t a13 = __builtin_matrix_column_major_load( 10, // expected-error {{1st argument must be a pointer to a valid matrix element type}} - *p4, // expected-error {{casting 'struct Foo' to incompatible type '__size_t' (aka 'unsigned long')}} + *p4, // expected-error {{casting 'struct Foo' to incompatible type 'unsigned long'}} "", // expected-error {{column argument must be a constant unsigned integer expression}} - // expected-error@-1 {{incompatible pointer to 
integer conversion casting 'char[1]' to type '__size_t' (aka 'unsigned long')}} + // expected-error@-1 {{incompatible pointer to integer conversion casting 'char[1]' to type 'unsigned long'}} 10); } @@ -96,7 +96,7 @@ void column_major_store(sx5x10_t *m1, ix3x2_t *m2, float *p1, int *p2, struct Fo __builtin_matrix_column_major_store( "", // expected-error {{1st argument must be a matrix}} 10, // expected-error {{2nd argument must be a pointer to a valid matrix element type}} - *p3); // expected-error {{casting 'struct Foo' to incompatible type '__size_t' (aka 'unsigned long')}} + *p3); // expected-error {{casting 'struct Foo' to incompatible type 'unsigned long'}} __builtin_matrix_column_major_store( *m1, diff --git a/clang/test/Sema/ptrauth-atomic-ops.c b/clang/test/Sema/ptrauth-atomic-ops.c index 8872090d83b8d..ccb9a1abcc14d 100644 --- a/clang/test/Sema/ptrauth-atomic-ops.c +++ b/clang/test/Sema/ptrauth-atomic-ops.c @@ -54,7 +54,7 @@ void f() { __c11_atomic_exchange(ATOMIZE(j), ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'typeof (j)' (aka 'int')}} __c11_atomic_fetch_add(ATOMIZE(non_addr_discriminatedauthenticated_ptr), ATOMIZE(j), memory_order_seq_cst); - // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile _Atomic(typeof (j)) *' to parameter of type '__ptrdiff_t'}} + // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile _Atomic(typeof (j)) *' to parameter of type 'long'}} __c11_atomic_fetch_and(ATOMIZE(j), ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'typeof (j)' (aka 'int')}} diff --git a/clang/test/Sema/ptrauth.c b/clang/test/Sema/ptrauth.c index 
b4e5214a7cb50..e3932615c2962 100644 --- a/clang/test/Sema/ptrauth.c +++ b/clang/test/Sema/ptrauth.c @@ -57,7 +57,7 @@ void test_string_discriminator(const char *str) { __builtin_ptrauth_string_discriminator(str); // expected-error {{argument must be a string literal}} __builtin_ptrauth_string_discriminator(L"wide test"); // expected-error {{argument must be a string literal}} expected-warning {{incompatible pointer types passing 'int[10]' to parameter of type 'const char *'}} - void *mismatch = __builtin_ptrauth_string_discriminator("test string"); // expected-error {{incompatible integer to pointer conversion initializing 'void *' with an expression of type '__size_t'}} + void *mismatch = __builtin_ptrauth_string_discriminator("test string"); // expected-error {{incompatible integer to pointer conversion initializing 'void *' with an expression of type 'unsigned long'}} } diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp index c6919447798da..6f4003f525930 100644 --- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp +++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp @@ -372,7 +372,7 @@ void test__builtin_trivially_relocate() { __builtin_trivially_relocate((S*)0, 0, 0); //expected-error {{argument to '__builtin_trivially_relocate' must be relocatable}} __builtin_trivially_relocate((int*)0, 0, 0); //expected-error {{first and second arguments to '__builtin_trivially_relocate' must be of the same type}} - __builtin_trivially_relocate((int*)0, (int*)0, (int*)0); // expected-error-re {{cannot initialize a value of type '__size_t' (aka '{{.*}}') with an rvalue of type 'int *'}} + __builtin_trivially_relocate((int*)0, (int*)0, (int*)0); // expected-error-re {{cannot initialize a value of type '{{.*}}' with an rvalue of type 'int *'}} __builtin_trivially_relocate((int*)0, (int*)0, 0); __builtin_trivially_relocate((R*)0, (R*)0, 0); } diff --git a/clang/test/SemaCXX/enum-scoped.cpp 
b/clang/test/SemaCXX/enum-scoped.cpp index 2d7b3c9557ebd..0ce47274979d9 100644 --- a/clang/test/SemaCXX/enum-scoped.cpp +++ b/clang/test/SemaCXX/enum-scoped.cpp @@ -35,7 +35,7 @@ int a1[Val2]; int a2[E1::Val1]; #if __cplusplus >= 201703L -// expected-error@-3 {{type 'E1' is not implicitly convertible to '__size_t' (aka 'unsigned long')}} +// expected-error@-3 {{type 'E1' is not implicitly convertible to 'unsigned long'}} #else // expected-error@-5 {{size of array has non-integer type}} #endif @@ -44,7 +44,7 @@ int* p1 = new int[Val2]; int* p2 = new int[E1::Val1]; #if __cplusplus >= 201703L -// expected-error@-3 {{converting 'E1' to incompatible type '__size_t'}} +// expected-error@-3 {{converting 'E1' to incompatible type 'unsigned long'}} #else // expected-error@-5 {{array size expression must have integral or unscoped enumeration type, not 'E1'}} #endif diff --git a/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp b/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp index 91c4ffda9d818..0b76fdd92dabd 100644 --- a/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp +++ b/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp @@ -22,7 +22,7 @@ void test_non_last_argument(int i, int j, ...) { va_list ap; __va_start(&ap, &i, 4); // expected-error@-1{{passing 'int *' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int *' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} + // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} } void test_stack_allocated(int i, ...) { @@ -30,13 +30,13 @@ void test_stack_allocated(int i, ...) 
{ int j; __va_start(&ap, &j, 4); // expected-error@-1{{passing 'int *' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int *' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} + // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} } void test_non_pointer_addressof(int i, ...) { va_list ap; __va_start(&ap, 1, 4); // expected-error@-1{{passing 'int' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} + // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} } diff --git a/clang/test/SemaCXX/new-delete.cpp b/clang/test/SemaCXX/new-delete.cpp index c05130bb30729..f918501554f80 100644 --- a/clang/test/SemaCXX/new-delete.cpp +++ b/clang/test/SemaCXX/new-delete.cpp @@ -109,7 +109,7 @@ void bad_news(int *ip) #elif __cplusplus <= 201103L // expected-error@-4 {{array size expression must have integral or unscoped enumeration type, not 'double'}} #else - // expected-warning@-6 {{implicit conversion from 'double' to '__size_t' (aka 'unsigned int') changes value from 1.1 to 1}} + // expected-warning@-6 {{implicit conversion from 'double' to 'unsigned int' changes value from 1.1 to 1}} #endif (void)new int[1][i]; // expected-note {{read of non-const variable 'i' is not allowed in a constant expression}} diff --git a/clang/test/SemaCXX/static-assert-cxx26.cpp b/clang/test/SemaCXX/static-assert-cxx26.cpp index b2ebd2abb785e..b53c67ee67932 100644 --- 
a/clang/test/SemaCXX/static-assert-cxx26.cpp +++ b/clang/test/SemaCXX/static-assert-cxx26.cpp @@ -19,7 +19,7 @@ struct InvalidSize { const char* data() const; }; static_assert(true, InvalidSize{}); // expected-error {{the message in a static assertion must have a 'size()' member function returning an object convertible to 'std::size_t'}} \ - // expected-error {{value of type 'const char *' is not implicitly convertible to '__size_t' (aka 'unsigned long')}} + // expected-error {{value of type 'const char *' is not implicitly convertible to 'unsigned long'}} struct InvalidData { unsigned long size() const; unsigned long data() const; @@ -371,13 +371,13 @@ struct E { static_assert(true, A{}); // expected-error {{the message in this static assertion is not a constant expression}} // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, B{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} +static_assert(true, B{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, C{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} +static_assert(true, C{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, D{}); // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be 
narrowed to type '__size_t' (aka 'unsigned long')}} +static_assert(true, D{}); // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} static_assert(true, E{}); // expected-error {{the message in this static assertion is not a constant expression}} @@ -391,21 +391,21 @@ static_assert( static_assert( false, // expected-error {{static assertion failed}} - B{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} + B{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); static_assert( false, // expected-error {{static assertion failed}} - C{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} + C{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); static_assert( false, // expected-error {{static assertion failed}} - D{} // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} + D{} // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}} // 
expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); diff --git a/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp b/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp index 281ef5fa63d6f..87dc58861ee81 100644 --- a/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp +++ b/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp @@ -75,7 +75,7 @@ template void *operator new(std::type_identity, U); template void operator delete(std::type_identity, U, size_t, std::align_val_t); // expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 2nd parameter; use 'void *' instead}} template void operator delete(std::type_identity, void *, U, std::align_val_t); -// expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 3rd parameter; use '__size_t' (aka 'unsigned long') instead}} +// expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 3rd parameter; use 'unsigned long' instead}} template void operator delete(std::type_identity, void *, size_t, U); // expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 4th parameter; use 'std::align_val_t' instead}} template void *operator new(std::type_identity, typename S::size_ty, std::align_val_t); diff --git a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp index 56c564f170271..45fdec606ad1b 100644 --- a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp +++ b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp @@ -65,12 +65,12 @@ void testOveraligned() { #ifdef NO_ERRORS // expected-no-diagnostics #else -// expected-error-re@-16 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} 
available on}} +// expected-error-re@-16 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-17 {{if you supply your own aligned allocation functions}} // expected-error-re@-18 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-19 {{if you supply your own aligned allocation functions}} -// expected-error-re@-20 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-20 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-21 {{if you supply your own aligned allocation functions}} // expected-error-re@-22 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-23 {{if you supply your own aligned allocation functions}} @@ -83,12 +83,12 @@ void testOveraligned() { // expected-error-re@-28 {{aligned deallocation function of type 'void (void *, std::align_val_t, const std::nothrow_t &) noexcept' is {{only|not}} available on}} // expected-note@-29 {{if you supply your own aligned allocation functions}} -// expected-error-re@-29 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-29 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-30 {{if you supply your own aligned allocation functions}} // expected-error-re@-31 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-32 {{if you supply your own aligned allocation functions}} -// expected-error-re@-33 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' 
is {{only|not}} available on}} +// expected-error-re@-33 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-34 {{if you supply your own aligned allocation functions}} // expected-error-re@-35 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-36 {{if you supply your own aligned allocation functions}} @@ -111,19 +111,19 @@ void testOveralignedCheckOS() { // expected-no-diagnostics #else #if defined(IOS) -// expected-error@-7 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on iOS 11 or newer}} +// expected-error@-7 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on iOS 11 or newer}} // expected-error@-8 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on iOS 11 or newer}}} #elif defined(TVOS) -// expected-error@-10 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on tvOS 11 or newer}}} +// expected-error@-10 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on tvOS 11 or newer}}} // expected-error@-11 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on tvOS 11 or newer}}} #elif defined(WATCHOS) -// expected-error@-13 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on watchOS 4 or newer}}} +// expected-error@-13 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on watchOS 4 or newer}}} // expected-error@-14 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on watchOS 4 or newer}}} #elif defined(MACOS) -// expected-error@-16 
{{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on macOS 10.13 or newer}}} +// expected-error@-16 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on macOS 10.13 or newer}}} // expected-error@-17 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on macOS 10.13 or newer}}} #elif defined(ZOS) -// expected-error@-19 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is not available on z/OS}}} +// expected-error@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is not available on z/OS}}} // expected-error@-20 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is not available on z/OS}}} #endif @@ -181,19 +181,19 @@ void testExplicitOperatorNewDeleteOveraligned() { #ifdef NO_ERRORS // expected-no-diagnostics #else -// expected-error-re@-11 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-11 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-12 {{if you supply your own aligned allocation functions}} // expected-error-re@-13 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-14 {{if you supply your own aligned allocation functions}} -// expected-error-re@-15 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-15 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-16 {{if you supply your own aligned allocation functions}} // expected-error-re@-17 {{aligned deallocation function of type 'void (void *, 
enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-18 {{if you supply your own aligned allocation functions}} -// expected-error-re@-19 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-20 {{if you supply your own aligned allocation functions}} // expected-error-re@-21 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} diff --git a/clang/test/SemaHLSL/Language/AssignArray.hlsl b/clang/test/SemaHLSL/Language/AssignArray.hlsl index 16b60fe40f806..1f813e7a350b1 100644 --- a/clang/test/SemaHLSL/Language/AssignArray.hlsl +++ b/clang/test/SemaHLSL/Language/AssignArray.hlsl @@ -13,7 +13,7 @@ export void fn(int8 A) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' // CHECK-NEXT: OpaqueValueExpr {{.*}} 'int8':'vector[2]' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'int8':'vector[2]' lvalue Var {{.*}} 'a' 'int8':'vector[2]' -// CHECK-NEXT: ArrayInitIndexExpr {{.*}} '__size_t':'unsigned long' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' int8 b = a; // CHECK-LABEL: VarDecl {{.*}} c 'int8':'vector[2]' cinit @@ -25,7 +25,7 @@ export void fn(int8 A) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' // CHECK-NEXT: OpaqueValueExpr {{.*}} 'vector[2]' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'vector[2]' lvalue ParmVar {{.*}} 'A' 'vector[2]' -// CHECK-NEXT: ArrayInitIndexExpr {{.*}} '__size_t':'unsigned long' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' int8 c = A; } diff --git a/clang/test/SemaHLSL/Language/InitListAST.hlsl b/clang/test/SemaHLSL/Language/InitListAST.hlsl index 460ec38bb44af..78bf269769ae6 100644 --- a/clang/test/SemaHLSL/Language/InitListAST.hlsl +++ b/clang/test/SemaHLSL/Language/InitListAST.hlsl @@ -97,12 +97,12 @@ TwoFloats 
case3(int Val) { // CHECK-NEXT: ImplicitCastExpr {{.*}}'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 TwoFloats case4(int2 TwoVals) { TwoFloats TF4 = {TwoVals}; return TF4; @@ -115,11 +115,11 @@ TwoFloats case4(int2 TwoVals) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 TwoInts case5(int2 TwoVals) { TwoInts TI1 = {TwoVals}; return TI1; @@ -209,22 +209,22 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// 
CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} @@ -240,32 +240,32 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: 
ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue 
vectorcomponent @@ -273,32 +273,32 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 
'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 AnimalBits case8(Doggo D1) { AnimalBits A1 = {D1}; return A1; @@ -317,22 +317,22 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral 
{{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -347,32 +347,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: 
IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -380,32 +380,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: 
ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Doggo' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -413,25 +413,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 
'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -446,43 +446,43 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned 
long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 
'__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh[4]' // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' @@ -490,22 +490,22 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // 
CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -520,32 +520,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral 
{{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -553,32 +553,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 
{{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -586,25 +586,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' 
// CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -619,65 +619,65 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: 
IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr 
{{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 
'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -692,32 +692,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned 
long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -725,32 +725,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned 
long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -758,25 +758,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // 
CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -791,43 +791,43 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar 
{{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue 
.RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 Zoo case9(Doggo D1, AnimalBits A1) { Zoo Z1 = {D1, A1, D1, A1, D1, A1}; return Z1; @@ -867,28 +867,28 @@ FourFloats case10(TwoFloats TF1, TwoFloats TF2) { // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 
'__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 FourFloats case11(float F) { FourFloats FF1 = {F.xxxx}; return FF1; @@ -1008,52 +1008,52 @@ FourFloats case16() { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' 
// CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: 
ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 float case17() { IntAndFloat Structs[] = {1,2,3,4}; float Floats[] = {Structs, Structs}; diff --git a/clang/test/SemaObjC/format-size-spec-nsinteger.m b/clang/test/SemaObjC/format-size-spec-nsinteger.m index f25ce27f345db..8ecca6ec6a544 100644 --- a/clang/test/SemaObjC/format-size-spec-nsinteger.m +++ b/clang/test/SemaObjC/format-size-spec-nsinteger.m @@ -3,6 +3,10 @@ // RUN: %clang_cc1 -triple thumbv7k-apple-watchos2.0.0 -fsyntax-only -fblocks -verify %s // RUN: %clang_cc1 -triple thumbv7k-apple-watchos2.0.0 -fsyntax-only -fblocks -verify -Wformat-pedantic -DPEDANTIC %s +#if !defined(PEDANTIC) +// expected-no-diagnostics +#endif + #if __LP64__ typedef unsigned long NSUInteger; typedef long NSInteger; @@ -26,10 +30,12 @@ void testSizeSpecifier(void) { NSInteger i = 0; NSUInteger j = 0; NSLog(@"max NSInteger = %zi", i); + NSLog(@"max NSUinteger = %zu", j); + #if defined(PEDANTIC) - // expected-warning@-2 {{values of type 'NSInteger' should not be used as format arguments; add an explicit cast to 'long' instead}} + // expected-warning@-4 {{values of type 'NSInteger' should not be used as format arguments; add an explicit cast to 'long' instead}} + // expected-warning@-4 {{values of type 'NSUInteger' should not be used as format arguments; add an explicit cast to 'unsigned long' instead}} #endif - NSLog(@"max NSUinteger = %zu", j); // expected-warning {{values of type 'NSUInteger' should not be used as format arguments; add an explicit cast to 'unsigned long' instead}} } void testPtrdiffSpecifier(ptrdiff_t x) { @@ -37,9 +43,10 @@ void testPtrdiffSpecifier(ptrdiff_t x) { NSUInteger j = 0; NSLog(@"ptrdiff_t NSUinteger: %tu", j); + NSLog(@"ptrdiff_t NSInteger: %td", i); + NSLog(@"ptrdiff_t %tu, %td", x, x); #if 
__is_target_os(watchos) && defined(PEDANTIC) - // expected-warning@-2 {{values of type 'NSUInteger' should not be used as format arguments; add an explicit cast to 'unsigned long' instead}} + // expected-warning@-4 {{values of type 'NSUInteger' should not be used as format arguments; add an explicit cast to 'unsigned long' instead}} + // expected-warning@-4 {{values of type 'NSInteger' should not be used as format arguments; add an explicit cast to 'long' instead}} #endif - NSLog(@"ptrdiff_t NSInteger: %td", i); // expected-warning {{values of type 'NSInteger' should not be used as format arguments; add an explicit cast to 'long' instead}} - NSLog(@"ptrdiff_t %tu, %td", x, x); // no-warning } diff --git a/clang/test/SemaObjC/matrix-type-builtins.m b/clang/test/SemaObjC/matrix-type-builtins.m index 3916017cf0fe0..21b8bf864271d 100644 --- a/clang/test/SemaObjC/matrix-type-builtins.m +++ b/clang/test/SemaObjC/matrix-type-builtins.m @@ -27,5 +27,5 @@ void test_element_type_mismatch(u4x4 m, MatrixValue *mv) { __builtin_matrix_column_major_store(mv.value, mv.value, mv.value); // expected-error@-1 {{2nd argument must be a pointer to a valid matrix element type}} - // expected-error@-2 {{casting 'double4x4' (aka 'double __attribute__((matrix_type(4, 4)))') to incompatible type '__size_t' (aka 'unsigned long')}} + // expected-error@-2 {{casting 'double4x4' (aka 'double __attribute__((matrix_type(4, 4)))') to incompatible type 'unsigned long}} } diff --git a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl index 22569fa7b443c..a44d9dd86b86a 100644 --- a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl @@ -87,7 +87,7 @@ kernel void enqueue_kernel_tests(void) { }, 1024, 4294967296L); #ifdef B32 -// expected-warning@-2{{implicit conversion from 'long' to '__size_t' (aka 'unsigned int') changes value from 4294967296 to 0}} +// expected-warning@-2{{implicit conversion 
from 'long' to 'unsigned int' changes value from 4294967296 to 0}} #endif char c; @@ -97,7 +97,7 @@ kernel void enqueue_kernel_tests(void) { }, c, 1024L); #ifdef WCONV -// expected-warning-re@-2{{implicit conversion changes signedness: 'char' to '__size_t' (aka 'unsigned {{int|long}}')}} +// expected-warning-re@-2{{implicit conversion changes signedness: 'char' to 'unsigned {{int|long}}'}} #endif #define UINT_MAX 4294967295 @@ -107,7 +107,7 @@ kernel void enqueue_kernel_tests(void) { }, sizeof(int), sizeof(int) * UINT_MAX); #ifdef B32 -// expected-warning@-2{{implicit conversion from 'long' to '__size_t' (aka 'unsigned int') changes value from 17179869180 to 4294967292}} +// expected-warning@-2{{implicit conversion from 'long' to 'unsigned int' changes value from 17179869180 to 4294967292}} #endif typedef void (^bl_A_t)(local void *); diff --git a/clang/test/SemaTemplate/type_pack_element.cpp b/clang/test/SemaTemplate/type_pack_element.cpp index 5ff010c7db29c..264b4dcdc044d 100644 --- a/clang/test/SemaTemplate/type_pack_element.cpp +++ b/clang/test/SemaTemplate/type_pack_element.cpp @@ -7,9 +7,9 @@ using test1 = __type_pack_element<0, int>; // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr '0' -// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' +// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' // CHECK-NEXT: | |-value: Int 0 -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' // CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int' 0 // CHECK-NEXT: |-TemplateArgument type 'int' // CHECK-NEXT: | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' @@ -23,7 +23,7 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: 
|-TemplateArgument expr 'N' -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' // CHECK-NEXT: | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int' // CHECK-NEXT: `-TemplateArgument type 'Ts...' // CHECK-NEXT: `-PackExpansionType 0x{{[0-9A-Fa-f]+}} 'Ts...' dependent @@ -37,9 +37,9 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr '0' -// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' +// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' // CHECK-NEXT: | |-value: Int 0 -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' // CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int' 0 // CHECK-NEXT: `-TemplateArgument type 'Ts...' // CHECK-NEXT: `-PackExpansionType 0x{{[0-9A-Fa-f]+}} 'Ts...' 
dependent @@ -53,7 +53,7 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr 'N' -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' // CHECK-NEXT: | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int' // CHECK-NEXT: `-TemplateArgument type 'int' // CHECK-NEXT: `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 9412d9735ef82..75afa87947be4 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -1672,10 +1672,6 @@ bool CursorVisitor::VisitTypedefTypeLoc(TypedefTypeLoc TL) { return Visit(MakeCursorTypeRef(TL.getTypedefNameDecl(), TL.getNameLoc(), TU)); } -bool CursorVisitor::VisitPredefinedSugarTypeLoc(PredefinedSugarTypeLoc TL) { - return false; -} - bool CursorVisitor::VisitUnresolvedUsingTypeLoc(UnresolvedUsingTypeLoc TL) { return Visit(MakeCursorTypeRef(TL.getDecl(), TL.getNameLoc(), TU)); } diff --git a/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp b/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp index c5f152a26a766..4fa4982de88fa 100644 --- a/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp @@ -38,10 +38,10 @@ static void print(const Deque& d) { " : __back_spare() == %zu" " : __capacity() == %zu" " : bytes allocated == %zu\n", - std::size_t(d.size()), - std::size_t(d.__front_spare()), - std::size_t(d.__back_spare()), - std::size_t(d.__capacity()), + d.size(), + d.__front_spare(), + d.__back_spare(), + d.__capacity(), malloc_allocator_base::outstanding_bytes); } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp 
b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 3226e0accc5ea..e847ede1a4ba6 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -2555,7 +2555,6 @@ RemoveWrappingTypes(QualType type, ArrayRef mask = {}) { case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: - case clang::Type::PredefinedSugar: type = type->getLocallyUnqualifiedSingleStepDesugaredType(); break; default: @@ -4131,7 +4130,6 @@ TypeSystemClang::GetTypeClass(lldb::opaque_compiler_type_t type) { case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: - case clang::Type::PredefinedSugar: llvm_unreachable("Handled in RemoveWrappingTypes!"); case clang::Type::UnaryTransform: break; @@ -4842,7 +4840,6 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: - case clang::Type::PredefinedSugar: llvm_unreachable("Handled in RemoveWrappingTypes!"); case clang::Type::UnaryTransform: @@ -5144,7 +5141,6 @@ lldb::Format TypeSystemClang::GetFormat(lldb::opaque_compiler_type_t type) { case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: - case clang::Type::PredefinedSugar: llvm_unreachable("Handled in RemoveWrappingTypes!"); case clang::Type::UnaryTransform: break; From aecd44818adcc26c0535e779629682c76ea44832 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 18 Jul 2025 00:21:04 -0400 Subject: [PATCH 283/813] [AMDGPU] Add support for `v_tanh_f16` on gfx1250 (#149439) Co-authored-by: Mekhanoshin, Stanislav --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 20 +++++ llvm/lib/Target/AMDGPU/VOP1Instructions.td | 5 ++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll | 83 +++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 
45 ++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 48 +++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 56 +++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 60 ++++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 12 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 16 ++++ .../gfx1250_asm_vop3_from_vop1-fake16.s | 45 ++++++++++ .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 48 +++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 56 +++++++++++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 60 ++++++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 16 ++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 20 +++++ .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 63 ++++++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 59 +++++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 16 ++++ .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 64 ++++++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 64 +++++++++++++- .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 20 +++++ 23 files changed, 876 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 4111837d962b5..ed51f1d5de447 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -670,6 +670,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_tanhf, "ff", "nc", "tanh-insts") +TARGET_BUILTIN(__builtin_amdgcn_tanhh, "hh", "nc", "tanh-insts") TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index bcdb488f11639..a7d796ecccc61 100644 --- 
a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -504,6 +504,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, { Src }); } case AMDGPU::BI__builtin_amdgcn_tanhf: + case AMDGPU::BI__builtin_amdgcn_tanhh: case AMDGPU::BI__builtin_amdgcn_tanh_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_tanh); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index e120a46c6327b..738b7ab7f2b75 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -61,6 +61,26 @@ void test_tanh_f32(global float* out, float a) *out = __builtin_amdgcn_tanhf(a); } +// CHECK-LABEL: @test_tanh_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr addrspace(1) [[TMP0]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.amdgcn.tanh.f16(half [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store half [[TMP2]], ptr addrspace(1) [[TMP3]], align 2 +// CHECK-NEXT: ret void +// +void test_tanh_f16(global half* out, global half* a) +{ + *out = __builtin_amdgcn_tanhh(*a); +} + // CHECK-LABEL: @test_tanh_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr 
addrspace(1), align 8, addrspace(5) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 8c35fea8259f4..1bbbb610305e9 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -530,6 +530,10 @@ defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>; defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>; defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; +let SubtargetPredicate = HasTanhInsts in { +defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>; +} + let SubtargetPredicate = HasBF16TransInsts in { defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>; defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>; @@ -1142,6 +1146,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>; defm V_MOV_B64 : VOP1_Real_FULL ; defm V_TANH_F32 : VOP1_Real_FULL; +defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll index 81db7354757d9..dd89f80a54949 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll @@ -7,6 +7,7 @@ ; FIXME: GlobalISel does not work with bf16 declare float @llvm.amdgcn.tanh.f32(float) #0 +declare half @llvm.amdgcn.tanh.f16(half) #0 declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0 define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 { @@ -92,6 +93,88 @@ define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 { ret void } +define amdgpu_kernel void @tanh_f16(ptr addrspace(1) %out, half 
%src) #1 { +; SDAG-REAL16-LABEL: tanh_f16: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, s2 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, s2 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call half @llvm.amdgcn.tanh.f16(half %src) #0 + store half %tanh, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @tanh_f16_constant_4.0(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_f16_constant_4.0: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, 4.0 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f16_constant_4.0: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, 4.0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call half @llvm.amdgcn.tanh.f16(half 4.0) #0 + store half %tanh, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @tanh_f16_constant_100.0(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_f16_constant_100.0: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, 0x5640 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, 
s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_f16_constant_100.0: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, 0x5640 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call half @llvm.amdgcn.tanh.f16(half 100.0) #0 + store half %tanh, ptr addrspace(1) %out, align 2 + ret void +} + +define amdgpu_kernel void @tanh_undef_f16(ptr addrspace(1) %out) #1 { +; SDAG-REAL16-LABEL: tanh_undef_f16: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: tanh_undef_f16: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_endpgm + %tanh = call half @llvm.amdgcn.tanh.f16(half undef) + store half %tanh, ptr addrspace(1) %out, align 2 + ret void +} + define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 { ; SDAG-REAL16-LABEL: tanh_bf16: ; SDAG-REAL16: ; %bb.0: diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index f9e217d1f0361..279bb262bff04 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -73,6 +73,51 @@ v_tanh_f32 v5, src_scc v_tanh_f32 v255, 0xaf123456 // GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf] +v_tanh_f16 v5, v1 +// GFX1250: v_tanh_f16_e32 v5, v1 ; encoding: [0x01,0x3f,0x0a,0x7e] + +v_tanh_f16 v5, v127 +// GFX1250: v_tanh_f16_e32 v5, v127 ; encoding: [0x7f,0x3f,0x0a,0x7e] + +v_tanh_f16 v5, s1 +// GFX1250: v_tanh_f16_e32 v5, s1 ; encoding: [0x01,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, s105 +// GFX1250: v_tanh_f16_e32 v5, s105 ; encoding: [0x69,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, vcc_lo +// GFX1250: v_tanh_f16_e32 v5, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, vcc_hi +// GFX1250: v_tanh_f16_e32 v5, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e] + 
+v_tanh_f16 v5, ttmp15 +// GFX1250: v_tanh_f16_e32 v5, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, m0 +// GFX1250: v_tanh_f16_e32 v5, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, exec_lo +// GFX1250: v_tanh_f16_e32 v5, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, exec_hi +// GFX1250: v_tanh_f16_e32 v5, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, null +// GFX1250: v_tanh_f16_e32 v5, null ; encoding: [0x7c,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, -1 +// GFX1250: v_tanh_f16_e32 v5, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, 0.5 +// GFX1250: v_tanh_f16_e32 v5, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, src_scc +// GFX1250: v_tanh_f16_e32 v5, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e] + +v_tanh_f16 v127, 0x8000 +// GFX1250: v_tanh_f16_e32 v127, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00] + v_tanh_bf16 v5, v1 // GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index d51ef68bf1e19..76272d25d92d4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -73,6 +73,54 @@ v_tanh_f32 v5, src_scc v_tanh_f32 v255, 0xaf123456 // GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf] +v_tanh_f16 v5, v1 +// GFX1250: v_tanh_f16_e32 v5, v1 ; encoding: [0x01,0x3f,0x0a,0x7e] + +v_tanh_f16 v5, v127 +// GFX1250: v_tanh_f16_e32 v5, v127 ; encoding: [0x7f,0x3f,0x0a,0x7e] + +v_tanh_f16 v5, s1 +// GFX1250: v_tanh_f16_e32 v5, s1 ; encoding: [0x01,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, s105 +// GFX1250: v_tanh_f16_e32 v5, s105 ; encoding: [0x69,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, vcc_lo +// GFX1250: v_tanh_f16_e32 v5, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, vcc_hi +// GFX1250: v_tanh_f16_e32 v5, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, ttmp15 +// GFX1250: v_tanh_f16_e32 v5, ttmp15 ; 
encoding: [0x7b,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, m0 +// GFX1250: v_tanh_f16_e32 v5, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, exec_lo +// GFX1250: v_tanh_f16_e32 v5, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, exec_hi +// GFX1250: v_tanh_f16_e32 v5, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, null +// GFX1250: v_tanh_f16_e32 v5, null ; encoding: [0x7c,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, -1 +// GFX1250: v_tanh_f16_e32 v5, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, 0.5 +// GFX1250: v_tanh_f16_e32 v5, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e] + +v_tanh_f16 v5, src_scc +// GFX1250: v_tanh_f16_e32 v5, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e] + +v_tanh_f16 v127, 0x8000 +// GFX1250: v_tanh_f16_e32 v127, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00] + +v_tanh_f16 v5.h, v1.h +// GFX1250: v_tanh_f16_e32 v5.h, v1.h ; encoding: [0x81,0x3f,0x0a,0x7f] + v_tanh_bf16 v5, v1 // GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s index ae22f68e54835..0a8ee84561d33 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s @@ -58,6 +58,62 @@ v_tanh_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi // GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_tanh_f16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_mirror +// GFX1250: v_tanh_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_half_mirror +// GFX1250: v_tanh_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_shl:1 +// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_shl:15 +// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_shr:1 +// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_shr:15 +// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_ror:1 +// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_ror:15 +// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this 
GPU + +v_tanh_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s index 37ecb66bfe809..d4afb9d9b2d9a 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s @@ -58,6 +58,66 @@ v_tanh_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi // GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_tanh_f16 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: 
v_tanh_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_mirror +// GFX1250: v_tanh_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_half_mirror +// GFX1250: v_tanh_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_shl:1 +// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_shl:15 +// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_shr:1 +// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_shr:15 +// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_ror:1 +// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_ror:15 +// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5.h, v1.h quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7f,0x81,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff 
--git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s index f24122e24b70e..a7cb6bf8de69c 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -14,6 +14,18 @@ v_tanh_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_tanh_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_tanh_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s index 34abc829d4eb1..6acab7edc0d49 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -14,6 +14,22 @@ v_tanh_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_tanh_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3c,0xfe,0x7f,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_tanh_f16 v5, v1 
dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7f,0x81,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 340a7857419c4..7486d849253e8 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -172,6 +172,51 @@ v_tanh_f32_e64 v5, src_scc mul:4 v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 // GFX1250: v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] +v_tanh_f16_e64 v5, v1 +// GFX1250: v_tanh_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00] + +v_tanh_f16_e64 v5, v255 +// GFX1250: v_tanh_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00] + +v_tanh_f16_e64 v5, s1 +// GFX1250: v_tanh_f16_e64 v5, s1 ; encoding: 
[0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, s105 +// GFX1250: v_tanh_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, vcc_lo +// GFX1250: v_tanh_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, vcc_hi +// GFX1250: v_tanh_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, ttmp15 +// GFX1250: v_tanh_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, m0 +// GFX1250: v_tanh_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, exec_lo +// GFX1250: v_tanh_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, exec_hi +// GFX1250: v_tanh_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, null +// GFX1250: v_tanh_f16_e64 v5, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, -1 +// GFX1250: v_tanh_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_tanh_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08] + +v_tanh_f16_e64 v5, src_scc mul:4 +// GFX1250: v_tanh_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10] + +v_tanh_f16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_tanh_f16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + v_rcp_bf16_e64 v5, v1 // GFX1250: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 579a467b41052..b59b8b31e2d5f 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -175,6 +175,54 @@ v_tanh_f32_e64 v5, src_scc mul:4 
v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 // GFX1250: v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] +v_tanh_f16_e64 v5, v1 +// GFX1250: v_tanh_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00] + +v_tanh_f16_e64 v5, v255 +// GFX1250: v_tanh_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00] + +v_tanh_f16_e64 v5, s1 +// GFX1250: v_tanh_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, s105 +// GFX1250: v_tanh_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, vcc_lo +// GFX1250: v_tanh_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, vcc_hi +// GFX1250: v_tanh_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, ttmp15 +// GFX1250: v_tanh_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, m0 +// GFX1250: v_tanh_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, exec_lo +// GFX1250: v_tanh_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, exec_hi +// GFX1250: v_tanh_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, null +// GFX1250: v_tanh_f16_e64 v5, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, -1 +// GFX1250: v_tanh_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00] + +v_tanh_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_tanh_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08] + +v_tanh_f16_e64 v5, src_scc mul:4 +// GFX1250: v_tanh_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10] + +v_tanh_f16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_tanh_f16_e64 v255, -|0x8000| clamp div:2 ; encoding: 
[0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +v_tanh_f16 v5.l, v128.h +// GFX1250: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00] + v_rcp_bf16_e64 v5, v1 // GFX1250: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index 423340cc90b30..f7f20f46161ce 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -58,6 +58,62 @@ v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask // GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_mirror +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: 
instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index 7968b39839a78..e1241b01ccae1 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -58,6 +58,66 @@ v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask // GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_tanh_f16_e64_dpp v5, v1 
quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_mirror +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf 
; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX1250: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_tanh_f16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index dd469c2eef850..0106175301d20 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -18,6 +18,22 @@ v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 9fce77916b66e..93b86f3ffb841 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -18,6 +18,26 @@ v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_tanh_f16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index 0a6fc391e63a5..5f37ba91e071b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -74,6 +74,69 @@ 0x6a,0x3c,0x0a,0x7e # GFX1250: v_tanh_f32_e32 v5, vcc_lo ; encoding: [0x6a,0x3c,0x0a,0x7e] +0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e32 v127.l, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e32 v127, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00] + +0xc1,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e] + +0xf0,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e] + +0x7f,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, exec_hi ; encoding: 
[0x7f,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e] + +0x7e,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e] + +0x7d,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e] + +0x7c,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, null ; encoding: [0x7c,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, null ; encoding: [0x7c,0x3e,0x0a,0x7e] + +0x01,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, s1 ; encoding: [0x01,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, s1 ; encoding: [0x01,0x3e,0x0a,0x7e] + +0x69,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, s105 ; encoding: [0x69,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, s105 ; encoding: [0x69,0x3e,0x0a,0x7e] + +0xfd,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e] + +0x7b,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e] + +0x01,0x3f,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, v1.l ; encoding: [0x01,0x3f,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, v1 ; encoding: [0x01,0x3f,0x0a,0x7e] + +0x7f,0x3f,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, v127.l ; encoding: [0x7f,0x3f,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, v127 ; encoding: [0x7f,0x3f,0x0a,0x7e] + +0x6b,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e] + +0x6a,0x3e,0x0a,0x7e +# GFX1250-REAL16: v_tanh_f16_e32 v5.l, vcc_lo ; encoding: 
[0x6a,0x3e,0x0a,0x7e] +# GFX1250-FAKE16: v_tanh_f16_e32 v5, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e] + +0x81,0x3f,0x0a,0x7f +# GFX1250-REAL16: v_tanh_f16_e32 v5.h, v1.h ; encoding: [0x81,0x3f,0x0a,0x7f] + 0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt index f099ffcba36e4..57bee2766ce44 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -44,6 +44,65 @@ 0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13 # GFX1250: v_tanh_f32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3c,0x0a,0x7e,0x01,0x60,0x09,0x13] +0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30 +# GFX1250-REAL16: v_tanh_f16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30] +# GFX1250-FAKE16: v_tanh_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30] + +0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff] + 
+0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13] + +0xfa,0x3e,0x0a,0x7f,0x81,0x1b,0x00,0xff +# GFX1250-REAL16: v_tanh_f16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7f,0x81,0x1b,0x00,0xff] + 0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30 # GFX1250-REAL16: v_tanh_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30] # GFX1250-FAKE16: v_tanh_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index d86d4630c48ea..28ec6b11b4de3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -11,6 +11,22 @@ 0xea,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05 # GFX1250: v_tanh_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3c,0x0a,0x7e,0x01,0x77,0x39,0x05] +0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x3e,0xfe,0x7e,0x7f,0x00,0x00,0x00] + +0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x3e,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xe9,0x3e,0x0a,0x7f,0x81,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x3e,0x0a,0x7f,0x81,0x77,0x39,0x05] +# GFX1250-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] + 0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 4dc7ed4237f53..5004762729701 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -47,6 +47,70 @@ 0x05,0x00,0x9e,0xd5,0x6a,0x00,0x00,0x00 # GFX1250: v_tanh_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9e,0xd5,0x6a,0x00,0x00,0x00] +0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x9f,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x9f,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, m0 ; encoding: 
[0x05,0x00,0x9f,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, null ; encoding: [0x05,0x00,0x9f,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x9f,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x9f,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x9f,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x9f,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x9f,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x9f,0xd5,0x6b,0x00,0x00,0x00] + 
+0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x9f,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64 v5, v128 ; encoding: [0x05,0x00,0x9f,0xd5,0x80,0x01,0x00,0x00] + 0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 # GFX1250-REAL16: v_tanh_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index 1f03a43cd8bd4..de908b95d94f9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s 0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 # GFX1250: v_tanh_f32_e64_dpp v255, -|v255| 
clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9e,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] @@ -44,6 +44,66 @@ 0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff # GFX1250: v_tanh_f32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9e,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0x9f,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + 
+0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: 
v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + 0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index e673f9fdfc7bb..cfe7173c383b3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -14,6 +14,26 @@ 0x05,0x00,0x9e,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 # GFX1250: v_tanh_f32_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9e,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9f,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0x9f,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + 0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] From fb81a0dd9ebe42702190a56db5c9dae7a3dbaec7 Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Fri, 18 Jul 2025 10:06:12 +0530 Subject: [PATCH 284/813] [LoopInterchange][NFCI] Split reductions-non-wrapped-operations.ll (#149449) This test has grown too big. Having one for float for int would be more manageable. 
--- .../LoopInterchange/fp-reductions.ll | 437 ++++++++++++++++++ .../reductions-non-wrapped-operations.ll | 434 ----------------- 2 files changed, 437 insertions(+), 434 deletions(-) create mode 100644 llvm/test/Transforms/LoopInterchange/fp-reductions.ll diff --git a/llvm/test/Transforms/LoopInterchange/fp-reductions.ll b/llvm/test/Transforms/LoopInterchange/fp-reductions.ll new file mode 100644 index 0000000000000..0703a7b27979a --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/fp-reductions.ll @@ -0,0 +1,437 @@ +; RUN: opt < %s -passes=loop-interchange -cache-line-size=64 -pass-remarks-output=%t -disable-output \ +; RUN: -verify-dom-info -verify-loop-info -verify-loop-lcssa +; RUN: FileCheck -input-file=%t %s + +; Check that the loops aren't exchanged if there is a reduction of +; non-reassociative floating-point addition. +; +; float sum = 0; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; sum += A[j][i]; + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: UnsupportedPHIOuter +; CHECK-NEXT: Function: reduction_fadd +define void @reduction_fadd(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %sum.j.next = fadd float %sum.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %sum.i.lcssa = phi float [ %sum.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that the interchange is legal if the floating-point addition is 
marked +; as reassoc. +; +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_reassoc_fadd +define void @reduction_reassoc_fadd(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %sum.j.next = fadd reassoc float %sum.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %sum.i.lcssa = phi float [ %sum.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; FIXME: Is it really legal to interchange the loops when +; both reassoc and ninf are set? +; Check that the interchange is legal if the floating-point addition is marked +; as reassoc. 
+; +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_reassoc_ninf_fadd +define void @reduction_reassoc_ninf_fadd(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %sum.j.next = fadd reassoc ninf float %sum.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %sum.i.lcssa = phi float [ %sum.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that the loops aren't exchanged if there is a reduction of +; non-reassociative floating-point multiplication. 
+; +; float prod = 1; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; prod *= A[j][i]; + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: UnsupportedPHIOuter +; CHECK-NEXT: Function: reduction_fmul +define void @reduction_fmul(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %prod.j.next = fmul float %prod.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %prod.i.lcssa = phi float [ %prod.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that the interchange is legal if the floating-point multiplication is +; marked as reassoc. 
+; +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_reassoc_fmul +define void @reduction_reassoc_fmul(ptr %A) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %prod.j.next = fmul reassoc float %prod.j, %a + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %prod.i.lcssa = phi float [ %prod.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that the loops aren't exchanged if there is a reduction of +; non-reassociative floating-point fmuladd. 
+; +; float fmuladd = 0; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; fmuladd += A[j][i] * B[j][i]; + +; CHECK: --- !Missed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: UnsupportedPHIOuter +; CHECK-NEXT: Function: reduction_fmuladd +define void @reduction_fmuladd(ptr %A, ptr %B) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ] + %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx.a, align 4 + %b = load float, ptr %idx.b, align 4 + %fmuladd.j.next = call float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j) + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that the interchange is legal if the floating-point fmuladd is marked +; as reassoc. 
+; +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_reassoc_fmuladd +define void @reduction_reassoc_fmuladd(ptr %A, ptr %B) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ] + %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx.a, align 4 + %b = load float, ptr %idx.b, align 4 + %fmuladd.j.next = call reassoc float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j) + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that interchanging the loops is legal for the reassociative +; floating-point minimum. +; +; float fmin = init; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; fmin = (A[j][i] < fmin) ? 
A[j][i] : fmin; + +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_fmin +define void @reduction_fmin(ptr %A, float %init) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %cmp = fcmp nnan nsz olt float %a, %fmin.j + %fmin.j.next = select nnan nsz i1 %cmp, float %a, float %fmin.j + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + + +; Check that interchanging the loops is legal for the floating-point +; llvm.minimumnum. 
+; +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_fmininumnum +define void @reduction_fmininumnum(ptr %A, float %init) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %fmin.j.next = call float @llvm.minimumnum.f32(float %a, float %fmin.j) + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that interchanging the loops is legal for the reassociative +; floating-point maximum. +; +; float fmax = init; +; for (int i = 0; i < 2; i++) +; for (int j = 0; j < 2; j++) +; fmax = (A[j][i] > fmax) ? 
A[j][i] : fmax; + +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_fmax +define void @reduction_fmax(ptr %A, float %init) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %cmp = fcmp nnan nsz ogt float %a, %fmax.j + %fmax.j.next = select nnan nsz i1 %cmp, float %a, float %fmax.j + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +; Check that interchanging the loops is legal for the floating-point +; llvm.maximumnum. 
+ +; CHECK: --- !Pass +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: reduction_fmaxinumnum +define void @reduction_fmaxinumnum(ptr %A, float %init) { +entry: + br label %for.i.header + +for.i.header: + %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] + %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ] + br label %for.j + +for.j: + %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] + %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ] + %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i + %a = load float, ptr %idx, align 4 + %fmax.j.next = call float @llvm.maximumnum.f32(float %a, float %fmax.j) + %j.inc = add i32 %j, 1 + %cmp.j = icmp slt i32 %j.inc, 2 + br i1 %cmp.j, label %for.j, label %for.i.latch + +for.i.latch: + %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ] + %i.inc = add i32 %i, 1 + %cmp.i = icmp slt i32 %i.inc, 2 + br i1 %cmp.i, label %for.i.header, label %exit + +exit: + ret void +} + +declare float @llvm.fmuladd.f32(float %a, float %b, float %c) +declare float @llvm.minimumnum.f32(float %a, float %b) +declare float @llvm.maximumnum.f32(float %a, float %b) \ No newline at end of file diff --git a/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll index 0eb6fe98b8bb7..f5c6ad7889366 100644 --- a/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll +++ b/llvm/test/Transforms/LoopInterchange/reductions-non-wrapped-operations.ll @@ -333,437 +333,3 @@ for.i.latch: exit: ret void } - -; Check that the loops aren't exchanged if there is a reduction of -; non-reassociative floating-point addition. 
-; -; float sum = 0; -; for (int i = 0; i < 2; i++) -; for (int j = 0; j < 2; j++) -; sum += A[j][i]; - -; CHECK: --- !Missed -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: UnsupportedPHIOuter -; CHECK-NEXT: Function: reduction_fadd -define void @reduction_fadd(ptr %A) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] - %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx, align 4 - %sum.j.next = fadd float %sum.j, %a - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %sum.i.lcssa = phi float [ %sum.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - -; Check that the interchange is legal if the floating-point addition is marked -; as reassoc. 
-; -; CHECK: --- !Pass -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Interchanged -; CHECK-NEXT: Function: reduction_reassoc_fadd -define void @reduction_reassoc_fadd(ptr %A) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] - %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx, align 4 - %sum.j.next = fadd reassoc float %sum.j, %a - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %sum.i.lcssa = phi float [ %sum.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - -; FIXME: Is it really legal to interchange the loops when -; both reassoc and ninf are set? -; Check that the interchange is legal if the floating-point addition is marked -; as reassoc. 
-; -; CHECK: --- !Pass -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Interchanged -; CHECK-NEXT: Function: reduction_reassoc_ninf_fadd -define void @reduction_reassoc_ninf_fadd(ptr %A) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %sum.i = phi float [ 0.0, %entry ], [ %sum.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %sum.j = phi float [ %sum.i, %for.i.header ], [ %sum.j.next, %for.j ] - %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx, align 4 - %sum.j.next = fadd reassoc ninf float %sum.j, %a - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %sum.i.lcssa = phi float [ %sum.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - -; Check that the loops aren't exchanged if there is a reduction of -; non-reassociative floating-point multiplication. 
-; -; float prod = 1; -; for (int i = 0; i < 2; i++) -; for (int j = 0; j < 2; j++) -; prod *= A[j][i]; - -; CHECK: --- !Missed -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: UnsupportedPHIOuter -; CHECK-NEXT: Function: reduction_fmul -define void @reduction_fmul(ptr %A) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ] - %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx, align 4 - %prod.j.next = fmul float %prod.j, %a - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %prod.i.lcssa = phi float [ %prod.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - -; Check that the interchange is legal if the floating-point multiplication is -; marked as reassoc. 
-; -; CHECK: --- !Pass -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Interchanged -; CHECK-NEXT: Function: reduction_reassoc_fmul -define void @reduction_reassoc_fmul(ptr %A) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %prod.i = phi float [ 1.0, %entry ], [ %prod.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %prod.j = phi float [ %prod.i, %for.i.header ], [ %prod.j.next, %for.j ] - %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx, align 4 - %prod.j.next = fmul reassoc float %prod.j, %a - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %prod.i.lcssa = phi float [ %prod.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - -; Check that the loops aren't exchanged if there is a reduction of -; non-reassociative floating-point fmuladd. 
-; -; float fmuladd = 0; -; for (int i = 0; i < 2; i++) -; for (int j = 0; j < 2; j++) -; fmuladd += A[j][i] * B[j][i]; - -; CHECK: --- !Missed -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: UnsupportedPHIOuter -; CHECK-NEXT: Function: reduction_fmuladd -define void @reduction_fmuladd(ptr %A, ptr %B) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ] - %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx.a, align 4 - %b = load float, ptr %idx.b, align 4 - %fmuladd.j.next = call float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j) - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - -; Check that the interchange is legal if the floating-point fmuladd is marked -; as reassoc. 
-; -; CHECK: --- !Pass -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Interchanged -; CHECK-NEXT: Function: reduction_reassoc_fmuladd -define void @reduction_reassoc_fmuladd(ptr %A, ptr %B) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %fmuladd.i = phi float [ 1.0, %entry ], [ %fmuladd.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %fmuladd.j = phi float [ %fmuladd.i, %for.i.header ], [ %fmuladd.j.next, %for.j ] - %idx.a = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %idx.b = getelementptr inbounds [2 x [2 x i32]], ptr %B, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx.a, align 4 - %b = load float, ptr %idx.b, align 4 - %fmuladd.j.next = call reassoc float @llvm.fmuladd.f32(float %a, float %b, float %fmuladd.j) - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %fmuladd.i.lcssa = phi float [ %fmuladd.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - -; Check that interchanging the loops is legal for the reassociative -; floating-point minimum. -; -; float fmin = init; -; for (int i = 0; i < 2; i++) -; for (int j = 0; j < 2; j++) -; fmin = (A[j][i] < fmin) ? 
A[j][i] : fmin; - -; CHECK: --- !Pass -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Interchanged -; CHECK-NEXT: Function: reduction_fmin -define void @reduction_fmin(ptr %A, float %init) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ] - %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx, align 4 - %cmp = fcmp nnan nsz olt float %a, %fmin.j - %fmin.j.next = select nnan nsz i1 %cmp, float %a, float %fmin.j - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - - -; Check that interchanging the loops is legal for the floating-point -; llvm.minimumnum. 
-; -; CHECK: --- !Pass -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Interchanged -; CHECK-NEXT: Function: reduction_fmininumnum -define void @reduction_fmininumnum(ptr %A, float %init) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %fmin.i = phi float [ %init, %entry ], [ %fmin.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %fmin.j = phi float [ %fmin.i, %for.i.header ], [ %fmin.j.next, %for.j ] - %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx, align 4 - %fmin.j.next = call float @llvm.minimumnum.f32(float %a, float %fmin.j) - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %fmin.i.lcssa = phi float [ %fmin.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - -; Check that interchanging the loops is legal for the reassociative -; floating-point maximum. -; -; float fmax = init; -; for (int i = 0; i < 2; i++) -; for (int j = 0; j < 2; j++) -; fmax = (A[j][i] > fmax) ? 
A[j][i] : fmax; - -; CHECK: --- !Pass -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Interchanged -; CHECK-NEXT: Function: reduction_fmax -define void @reduction_fmax(ptr %A, float %init) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ] - %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx, align 4 - %cmp = fcmp nnan nsz ogt float %a, %fmax.j - %fmax.j.next = select nnan nsz i1 %cmp, float %a, float %fmax.j - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - -; Check that interchanging the loops is legal for the floating-point -; llvm.maximumnum. 
- -; CHECK: --- !Pass -; CHECK-NEXT: Pass: loop-interchange -; CHECK-NEXT: Name: Interchanged -; CHECK-NEXT: Function: reduction_fmaxinumnum -define void @reduction_fmaxinumnum(ptr %A, float %init) { -entry: - br label %for.i.header - -for.i.header: - %i = phi i32 [ 0, %entry ], [ %i.inc, %for.i.latch ] - %fmax.i = phi float [ %init, %entry ], [ %fmax.i.lcssa, %for.i.latch ] - br label %for.j - -for.j: - %j = phi i32 [ 0, %for.i.header ], [ %j.inc, %for.j ] - %fmax.j = phi float [ %fmax.i, %for.i.header ], [ %fmax.j.next, %for.j ] - %idx = getelementptr inbounds [2 x [2 x i32]], ptr %A, i32 0, i32 %j, i32 %i - %a = load float, ptr %idx, align 4 - %fmax.j.next = call float @llvm.maximumnum.f32(float %a, float %fmax.j) - %j.inc = add i32 %j, 1 - %cmp.j = icmp slt i32 %j.inc, 2 - br i1 %cmp.j, label %for.j, label %for.i.latch - -for.i.latch: - %fmax.i.lcssa = phi float [ %fmax.j.next, %for.j ] - %i.inc = add i32 %i, 1 - %cmp.i = icmp slt i32 %i.inc, 2 - br i1 %cmp.i, label %for.i.header, label %exit - -exit: - ret void -} - -declare float @llvm.fmuladd.f32(float %a, float %b, float %c) -declare float @llvm.minimumnum.f32(float %a, float %b) -declare float @llvm.maximumnum.f32(float %a, float %b) From 3eb07996b1d6874e4c288a49712d2a5ada57cd2d Mon Sep 17 00:00:00 2001 From: Madhur Amilkanthwar Date: Fri, 18 Jul 2025 10:07:47 +0530 Subject: [PATCH 285/813] [GVN][NFCI] Use early return in phiTranslateImpl() (#149273) --- llvm/lib/Transforms/Scalar/GVN.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 8bff458f88bb9..f6bf09d09433d 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -2367,11 +2367,14 @@ uint32_t GVNPass::ValueTable::phiTranslateImpl(const BasicBlock *Pred, // See if we can refine the value number by looking at the PN incoming value // for the given predecessor. 
if (PHINode *PN = NumberingPhi[Num]) { - if (PN->getParent() == PhiBlock) - for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) - if (PN->getIncomingBlock(I) == Pred) - if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false)) - return TransVal; + if (PN->getParent() != PhiBlock) + return Num; + for (unsigned I = 0; I != PN->getNumIncomingValues(); ++I) { + if (PN->getIncomingBlock(I) != Pred) + continue; + if (uint32_t TransVal = lookup(PN->getIncomingValue(I), false)) + return TransVal; + } return Num; } From 4e6157f7844cc801bc84ac06f53052e8c6d6c478 Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Fri, 18 Jul 2025 07:41:39 +0300 Subject: [PATCH 286/813] [clang-tidy][NFC] Add mention of running 'clang-tidy' on changes in Contributing.rst (#148547) Follow up to https://github.com/llvm/llvm-project/pull/147793. _Originally suggested by carlosgalvezp in https://github.com/llvm/llvm-project/pull/147793#issuecomment-3059021433_ --- .../docs/clang-tidy/Contributing.rst | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/clang-tools-extra/docs/clang-tidy/Contributing.rst b/clang-tools-extra/docs/clang-tidy/Contributing.rst index 9611c655886f2..66c0abadc6a40 100644 --- a/clang-tools-extra/docs/clang-tidy/Contributing.rst +++ b/clang-tools-extra/docs/clang-tidy/Contributing.rst @@ -657,6 +657,29 @@ directory. The path to this directory is available in a lit test with the varia .. _FileCheck: https://llvm.org/docs/CommandGuide/FileCheck.html .. _test/clang-tidy/checkers/google/readability-casting.cpp: https://github.com/llvm/llvm-project/blob/main/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp + +Submitting a Pull Request +------------------------- + +Before submitting a pull request, contributors are encouraged to run +:program:`clang-tidy` and :program:`clang-format` on their changes to ensure +code quality and catch potential issues. 
While :program:`clang-tidy` is not +currently enforced in CI, following this practice helps maintain code +consistency and prevent common errors. + +Here's useful command to check your staged changes: + +.. code-block:: console + + $ git diff --staged -U0 | ./clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py \ + -j $(nproc) -path build/ -p1 -only-check-in-db + $ git clang-format + +Note that some warnings may be false positives or require careful consideration +before fixing. Use your judgment and feel free to discuss in the pull request +if you're unsure about a particular warning. + + Out-of-tree check plugins ------------------------- From a8f5e9ed6b44562938ce07e2790be90be8f0a6b5 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 17 Jul 2025 21:43:43 -0700 Subject: [PATCH 287/813] [clang-format] Fix a regression of annotating PointerOrReference (#149039) Fixes #149010 --- clang/lib/Format/TokenAnnotator.cpp | 10 +++++++--- clang/unittests/Format/TokenAnnotatorTest.cpp | 4 ++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 739209a5681f8..581bfbab0972d 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2996,14 +2996,18 @@ class AnnotatingParser { const FormatToken *PrevToken = Tok.getPreviousNonComment(); if (!PrevToken) return TT_UnaryOperator; - if (PrevToken->is(TT_TypeName)) + if (PrevToken->isTypeName(LangOpts)) return TT_PointerOrReference; if (PrevToken->isPlacementOperator() && Tok.is(tok::ampamp)) return TT_BinaryOperator; - const FormatToken *NextToken = Tok.getNextNonComment(); + auto *NextToken = Tok.getNextNonComment(); if (!NextToken) return TT_PointerOrReference; + if (NextToken->is(tok::greater)) { + NextToken->setFinalizedType(TT_TemplateCloser); + return TT_PointerOrReference; + } if (InTemplateArgument && NextToken->is(tok::kw_noexcept)) return TT_BinaryOperator; @@ -3112,7 +3116,7 @@ class 
AnnotatingParser { // It's more likely that & represents operator& than an uninitialized // reference. - if (Tok.is(tok::amp) && PrevToken && PrevToken->Tok.isAnyIdentifier() && + if (Tok.is(tok::amp) && PrevToken->Tok.isAnyIdentifier() && IsChainedOperatorAmpOrMember(PrevToken->getPreviousNonComment()) && NextToken && NextToken->Tok.isAnyIdentifier()) { if (auto NextNext = NextToken->getNextNonComment(); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index e281a4945a862..ce7787ede0f5c 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -390,6 +390,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_CompoundRequirementLBrace); EXPECT_TOKEN(Tokens[22], tok::star, TT_BinaryOperator); + Tokens = annotate("bool foo = requires { static_cast(1); };"); + ASSERT_EQ(Tokens.size(), 17u) << Tokens; + EXPECT_TOKEN(Tokens[8], tok::ampamp, TT_PointerOrReference); + Tokens = annotate("return s.operator int *();"); ASSERT_EQ(Tokens.size(), 10u) << Tokens; // Not TT_FunctionDeclarationName. 
From 03fe1a493d55cfc27a32ee3064639b86cb54d16a Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Fri, 18 Jul 2025 07:08:50 +0200 Subject: [PATCH 288/813] [AMDGPU] Fix sgpr to vreg_1 copy analysis (#149181) --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 19 ++-- .../test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll | 100 ++++++++++++++++++ .../CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir | 31 ++++++ 3 files changed, 143 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 44d9ef5a0792e..f018f77bc83e1 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -947,13 +947,18 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) { // Copies and REG_SEQUENCE do not contribute to the final assembly // So, skip them but take care of the SGPR to VGPR copies bookkeeping. 
- if (Inst->isCopy() || Inst->isRegSequence()) { - if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { - if (!Inst->isCopy() || - !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { - Info.NumSVCopies++; - continue; - } + if (Inst->isRegSequence() && + TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) { + Info.NumSVCopies++; + continue; + } + if (Inst->isCopy()) { + const TargetRegisterClass *SrcRC, *DstRC; + std::tie(SrcRC, DstRC) = getCopyRegClasses(*Inst, *TRI, *MRI); + if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI) && + !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) { + Info.NumSVCopies++; + continue; } } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll new file mode 100644 index 0000000000000..192bd2073886a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN %s + +define amdgpu_kernel void @copy_to_vreg_1(i32 %0) { +; GCN-LABEL: copy_to_vreg_1: +; GCN: ; %bb.0: ; %._crit_edge +; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_mov_b64_e32 v[2:3], 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_sub_i32 s5, 1, s4 +; GCN-NEXT: s_cmp_lt_u32 s4, 2 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_and_b64 s[2:3], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s3, s5, 1 +; GCN-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-NEXT: s_addc_u32 s0, 1, 0 +; GCN-NEXT: v_readfirstlane_b32 s2, v1 +; GCN-NEXT: s_cmp_ge_u32 s3, s4 +; GCN-NEXT: s_cselect_b32 s4, s0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_cmp_lg_u64 0, 0 +; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_branch .LBB0_3 +; GCN-NEXT: .LBB0_1: ; %Flow +; GCN-NEXT: ; in 
Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[6:7] +; GCN-NEXT: s_xor_b64 s[8:9], exec, -1 +; GCN-NEXT: .LBB0_2: ; %Flow3 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_and_b64 s[4:5], exec, s[8:9] +; GCN-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; GCN-NEXT: s_mov_b32 s4, 0 +; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB0_8 +; GCN-NEXT: .LBB0_3: ; %.lr.ph27 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_or_b64 s[8:9], vcc, s[4:5] +; GCN-NEXT: s_xor_b64 s[6:7], s[8:9], -1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[8:9] +; GCN-NEXT: s_cbranch_execz .LBB0_5 +; GCN-NEXT: ; %bb.4: ; %pred.store.if +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec +; GCN-NEXT: global_store_byte v[2:3], v1, off +; GCN-NEXT: .LBB0_5: ; %Flow2 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN-NEXT: s_mov_b64 s[8:9], -1 +; GCN-NEXT: s_and_saveexec_b64 s[4:5], s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB0_2 +; GCN-NEXT: ; %bb.6: ; %pred.store.continue +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[2:3] +; GCN-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB0_1 +; GCN-NEXT: ; %bb.7: ; %pred.store.if41 +; GCN-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; GCN-NEXT: global_store_byte v[2:3], v1, off +; GCN-NEXT: s_branch .LBB0_1 +; GCN-NEXT: .LBB0_8: ; %DummyReturnBlock +; GCN-NEXT: s_endpgm +._crit_edge: + %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() + %div = udiv i32 1, %0 + br label %.lr.ph27 + +.lr.ph27: ; preds = %pred.store.if41, %pred.store.continue, %._crit_edge + %iv = phi i32 [ %div, %._crit_edge ], [ 0, %pred.store.if41 ], [ 0, %pred.store.continue ] + %cmp = icmp ugt i32 %iv, 0 + %broadcast.splatinsert37 = insertelement <4 x i1> zeroinitializer, i1 %cmp, i64 0 + 
%.zext = zext i32 %id.x to i64 + %broadcast.splatinsert39 = insertelement <4 x i64> zeroinitializer, i64 %.zext, i64 0 + %cmp.1 = icmp uge <4 x i64> %broadcast.splatinsert39, splat (i64 1) + %or = or <4 x i1> %cmp.1, %broadcast.splatinsert37 + %extract = extractelement <4 x i1> %or, i64 0 + br i1 %extract, label %pred.store.if, label %pred.store.continue + +pred.store.if: ; preds = %.lr.ph27 + store i8 0, ptr addrspace(1) null, align 64 + br label %pred.store.continue + +pred.store.continue: ; preds = %pred.store.if, %.lr.ph27 + %extract.1 = extractelement <4 x i1> %or, i64 1 + br i1 %extract.1, label %pred.store.if41, label %.lr.ph27 + +pred.store.if41: ; preds = %pred.store.continue + store i8 0, ptr addrspace(1) null, align 64 + br label %.lr.ph27 +} + +declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir new file mode 100644 index 0000000000000..2daea2b2eeb74 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-to-vreg1-copy.mir @@ -0,0 +1,31 @@ +# RUN: llc -mtriple=amdgcn -run-pass si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: copy_to_vreg_1 +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: copy_to_vreg_1 + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.1(0x80000000) + ; GCN-NEXT: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: [[V_CVT_U32_F32_e64:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[IMPLICIT_DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GCN-NEXT: [[V_CMP_GT_U32_e64:%[0-9]+]]:sreg_64_xexec = samesign V_CMP_GT_U32_e64 [[V_CVT_U32_F32_e64]], killed [[COPY1]], implicit $exec + ; GCN-NEXT: [[VREG1:%[0-9]+]]:vreg_1 = COPY [[V_CMP_GT_U32_e64]] + ; GCN-NEXT: {{ 
$}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: S_ENDPGM 0 + bb.0: + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = nofpexcept V_CVT_U32_F32_e64 0, killed $vgpr0, 0, 0, implicit $mode, implicit $exec + %1:sreg_32 = COPY %0:vgpr_32 + %2:sreg_32 = COPY $vgpr1 + samesign S_CMP_GT_U32 %1:sreg_32, killed %2:sreg_32, implicit-def $scc + %3:sreg_64 = COPY $scc + %4:vreg_1 = COPY %3:sreg_64 + + bb.1: + S_ENDPGM 0 +... From de453e86977adf4f418b003b5c25931b8365c9cc Mon Sep 17 00:00:00 2001 From: Wanyi Date: Fri, 18 Jul 2025 01:14:28 -0400 Subject: [PATCH 289/813] [lldb-dap] Fix type req->arguments == "disconnect" (#149446) This typo was introduced in PR #140331. This branch will never get executed. We also set the `disconnecting = true` in the `DAP::Disconnect()` so I am not sure if we need it in both places. --- lldb/tools/lldb-dap/DAP.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index fd89f52595ec6..cbd3b14463e25 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -983,7 +983,7 @@ llvm::Error DAP::Loop() { if (const protocol::Request *req = std::get_if(&*next); - req && req->arguments == "disconnect") + req && req->command == "disconnect") disconnecting = true; const std::optional cancel_args = From f761d73265119eeb3b1ab64543e6d3012078ad13 Mon Sep 17 00:00:00 2001 From: Shoreshen <372660931@qq.com> Date: Fri, 18 Jul 2025 13:29:33 +0800 Subject: [PATCH 290/813] [AMDGPU] Add freeze for LowerSELECT (#148796) Trying to solve https://github.com/llvm/llvm-project/issues/147635 Add freeze for legalizer when breaking i64 select to 2 i32 select. Several tests changed, still need to investigate why. 
--------- Co-authored-by: Shilei Tian --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 20 +- llvm/test/CodeGen/AMDGPU/div_i128.ll | 18 +- llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 43 +- llvm/test/CodeGen/AMDGPU/fminimum3.ll | 43 +- llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 21 +- llvm/test/CodeGen/AMDGPU/fract-match.ll | 57 +-- llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 57 ++- llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll | 3 +- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 378 ++++++++++-------- llvm/test/CodeGen/AMDGPU/lround.ll | 51 ++- llvm/test/CodeGen/AMDGPU/rem_i128.ll | 18 +- llvm/test/CodeGen/AMDGPU/roundeven.ll | 37 +- llvm/test/CodeGen/AMDGPU/select-undef.ll | 20 + llvm/test/CodeGen/AMDGPU/srem.ll | 4 +- llvm/test/CodeGen/AMDGPU/srem64.ll | 88 ++-- .../test/CodeGen/AMDGPU/vector-reduce-smax.ll | 347 ++++++++-------- .../test/CodeGen/AMDGPU/vector-reduce-smin.ll | 347 ++++++++-------- .../test/CodeGen/AMDGPU/vector-reduce-umax.ll | 347 ++++++++-------- .../test/CodeGen/AMDGPU/vector-reduce-umin.ll | 347 ++++++++-------- 20 files changed, 1194 insertions(+), 1054 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 27212fda7638c..400795c29b0e4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11131,7 +11131,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { assert(VT.getSizeInBits() == 64); SDLoc DL(Op); - SDValue Cond = Op.getOperand(0); + SDValue Cond = DAG.getFreeze(Op.getOperand(0)); SDValue Zero = DAG.getConstant(0, DL, MVT::i32); SDValue One = DAG.getConstant(1, DL, MVT::i32); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index b7097a9557b75..c7385e4324e2c 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ 
-7791,7 +7791,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: sdiv_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -7927,7 +7927,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: sdiv_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 @@ -8982,7 +8982,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX6-LABEL: srem_i64_pow2_shl_denom: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_load_dword s0, s[4:5], 0xd +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s6, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -9116,7 +9116,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x ; ; GFX9-LABEL: srem_i64_pow2_shl_denom: ; GFX9: ; %bb.0: -; GFX9-NEXT: s_load_dword s0, s[4:5], 0x34 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl_b64 s[0:1], 0x1000, s0 @@ -10096,9 +10096,15 @@ define i64 @udiv_i64_9divbits(i8 %size) { } define <2 x i64> @srem_zero_zero() { -; GCN-LABEL: kernel: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_endpgm +; GFX6-LABEL: srem_zero_zero: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: srem_zero_zero: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] entry: %B = srem <2 x i64> zeroinitializer, zeroinitializer ret 
<2 x i64> %B diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index f8e13fcdd2273..4cb0d2d7b3789 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -521,16 +521,19 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 @@ -2710,16 +2713,19 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, 
v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 069a47ec97bfe..e5fe4160a4b05 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -3272,9 +3272,10 @@ define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3306,9 +3307,10 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], |v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3343,11 +3345,12 @@ define double 
@v_fmaximum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3374,14 +3377,17 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, |v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3446,14 +3452,17 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fneg_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_max_f64 v[6:7], -|v[0:1]|, 
-|v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -|v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index d8746b58b16b7..6873c617c64a1 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -3272,9 +3272,10 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, v[2:3] +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3306,9 +3307,10 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], |v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 
v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3343,11 +3345,12 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3374,14 +3377,17 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, |v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, 
vcc @@ -3446,14 +3452,17 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fneg_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], -v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] +; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -|v[4:5]| -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index e9fd6119d0c36..193cee967f3c4 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -223,8 +223,9 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; SI-NEXT: v_bfi_b32 v1, s8, v1, v6 ; SI-NEXT: v_mov_b32_e32 v7, s2 ; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1] +; SI-NEXT: s_bitset0_b32 s3, 31 ; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[2:3] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -284,14 +285,16 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; SI-NEXT: v_mov_b32_e32 v9, s5 ; SI-NEXT: v_mov_b32_e32 v10, s4 ; 
SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1] +; SI-NEXT: s_bitset0_b32 s7, 31 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1] ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1] +; SI-NEXT: s_bitset0_b32 s5, 31 ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -365,26 +368,30 @@ define amdgpu_kernel void @nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; SI-NEXT: v_mov_b32_e32 v14, s5 ; SI-NEXT: v_mov_b32_e32 v15, s4 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5] +; SI-NEXT: s_bitset0_b32 s3, 31 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v7 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc ; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[4:5] +; SI-NEXT: s_bitset0_b32 s1, 31 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v12 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[8:9] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5] +; SI-NEXT: s_bitset0_b32 s7, 31 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v14 -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc ; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5] +; SI-NEXT: 
s_bitset0_b32 s5, 31 ; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5] -; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9] +; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index d97ea042b50fc..f50944cc8a5b1 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -2356,10 +2356,11 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX6-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX6-NEXT: v_min_f64 v[6:7], v[6:7], s[8:9] ; GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: s_mov_b32 s9, 0x7ff00000 ; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc +; GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX6-NEXT: s_mov_b32 s9, 0x7ff00000 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc -; GFX6-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[8:9] +; GFX6-NEXT: v_cmp_neq_f64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 @@ -2374,17 +2375,18 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, 0 +; GFX7-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX7-NEXT: v_fract_f64_e32 v[6:7], v[0:1] +; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX7-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX7-NEXT: v_fract_f64_e32 v[4:5], v[0:1] -; GFX7-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX7-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX7-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v5, 
vcc -; GFX7-NEXT: buffer_store_dwordx2 v[6:7], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64 +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2392,25 +2394,27 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, 0 +; GFX8-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX8-NEXT: v_fract_f64_e32 v[6:7], v[0:1] +; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX8-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX8-NEXT: v_fract_f64_e32 v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX8-NEXT: v_floor_f64_e32 v[6:7], v[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc -; GFX8-NEXT: global_store_dwordx2 v[2:3], v[6:7], off +; GFX8-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX8-NEXT: global_store_dwordx2 v[2:3], v[4:5], off +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: safe_math_fract_f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fract_f64_e32 v[4:5], v[0:1] -; GFX11-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX11-NEXT: v_floor_f64_e32 v[6:7], v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 -; GFX11-NEXT: global_store_b64 v[2:3], v[6:7], off +; GFX11-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX11-NEXT: v_fract_f64_e32 v[6:7], v[0:1] +; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] +; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off +; 
GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v6 :: v_dual_cndmask_b32 v1, 0, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: safe_math_fract_f64: @@ -2420,13 +2424,14 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_fract_f64_e32 v[4:5], v[0:1] -; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| -; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: v_floor_f64_e32 v[4:5], v[0:1] +; GFX12-NEXT: v_fract_f64_e32 v[6:7], v[0:1] +; GFX12-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] +; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 -; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off +; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v6 :: v_dual_cndmask_b32 v1, 0, v7 ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index 3a4bf1c81ed58..0bb973c0e5512 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -1759,11 +1759,13 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) { ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX6-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v1 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[2:3], v[0:1] -; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 
v[4:5], v[0:1] +; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f64_i32_only_use_fract: @@ -1959,20 +1961,24 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) { } define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { -; GFX6-LABEL: test_frexp_v2f64_v2i32_only_use_fract: -; GFX6: ; %bb.0: -; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: s_mov_b32 s4, 0 -; GFX6-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-NEXT: v_frexp_mant_f64_e32 v[4:5], v[2:3] -; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-NEXT: s_setpc_b64 s[30:31] +; GFX6-SDAG-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 +; GFX6-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, v0 +; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[6:7], v[0:1] +; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] +; GFX6-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, v2 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[6:7], v[2:3] +; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
test_frexp_v2f64_v2i32_only_use_fract: ; GFX8: ; %bb.0: @@ -2005,6 +2011,21 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { ; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX12-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 +; GFX6-GISEL-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[4:5], v[2:3] +; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0 ret <2 x double> %result.0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll index 28781ae9f13c7..53660ffffa691 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s @@ -7,7 +8,7 @@ ; SI-DAG: v_add_f64 ; SI-DAG: v_add_f64 -; SI-DAG: v_cmp_gt_f64_e64 +; SI-DAG: 
v_cmp_gt_f64_e32 ; SI: v_cndmask_b32 ; SI: v_cndmask_b32 ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index af914bd4043cf..2500af1ae109f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -9,32 +9,33 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s5, 0xfffff ; SI-NEXT: s_mov_b32 s4, s6 +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s7, s3, 0xb0014 -; SI-NEXT: s_addk_i32 s7, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 -; SI-NEXT: s_and_b32 s8, s3, 0x80000000 +; SI-NEXT: s_bfe_u32 s8, s3, 0xb0014 +; SI-NEXT: s_addk_i32 s8, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 ; SI-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5] -; SI-NEXT: s_cmp_lt_i32 s7, 0 +; SI-NEXT: s_and_b32 s9, s3, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s8, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s8, s5 -; SI-NEXT: s_cmp_gt_i32 s7, 51 +; SI-NEXT: s_cselect_b32 s5, s9, s5 +; SI-NEXT: s_cmp_gt_i32 s8, 51 ; SI-NEXT: s_cselect_b32 s8, s2, s4 ; SI-NEXT: s_cselect_b32 s9, s3, s5 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1] ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec -; SI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: s_brev_b32 s0, -2 +; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfi_b32 v1, s2, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] -; 
SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -49,9 +50,10 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s4, s0 ; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1] -; CI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5 +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; CI-NEXT: v_mov_b32_e32 v2, s3 -; CI-NEXT: s_and_b64 s[2:3], s[8:9], exec +; CI-NEXT: s_and_b64 s[2:3], vcc, exec ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s0 ; CI-NEXT: v_bfi_b32 v3, s5, v3, v2 @@ -78,7 +80,7 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 +; SI-NEXT: s_brev_b32 s4, -2 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 ; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 @@ -95,13 +97,14 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: s_brev_b32 s2, -2 -; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc -; SI-NEXT: v_bfi_b32 v3, s2, v2, v3 +; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 +; SI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; SI-NEXT: v_bfi_b32 v3, s4, v2, v3 ; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -115,13 +118,14 @@ define 
amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 -; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 -; CI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; CI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 +; CI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc ; CI-NEXT: v_bfi_b32 v3, s2, v2, v3 ; CI-NEXT: v_mov_b32_e32 v2, v1 ; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] @@ -160,35 +164,37 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 -; SI-NEXT: s_brev_b32 s10, -2 -; SI-NEXT: s_and_b64 s[4:5], s[14:15], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: s_and_b32 s6, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s6, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_cselect_b32 s4, 
s8, s4 ; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3] ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 -; SI-NEXT: v_bfi_b32 v1, s10, v0, v1 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] +; SI-NEXT: v_bfi_b32 v1, s3, v0, v1 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[2:3], s[12:13], v[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v1, s6 ; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_bfi_b32 v1, s10, v1, v4 +; SI-NEXT: v_bfi_b32 v1, s3, v1, v4 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -207,14 +213,16 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[4:5], s[10:11], -v[2:3] ; CI-NEXT: v_mov_b32_e32 v1, s11 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 +; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[6:7] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 +; CI-NEXT: s_and_b64 s[4:5], vcc, exec ; CI-NEXT: v_bfi_b32 v1, s2, v8, v1 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -253,76 +261,80 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: 
v_mov_b32_e32 v1, s17 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[18:19], |v[0:1]|, 0.5 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_and_b64 s[4:5], s[18:19], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s3 -; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] -; SI-NEXT: s_and_b32 s10, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s11, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s10, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 -; SI-NEXT: s_brev_b32 s18, -2 +; SI-NEXT: s_cselect_b32 s5, s11, s5 +; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_brev_b32 s3, -2 ; SI-NEXT: s_cselect_b32 s4, s8, s4 -; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 +; SI-NEXT: v_bfi_b32 v5, s3, v0, v1 ; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[16:17], v[4:5] -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v5, s3 -; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s3 +; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 
0 +; SI-NEXT: v_mov_b32_e32 v5, s8 +; SI-NEXT: s_bfe_u32 s8, s15, 0xb0014 +; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s10 ; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[8:9] -; SI-NEXT: s_and_b32 s10, s15, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s11, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s8, 0, s8 -; SI-NEXT: s_cselect_b32 s9, s10, s9 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cselect_b32 s9, s11, s9 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: s_cselect_b32 s9, s15, s9 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] -; SI-NEXT: v_bfi_b32 v5, s18, v5, v6 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_bfi_b32 v5, s3, v5, v6 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[4:5] -; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v8, s3 -; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 -; SI-NEXT: s_addk_i32 s3, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 +; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v8, s4 +; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 +; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 ; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[4:5] ; SI-NEXT: s_and_b32 s6, s13, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_cmp_lt_i32 s10, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s6, s5 -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cmp_gt_i32 s10, 51 ; SI-NEXT: s_cselect_b32 s5, s13, s5 ; SI-NEXT: s_cselect_b32 s4, s12, s4 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: v_add_f64 v[6:7], s[12:13], -v[5:6] ; SI-NEXT: v_mov_b32_e32 v9, s15 -; SI-NEXT: v_cmp_ge_f64_e64 
s[6:7], |v[6:7]|, 0.5 -; SI-NEXT: v_bfi_b32 v5, s18, v8, v9 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec -; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] +; SI-NEXT: v_bfi_b32 v5, s3, v8, v9 +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[4:5] -; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v5, s6 ; SI-NEXT: v_mov_b32_e32 v8, s13 -; SI-NEXT: v_bfi_b32 v5, s18, v5, v8 +; SI-NEXT: v_bfi_b32 v5, s3, v5, v8 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -342,31 +354,35 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s11 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; CI-NEXT: v_add_f64 v[2:3], s[8:9], -v[6:7] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 ; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 +; CI-NEXT: s_and_b64 s[4:5], vcc, exec ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] ; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v10, s9 +; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] ; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[12:13] +; CI-NEXT: 
v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[6:7], s[12:13], -v[10:11] +; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v5, s4 -; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v12, s15 -; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; CI-NEXT: s_and_b64 s[4:5], vcc, exec ; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] @@ -407,9 +423,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s25 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_and_b64 s[4:5], s[26:27], exec +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 @@ -429,9 +446,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] ; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[24:25], v[8:9] -; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_and_b64 s[10:11], vcc, exec ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: s_bfe_u32 s8, s15, 0xb0014 @@ -449,9 +467,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, 
<8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] ; SI-NEXT: v_bfi_b32 v9, s3, v4, v5 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 @@ -469,10 +488,11 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: v_add_f64 v[4:5], s[12:13], -v[4:5] ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 +; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; SI-NEXT: v_bfi_b32 v9, s3, v6, v7 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[8:9] -; SI-NEXT: s_and_b64 s[8:9], s[10:11], exec +; SI-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014 @@ -490,10 +510,11 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_add_f64 v[4:5], s[18:19], -v[4:5] ; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 +; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; SI-NEXT: v_bfi_b32 v9, s3, v9, v10 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v12, s4 ; SI-NEXT: s_bfe_u32 s4, s17, 0xb0014 @@ -511,10 +532,11 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: v_add_f64 v[10:11], s[16:17], -v[9:10] ; SI-NEXT: 
v_mov_b32_e32 v13, s19 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5 +; SI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11] ; SI-NEXT: v_bfi_b32 v9, s3, v12, v13 ; SI-NEXT: v_add_f64 v[12:13], s[8:9], v[8:9] -; SI-NEXT: s_and_b64 s[8:9], s[10:11], exec +; SI-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v14, s8 ; SI-NEXT: s_bfe_u32 s8, s23, 0xb0014 @@ -532,10 +554,11 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: v_add_f64 v[10:11], s[22:23], -v[9:10] ; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5 +; SI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11] ; SI-NEXT: v_bfi_b32 v9, s3, v14, v15 ; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_and_b64 s[4:5], vcc, exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_bfe_u32 s4, s21, 0xb0014 @@ -553,9 +576,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v14, s4 ; SI-NEXT: v_add_f64 v[14:15], s[20:21], -v[14:15] ; SI-NEXT: v_mov_b32_e32 v16, s23 -; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5 +; SI-NEXT: v_and_b32_e32 v15, 0x7fffffff, v15 +; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[14:15] ; SI-NEXT: v_bfi_b32 v9, s3, v9, v16 -; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_and_b64 s[6:7], vcc, exec ; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[16:17], s[8:9], v[8:9] ; SI-NEXT: v_mov_b32_e32 v9, s6 @@ -574,87 +598,95 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 ; CI-NEXT: s_brev_b32 s6, -2 -; CI-NEXT: v_mov_b32_e32 v12, 0 +; CI-NEXT: v_mov_b32_e32 v4, 0 ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[4:5] -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 -; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5 +; CI-NEXT: v_add_f64 v[8:9], s[8:9], -v[6:7] +; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] +; CI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v9 +; CI-NEXT: s_and_b64 s[2:3], vcc, exec +; CI-NEXT: v_cmp_le_f64_e64 s[0:1], 0.5, v[8:9] +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v5, s11 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] +; CI-NEXT: v_bfi_b32 v5, s6, v2, v5 ; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: s_cselect_b32 s7, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v8, s11 -; CI-NEXT: s_and_b64 s[0:1], s[2:3], exec -; CI-NEXT: v_mov_b32_e32 v2, s7 -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15] -; CI-NEXT: v_bfi_b32 v13, s6, v2, v8 +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[12:13] -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_mov_b32_e32 v9, s9 -; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[6:7] -; CI-NEXT: v_bfi_b32 v13, s6, v8, v9 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 -; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[12:13] -; CI-NEXT: v_trunc_f64_e32 v[4:5], s[12:13] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[8:9], s[12:13], -v[4:5] +; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v10, s9 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v10 +; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[12:13] +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_add_f64 
v[10:11], s[12:13], -v[6:7] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v10, s0 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[8:9]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[18:19] -; CI-NEXT: v_mov_b32_e32 v11, s15 -; CI-NEXT: v_bfi_b32 v13, s6, v10, v11 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[8:9] +; CI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11] +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v12, s15 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19] +; CI-NEXT: v_bfi_b32 v5, s6, v5, v12 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[12:13] -; CI-NEXT: v_mov_b32_e32 v13, s0 +; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v14, s13 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 -; CI-NEXT: v_bfi_b32 v13, s6, v13, v14 +; CI-NEXT: v_add_f64 v[12:13], s[18:19], -v[10:11] +; CI-NEXT: v_bfi_b32 v5, s6, v5, v14 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_and_b32_e32 v13, 0x7fffffff, v13 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[12:13] +; CI-NEXT: v_add_f64 v[12:13], s[16:17], -v[14:15] +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_and_b32_e32 v13, 0x7fffffff, v13 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[12:13] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[10:11], s[16:17], -v[14:15] -; CI-NEXT: v_add_f64 v[4:5], v[4:5], v[12:13] -; CI-NEXT: v_mov_b32_e32 v13, s0 +; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s0 ; CI-NEXT: v_mov_b32_e32 v16, s19 -; CI-NEXT: v_bfi_b32 v13, s6, v13, v16 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_bfi_b32 v5, s6, v5, v16 +; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23] -; 
CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v10, s17 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v10 ; CI-NEXT: v_add_f64 v[18:19], s[22:23], -v[16:17] +; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] +; CI-NEXT: v_trunc_f64_e32 v[14:15], s[20:21] +; CI-NEXT: v_and_b32_e32 v19, 0x7fffffff, v19 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[18:19] +; CI-NEXT: v_add_f64 v[18:19], s[20:21], -v[14:15] +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_and_b32_e32 v19, 0x7fffffff, v19 +; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[18:19] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[10:11], v[8:9], v[12:13] -; CI-NEXT: v_mov_b32_e32 v8, s0 -; CI-NEXT: v_mov_b32_e32 v9, s17 -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 -; CI-NEXT: v_trunc_f64_e32 v[18:19], s[20:21] -; CI-NEXT: v_bfi_b32 v13, s6, v8, v9 -; CI-NEXT: v_add_f64 v[8:9], v[14:15], v[12:13] -; CI-NEXT: v_add_f64 v[13:14], s[20:21], -v[18:19] -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[13:14]|, 0.5 -; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: s_and_b64 s[0:1], vcc, exec +; CI-NEXT: v_mov_b32_e32 v18, s23 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v13, s2 -; CI-NEXT: v_mov_b32_e32 v14, s23 -; CI-NEXT: v_mov_b32_e32 v20, s0 +; CI-NEXT: v_bfi_b32 v5, s6, v5, v18 +; CI-NEXT: v_mov_b32_e32 v18, s0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: v_bfi_b32 v13, s6, v13, v14 -; CI-NEXT: v_mov_b32_e32 v21, s21 -; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[12:13] -; CI-NEXT: v_bfi_b32 v13, s6, v20, v21 -; CI-NEXT: v_add_f64 v[12:13], v[18:19], v[12:13] +; CI-NEXT: v_mov_b32_e32 v19, s21 +; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] +; CI-NEXT: v_bfi_b32 v5, s6, v18, v19 +; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] ; CI-NEXT: s_mov_b32 s3, 0xf000 ; 
CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 -; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll index 8036e32f90eb0..5e2412742ec69 100644 --- a/llvm/test/CodeGen/AMDGPU/lround.ll +++ b/llvm/test/CodeGen/AMDGPU/lround.ll @@ -101,7 +101,8 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x3ff00000 -; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v0, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 @@ -129,8 +130,9 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, s4 +; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, vcc_lo ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -156,9 +158,10 @@ 
define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, s0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, vcc_lo ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v0, v1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -352,7 +355,8 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v4, v1 ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -397,8 +401,9 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4 +; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], 
v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] @@ -431,12 +436,12 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0 +; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v5, 0x7fffffff, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -643,7 +648,8 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 +; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v4, v1 ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -688,8 +694,9 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 -; 
GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4 +; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 +; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] @@ -722,12 +729,12 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0 +; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v5, 0x7fffffff, v5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index ba9dd8f7c2468..5d0e4bf1d34d0 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -559,16 +559,19 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; 
GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 @@ -1943,16 +1946,19 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[12:13] ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; implicit-def: $sgpr12 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[8:9] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[12:13] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, 
v3, s[8:9] ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll index 59a1fe041bf90..3b9462cd690d5 100644 --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -1125,16 +1125,18 @@ define double @v_roundeven_f64(double %x) { ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG_GFX6-NEXT: s_brev_b32 s6, -2 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0x43300000 -; SDAG_GFX6-NEXT: v_bfi_b32 v3, s6, v2, v1 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0x43300000 +; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v4, v1 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0 ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1 -; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[0:1], v[2:3] +; SDAG_GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] +; SDAG_GFX6-NEXT: v_and_b32_e32 v3, 0x7fffffff, v1 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, v0 ; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff -; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3] -; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] -; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] +; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[2:3] +; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX7-LABEL: v_roundeven_f64: @@ -1215,9 +1217,10 @@ define double @v_roundeven_f64_fneg(double %x) { ; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0 ; SDAG_GFX6-NEXT: v_add_f64 v[4:5], -v[0:1], v[2:3] ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1 +; SDAG_GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff ; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3] -; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, 
v2, v0, vcc ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1305,20 +1308,24 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) { ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG_GFX6-NEXT: s_brev_b32 s6, -2 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v8, 0x43300000 -; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v8, v1 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v9, 0x43300000 +; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v9, v1 ; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0 ; SDAG_GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1 -; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff ; SDAG_GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5] -; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG_GFX6-NEXT: v_and_b32_e32 v8, 0x7fffffff, v1 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v7, v0 +; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff +; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[7:8] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v8, v3 +; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v9, v3 ; SDAG_GFX6-NEXT: v_add_f64 v[7:8], v[2:3], v[4:5] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[7:8], -v[4:5] -; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5] +; SDAG_GFX6-NEXT: v_and_b32_e32 v7, 0x7fffffff, v3 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v6, v2 +; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[6:7] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/select-undef.ll b/llvm/test/CodeGen/AMDGPU/select-undef.ll index ec3781fbf0fc4..f497752994852 100644 --- a/llvm/test/CodeGen/AMDGPU/select-undef.ll +++ b/llvm/test/CodeGen/AMDGPU/select-undef.ll @@ -841,3 +841,23 @@ ret: ret void } +define i64 @poison_should_freeze(i1 %cond1, i32 %val, i16 %val2, i64 %a, i64 %b) { +; GCN-LABEL: poison_should_freeze: +; GCN: ; 
%bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_mov_b32_e32 v7, 0x5040100 +; GCN-NEXT: v_perm_b32 v2, v2, s4, v7 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GCN-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc +; GCN-NEXT: s_setpc_b64 s[30:31] + %poisonv = insertelement <2 x i16> poison, i16 %val2, i32 1 + %poison = bitcast <2 x i16> %poisonv to i32 + %cond2 = select i1 %cond1, i32 %poison, i32 %val + %cmp = icmp eq i32 %cond2, 0 + %select = select i1 %cmp, i64 %a, i64 %b + ret i64 %select +} diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 6da7d1b7ee868..a6b8ea3963b38 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -1819,7 +1819,7 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: v_mul_hi_u32 v1, v0, v1 ; TAHITI-NEXT: v_mul_lo_u32 v1, v1, v2 ; TAHITI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 -; TAHITI-NEXT: v_subrev_i32_e32 v1, vcc, v2, v0 +; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 ; TAHITI-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2 ; TAHITI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; TAHITI-NEXT: v_sub_i32_e32 v1, vcc, v0, v2 @@ -6232,7 +6232,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_mul_hi_u32 v8, v14, v8 ; TONGA-NEXT: v_mul_lo_u32 v8, v8, v10 ; TONGA-NEXT: v_sub_u32_e32 v8, vcc, v14, v8 -; TONGA-NEXT: v_subrev_u32_e32 v9, vcc, v10, v8 +; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 ; TONGA-NEXT: v_cmp_ge_u32_e32 vcc, v8, v10 ; TONGA-NEXT: v_cndmask_b32_e32 v8, v8, v9, vcc ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, v8, v10 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 47dfa9f4fc2d3..33c2ce628e108 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ 
b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -921,45 +921,47 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_ashr_i64 s[10:11], s[2:3], 31 -; GCN-NEXT: s_ashr_i64 s[6:7], s[4:5], 31 -; GCN-NEXT: s_ashr_i32 s4, s5, 31 -; GCN-NEXT: s_add_u32 s6, s6, s4 -; GCN-NEXT: s_mov_b32 s5, s4 -; GCN-NEXT: s_addc_u32 s7, s7, s4 -; GCN-NEXT: s_xor_b64 s[8:9], s[6:7], s[4:5] +; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 31 +; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 31 +; GCN-NEXT: s_ashr_i32 s6, s5, 31 +; GCN-NEXT: s_add_u32 s4, s4, s6 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: s_addc_u32 s5, s5, s6 +; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], s[6:7] ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9 -; GCN-NEXT: s_sub_u32 s2, 0, s8 -; GCN-NEXT: s_subb_u32 s4, 0, s9 -; GCN-NEXT: s_ashr_i32 s12, s3, 31 +; GCN-NEXT: s_sub_u32 s4, 0, s8 +; GCN-NEXT: s_subb_u32 s5, 0, s9 +; GCN-NEXT: s_ashr_i32 s10, s3, 31 ; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0 ; GCN-NEXT: v_rcp_f32_e32 v0, v0 -; GCN-NEXT: s_mov_b32 s13, s12 -; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_add_u32 s2, s2, s10 +; GCN-NEXT: s_mov_b32 s11, s10 +; GCN-NEXT: s_addc_u32 s3, s3, s10 ; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GCN-NEXT: s_xor_b64 s[12:13], s[2:3], s[10:11] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v5, s5, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v5, s4, v0 -; GCN-NEXT: 
v_mul_lo_u32 v4, s2, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GCN-NEXT: v_mul_hi_u32 v3, v0, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v6, v0, v2 -; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 -; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; GCN-NEXT: v_mul_hi_u32 v7, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v6, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc +; GCN-NEXT: v_mul_hi_u32 v7, v1, v2 +; GCN-NEXT: v_mul_lo_u32 v2, v1, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v5, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v7, vcc @@ -967,12 +969,12 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s2, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s2, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s4, v0 -; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: v_mul_lo_u32 v2, s4, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s5, v0 +; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_mul_lo_u32 v3, s2, v0 +; GCN-NEXT: v_mul_lo_u32 v3, s4, v0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_mul_lo_u32 v6, v0, v2 ; GCN-NEXT: v_mul_hi_u32 v7, v0, v3 @@ -988,20 +990,18 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: s_add_u32 s2, s10, s12 ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GCN-NEXT: s_addc_u32 s3, s11, s12 ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GCN-NEXT: s_xor_b64 s[10:11], s[2:3], s[12:13] -; 
GCN-NEXT: v_mul_lo_u32 v2, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v3, s10, v0 -; GCN-NEXT: v_mul_hi_u32 v4, s10, v1 -; GCN-NEXT: v_mul_hi_u32 v5, s11, v1 -; GCN-NEXT: v_mul_lo_u32 v1, s11, v1 +; GCN-NEXT: v_mul_lo_u32 v2, s12, v1 +; GCN-NEXT: v_mul_hi_u32 v3, s12, v0 +; GCN-NEXT: v_mul_hi_u32 v4, s12, v1 +; GCN-NEXT: v_mul_hi_u32 v5, s13, v1 +; GCN-NEXT: v_mul_lo_u32 v1, s13, v1 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_lo_u32 v4, s11, v0 -; GCN-NEXT: v_mul_hi_u32 v0, s11, v0 +; GCN-NEXT: v_mul_lo_u32 v4, s13, v0 +; GCN-NEXT: v_mul_hi_u32 v0, s13, v0 +; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v0, vcc, v3, v0, vcc ; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v5, vcc @@ -1013,9 +1013,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_mul_lo_u32 v0, s8, v0 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s11, v1 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, s13, v1 ; GCN-NEXT: v_mov_b32_e32 v3, s9 -; GCN-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 +; GCN-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s8, v0 ; GCN-NEXT: v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1] @@ -1030,7 +1030,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1] ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 ; GCN-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] -; GCN-NEXT: v_mov_b32_e32 v4, s11 +; GCN-NEXT: v_mov_b32_e32 v4, s13 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s9, v1 ; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc @@ -1042,10 +1042,10 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GCN-NEXT: 
v_cndmask_b32_e32 v0, v0, v3, vcc ; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; GCN-NEXT: v_xor_b32_e32 v0, s12, v0 -; GCN-NEXT: v_xor_b32_e32 v1, s12, v1 -; GCN-NEXT: v_mov_b32_e32 v2, s12 -; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s12, v0 +; GCN-NEXT: v_xor_b32_e32 v0, s10, v0 +; GCN-NEXT: v_xor_b32_e32 v1, s10, v1 +; GCN-NEXT: v_mov_b32_e32 v2, s10 +; GCN-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; GCN-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll index f0829b53168d9..c12265bd7f372 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll @@ -3924,37 +3924,37 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; 
GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 
v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4028,37 +4028,37 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[8:9] ; 
GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4132,47 +4132,49 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], 
v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; 
GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, 
v14, vcc +; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4242,49 +4244,49 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 
-; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; 
GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4346,50 +4348,49 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, 
v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, 
v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4453,58 +4454,58 @@ define i64 @test_vector_reduce_smax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; 
GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, 
s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[2:3], v[6:7] 
-; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_gt_i64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll index e67420562e257..5056747c33cc2 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll @@ -3924,37 +3924,37 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: 
v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, 
v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4028,37 +4028,37 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: 
v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4132,47 +4132,49 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[0:1], 
v[16:17] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, 
vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4242,49 +4244,49 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[20:21] +; 
GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; 
GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4346,50 +4348,49 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: 
v_cmp_lt_i64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) 
| instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; 
GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4453,58 +4454,58 @@ define i64 @test_vector_reduce_smin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[10:11], v[26:27] ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, 
v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; 
GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_lt_i64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index 92993d07b4f8f..184c80765430c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -3805,37 +3805,37 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: 
v_cmp_gt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 
vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -3909,37 +3909,37 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: 
v_cmp_gt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, 
v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -4013,47 +4013,49 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, 
s[0:1] -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: 
v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -4123,49 +4125,49 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[10:11], v[26:27] ; 
GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s6, v[0:1], v[8:9] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, 
v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: 
v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -4227,50 +4229,49 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 
v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: 
v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4334,58 +4335,58 @@ define i64 @test_vector_reduce_umax_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[4:5], v[20:21] +; 
GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: 
v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll 
index 2bcee373d9247..e3a7ae5fd0256 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -3544,37 +3544,37 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX7-SDAG: ; %bb.0: ; %entry ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[26:27] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX7-SDAG-NEXT: 
v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX7-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -3648,37 +3648,37 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[26:27] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[12:13], v[28:29] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] -; 
GFX8-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[10:11], v[26:27] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[16:17] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v11, v27, v11, s[4:5] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[28:29] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v10, v26, v10, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[20:21] -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[0:1], v[8:9] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[8:9] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc -; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[4:5] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[22:23] +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[12:13] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc +; GFX8-SDAG-NEXT: 
v_cmp_lt_u64_e32 vcc, v[2:3], v[18:19] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc +; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[10:11] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc ; GFX8-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX8-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[6:7] ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) @@ -3752,47 +3752,49 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-SDAG-NEXT: scratch_load_dword v31, off, s32 -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[0:1], v[16:17] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[8:9], v[12:13], v[28:29] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[10:11], v[4:5], v[20:21] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[26:27] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[2:3], v[18:19] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[6:7] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[8:9] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s[10:11] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 
v10, v26, v10, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[12:13] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[8:9] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[6:7], v[22:23] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[0:1] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[0:1] -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[2:3], v[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[24:25] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[0:1], v[16:17] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[12:13], v[28:29] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[8:9], v[10:11], v[26:27] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[10:11], v[2:3], v[18:19] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v17, v21, v5, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v5, v23, v7, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v27, v11, s[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[10:11] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v4, v22, v6, s[6:7] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v26, v10, s[8:9] +; 
GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[10:11] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[6:7] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[0:1], v[16:17], v[12:13] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e64 s[2:3], v[0:1], v[8:9] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v17, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v16, s[0:1] +; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[14:15], v[30:31] ; GFX9-SDAG-NEXT: s_nop 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v31, v15, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v30, v14, vcc -; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[6:7], v[4:5] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[2:3] -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v7, v31, v15, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v6, v30, v14, vcc +; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-SDAG-NEXT: s_nop 1 +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; GFX9-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[2:3], v[4:5] ; GFX9-SDAG-NEXT: s_nop 1 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc @@ -3862,49 +3864,49 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[24:25] +; 
GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[0:1], v[16:17] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s6, v[6:7], v[22:23] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[18:19] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[6:7], v[22:23] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s6, v[8:9], v[24:25] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s7, v[0:1], v[16:17] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s8, v[12:13], v[28:29] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s5 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s6 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[20:21] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s8 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s6, v[0:1], v[8:9] -; 
GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s6 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s6 -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[14:15], v[30:31] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v15, v31, v15, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v14, v30, v14, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s6 +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[0:1], v[8:9] ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[10:11] -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s5 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 ; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[0:1], v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s5 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s4 -; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s5 +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[14:15], v[30:31] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v15, v31, v15, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[6:7], v[14:15] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s4 +; 
GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[6:7] +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 ; GFX10-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo ; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo @@ -3966,50 +3968,49 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[16:17] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[6:7], v[22:23] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], v[26:27] ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[18:19] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[22:23] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[8:9], v[24:25] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s3, v[0:1], v[16:17] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[12:13], v[28:29] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[4:5], v[20:21] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[4:5], v[20:21] ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; 
GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[0:1], v[8:9] ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[10:11] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[0:1], v[8:9] +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[14:15] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[6:7], v[14:15] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; 
GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[4:5] -; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 -; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] +; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX11-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX11-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4073,58 +4074,58 @@ define i64 @test_vector_reduce_umin_v16i64(<16 x i64> %v) { ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: scratch_load_b32 v31, off, s32 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[8:9], v[24:25] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[16:17] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[6:7], v[22:23] +; GFX12-SDAG-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v9, v25, v9 :: v_dual_cndmask_b32 v8, v24, v8 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[10:11], 
v[26:27] ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[18:19] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[22:23] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[8:9], v[24:25] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s3, v[0:1], v[16:17] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s4, v[12:13], v[28:29] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s1 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[4:5], v[20:21] +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s2 ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v11, v27, v11 :: v_dual_cndmask_b32 v10, v26, v10 -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s5, v[4:5], v[20:21] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v23, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v9, v25, v9, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v17, v1, s3 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v13, v29, v13, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s5 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v12, v28, v12, s4 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s5 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v8, v24, v8, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v16, v0, s3 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v22, v6, s2 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[10:11] -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s2, v[0:1], v[8:9] ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s1 
; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 ; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v9, v1, s2 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s2 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v8, v0, s1 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[14:15], v[30:31] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v15, v31, v15 :: v_dual_cndmask_b32 v14, v30, v14 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[4:5], v[12:13] ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s1, v[6:7], v[14:15] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[6:7], v[14:15] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v5, v13, v5 :: v_dual_cndmask_b32 v4, v12, v4 ; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s1 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s1 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v7, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v6, s0 ; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[0:1], v[4:5] -; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[6:7] -; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 -; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v0, v4, v0, s0 +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX12-SDAG-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[6:7] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v3, v7, v3 :: v_dual_cndmask_b32 v2, v6, v2 +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v1, v5, v1 :: v_dual_cndmask_b32 v0, v4, v0 +; GFX12-SDAG-NEXT: s_wait_alu 0xf1ff +; 
GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX12-SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 ; GFX12-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] ; GFX12-SDAG-NEXT: s_wait_alu 0xfffd ; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_cndmask_b32 v1, v3, v1 From 5fe9b5235298dc67864e947ea23df201aec177fc Mon Sep 17 00:00:00 2001 From: Shoreshen <372660931@qq.com> Date: Fri, 18 Jul 2025 14:17:15 +0800 Subject: [PATCH 291/813] Add FABS to canCreateUndefOrPoison (#149440) FABS will not create undef/poison, add it into canCreateUndefOrPoison return false --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 1 + llvm/test/CodeGen/AMDGPU/fmaximum3.ll | 43 +- llvm/test/CodeGen/AMDGPU/fminimum3.ll | 43 +- llvm/test/CodeGen/AMDGPU/fnearbyint.ll | 21 +- llvm/test/CodeGen/AMDGPU/fract-match.ll | 57 ++- llvm/test/CodeGen/AMDGPU/freeze.ll | 240 ++++++++++- llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 57 +-- llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll | 2 +- llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll | 381 ++++++++---------- llvm/test/CodeGen/AMDGPU/lround.ll | 51 +-- llvm/test/CodeGen/AMDGPU/roundeven.ll | 37 +- 11 files changed, 537 insertions(+), 396 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 682d93d0abf3f..56c8bb441ddf8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5569,6 +5569,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::BUILD_VECTOR: case ISD::BUILD_PAIR: case ISD::SPLAT_VECTOR: + case ISD::FABS: return false; case ISD::ABS: diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index e5fe4160a4b05..069a47ec97bfe 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -3272,10 +3272,9 @@ 
define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3307,10 +3306,9 @@ define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3345,12 +3343,11 @@ define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3377,17 +3374,14 @@ define double @v_fmaximum3_f64_fabs_all(double %a, double %b, 
double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_max_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_max_f64 v[6:7], |v[0:1]|, |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], |v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3452,17 +3446,14 @@ define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fmaximum3_f64_fneg_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_max_f64 v[6:7], -v[0:1], -v[2:3] +; GFX9-NEXT: v_max_f64 v[6:7], -|v[0:1]|, -|v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], -|v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v3, v8, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 6873c617c64a1..d8746b58b16b7 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -3272,10 +3272,9 @@ define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, v[2:3] ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3307,10 +3306,9 @@ define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[2:3]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc @@ -3345,12 +3343,11 @@ define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) { ; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 
v[2:3], v[0:1], |v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3377,17 +3374,14 @@ define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_min_f64 v[6:7], v[0:1], v[2:3] +; GFX9-NEXT: v_min_f64 v[6:7], |v[0:1]|, |v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], |v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], |v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc @@ -3452,17 +3446,14 @@ define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) { ; GFX9-LABEL: v_fminimum3_f64_fneg_fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX9-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; GFX9-NEXT: v_min_f64 v[6:7], -v[0:1], -v[2:3] +; GFX9-NEXT: v_min_f64 v[6:7], -|v[0:1]|, -|v[2:3]| ; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[2:3] -; GFX9-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, -|v[0:1]|, -|v[2:3]| +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc -; GFX9-NEXT: 
v_min_f64 v[2:3], v[0:1], -v[4:5] -; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], -|v[4:5]| +; GFX9-NEXT: v_cmp_u_f64_e64 vcc, v[0:1], -|v[4:5]| ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index 193cee967f3c4..e9fd6119d0c36 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -223,9 +223,8 @@ define amdgpu_kernel void @nearbyint_f64(ptr addrspace(1) %out, double %in) { ; SI-NEXT: v_bfi_b32 v1, s8, v1, v6 ; SI-NEXT: v_mov_b32_e32 v7, s2 ; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1] -; SI-NEXT: s_bitset0_b32 s3, 31 ; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[2:3] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[2:3] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -285,16 +284,14 @@ define amdgpu_kernel void @nearbyint_v2f64(ptr addrspace(1) %out, <2 x double> % ; SI-NEXT: v_mov_b32_e32 v9, s5 ; SI-NEXT: v_mov_b32_e32 v10, s4 ; SI-NEXT: v_add_f64 v[2:3], s[6:7], v[0:1] -; SI-NEXT: s_bitset0_b32 s7, 31 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1] ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9 -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc ; SI-NEXT: v_add_f64 v[6:7], s[4:5], v[0:1] -; SI-NEXT: s_bitset0_b32 s5, 31 ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[4:5] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -368,30 +365,26 @@ define amdgpu_kernel void 
@nearbyint_v4f64(ptr addrspace(1) %out, <4 x double> % ; SI-NEXT: v_mov_b32_e32 v14, s5 ; SI-NEXT: v_mov_b32_e32 v15, s4 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5] -; SI-NEXT: s_bitset0_b32 s3, 31 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v7 -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[8:9] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc ; SI-NEXT: v_add_f64 v[0:1], s[0:1], v[4:5] -; SI-NEXT: s_bitset0_b32 s1, 31 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v12 -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[8:9] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc ; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5] -; SI-NEXT: s_bitset0_b32 s7, 31 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5] ; SI-NEXT: v_bfi_b32 v5, s14, v10, v14 -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[8:9] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc ; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc ; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5] -; SI-NEXT: s_bitset0_b32 s5, 31 ; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5] -; SI-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[8:9] +; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9] ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll index f50944cc8a5b1..d97ea042b50fc 100644 --- a/llvm/test/CodeGen/AMDGPU/fract-match.ll +++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll @@ -2356,11 +2356,10 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX6-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX6-NEXT: v_min_f64 v[6:7], v[6:7], s[8:9] ; 
GFX6-NEXT: s_mov_b32 s8, 0 -; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc -; GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX6-NEXT: s_mov_b32 s9, 0x7ff00000 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc -; GFX6-NEXT: v_cmp_neq_f64_e32 vcc, s[8:9], v[0:1] +; GFX6-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[8:9] ; GFX6-NEXT: s_mov_b32 s6, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 ; GFX6-NEXT: s_mov_b32 s4, s6 @@ -2375,18 +2374,17 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_mov_b32 s4, 0 -; GFX7-NEXT: v_floor_f64_e32 v[4:5], v[0:1] -; GFX7-NEXT: v_fract_f64_e32 v[6:7], v[0:1] -; GFX7-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX7-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX7-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] +; GFX7-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX7-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX7-NEXT: v_floor_f64_e32 v[6:7], v[0:1] ; GFX7-NEXT: s_mov_b32 s6, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s4, s6 ; GFX7-NEXT: s_mov_b32 s5, s6 -; GFX7-NEXT: buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX7-NEXT: buffer_store_dwordx2 v[6:7], v[2:3], s[4:7], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -2394,27 +2392,25 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_floor_f64_e32 v[4:5], v[0:1] -; GFX8-NEXT: v_fract_f64_e32 v[6:7], v[0:1] -; GFX8-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; GFX8-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX8-NEXT: v_cmp_neq_f64_e32 vcc, s[4:5], v[0:1] -; 
GFX8-NEXT: global_store_dwordx2 v[2:3], v[4:5], off -; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc +; GFX8-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX8-NEXT: v_cmp_neq_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX8-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc +; GFX8-NEXT: global_store_dwordx2 v[2:3], v[6:7], off ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: safe_math_fract_f64: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_floor_f64_e32 v[4:5], v[0:1] -; GFX11-NEXT: v_fract_f64_e32 v[6:7], v[0:1] -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] -; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off -; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v6 :: v_dual_cndmask_b32 v1, 0, v7 +; GFX11-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX11-NEXT: v_cmp_neq_f64_e64 vcc_lo, 0x7ff00000, |v[0:1]| +; GFX11-NEXT: v_floor_f64_e32 v[6:7], v[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 +; GFX11-NEXT: global_store_b64 v[2:3], v[6:7], off ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: safe_math_fract_f64: @@ -2424,14 +2420,13 @@ define double @safe_math_fract_f64(double %x, ptr addrspace(1) writeonly capture ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_floor_f64_e32 v[4:5], v[0:1] -; GFX12-NEXT: v_fract_f64_e32 v[6:7], v[0:1] -; GFX12-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cmp_neq_f64_e32 vcc_lo, 0x7ff00000, v[0:1] -; GFX12-NEXT: global_store_b64 v[2:3], v[4:5], off +; GFX12-NEXT: v_fract_f64_e32 v[4:5], v[0:1] +; GFX12-NEXT: v_cmp_neq_f64_e64 vcc_lo, 
0x7ff00000, |v[0:1]| +; GFX12-NEXT: v_floor_f64_e32 v[6:7], v[0:1] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v6 :: v_dual_cndmask_b32 v1, 0, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_cndmask_b32 v0, 0, v4 :: v_dual_cndmask_b32 v1, 0, v5 +; GFX12-NEXT: global_store_b64 v[2:3], v[6:7], off ; GFX12-NEXT: s_setpc_b64 s[30:31] entry: %floor = tail call double @llvm.floor.f64(double %x) diff --git a/llvm/test/CodeGen/AMDGPU/freeze.ll b/llvm/test/CodeGen/AMDGPU/freeze.ll index ac438062ae208..9a347d71bf430 100644 --- a/llvm/test/CodeGen/AMDGPU/freeze.ll +++ b/llvm/test/CodeGen/AMDGPU/freeze.ll @@ -14592,5 +14592,241 @@ define void @freeze_v4i1_vcc(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) { store <4 x i1> %freeze, ptr addrspace(1) %ptrb ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX8-SDAG: {{.*}} + +define double @freeze_fabs_double(float %a, double %b, double %c) { +; GFX6-SDAG-LABEL: freeze_fabs_double: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX6-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2] +; GFX6-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4] +; GFX6-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: freeze_fabs_double: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX6-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX6-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX6-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-SDAG-LABEL: freeze_fabs_double: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX7-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2] +; GFX7-SDAG-NEXT: 
v_add_f64 v[2:3], |v[4:5]|, v[3:4] +; GFX7-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: freeze_fabs_double: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX7-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX7-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX7-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: freeze_fabs_double: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX8-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2] +; GFX8-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4] +; GFX8-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: freeze_fabs_double: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX8-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX8-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX8-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: freeze_fabs_double: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX9-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX9-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: freeze_fabs_double: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2] +; GFX10-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4] +; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-SDAG-NEXT: s_setpc_b64 
s[30:31] +; +; GFX10-GISEL-LABEL: freeze_fabs_double: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX10-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX10-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-LABEL: freeze_fabs_double: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-SDAG-NEXT: v_add_f64 v[0:1], |v[4:5]|, v[1:2] +; GFX11-SDAG-NEXT: v_add_f64 v[2:3], |v[4:5]|, v[3:4] +; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: freeze_fabs_double: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_and_b32_e32 v5, 0x7fffffff, v0 +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[4:5], v[1:2] +; GFX11-GISEL-NEXT: v_add_f64 v[2:3], v[4:5], v[3:4] +; GFX11-GISEL-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %pv = insertelement <2 x float> poison, float %a, i32 1 + %d = bitcast <2 x float> %pv to double + %r = call double @llvm.fabs.f64(double %d) + %fr = freeze double %r + %add1 = fadd double %fr, %b + %add2 = fadd double %fr, %c + %add = fadd double %add1, %add2 + ret double %add +} + +define <4 x float> @freeze_fabs_v4float(<4 x float> %A, <4 x float> %B) { +; GFX6-SDAG-LABEL: freeze_fabs_v4float: +; GFX6-SDAG: ; %bb.0: +; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX6-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX6-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX6-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-GISEL-LABEL: freeze_fabs_v4float: +; GFX6-GISEL: ; %bb.0: +; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX6-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX6-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX6-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX6-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX6-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX6-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX6-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX6-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-SDAG-LABEL: freeze_fabs_v4float: +; GFX7-SDAG: ; %bb.0: +; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX7-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX7-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX7-GISEL-LABEL: freeze_fabs_v4float: +; GFX7-GISEL: ; %bb.0: +; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX7-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX7-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX7-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-SDAG-LABEL: freeze_fabs_v4float: +; GFX8-SDAG: ; %bb.0: +; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX8-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX8-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-GISEL-LABEL: freeze_fabs_v4float: +; GFX8-GISEL: ; %bb.0: +; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 
0x7fffffff, v0 +; GFX8-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX8-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX8-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX8-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX8-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX8-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-GISEL-LABEL: freeze_fabs_v4float: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX9-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: freeze_fabs_v4float: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: freeze_fabs_v4float: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX10-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX10-GISEL-NEXT: s_setpc_b64 
s[30:31] +; +; GFX11-SDAG-LABEL: freeze_fabs_v4float: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-SDAG-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: freeze_fabs_v4float: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 +; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0x7fffffff, v2 +; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] + %A0 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %A) + %F1 = freeze <4 x float> %A0 + %A1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %F1) + ret <4 x float> %A1 +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll index 0bb973c0e5512..3a4bf1c81ed58 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll @@ -1759,13 +1759,11 @@ define double @test_frexp_f64_i32_only_use_fract(double %a) { ; GFX6-SDAG: ; %bb.0: ; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 -; GFX6-SDAG-NEXT: v_and_b32_e32 v3, 0x7fffffff, v1 -; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] -; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[2:3] -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[2:3], 
v[0:1] +; GFX6-SDAG-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc ; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_frexp_f64_i32_only_use_fract: @@ -1961,24 +1959,20 @@ define { <2 x double>, <2 x i32> } @test_frexp_v2f64_v2i32(<2 x double> %a) { } define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { -; GFX6-SDAG-LABEL: test_frexp_v2f64_v2i32_only_use_fract: -; GFX6-SDAG: ; %bb.0: -; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-SDAG-NEXT: s_mov_b32 s4, 0 -; GFX6-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v1 -; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, v0 -; GFX6-SDAG-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[6:7], v[0:1] -; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] -; GFX6-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v3 -; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, v2 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX6-SDAG-NEXT: v_frexp_mant_f64_e32 v[6:7], v[2:3] -; GFX6-SDAG-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[4:5] -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX6-LABEL: test_frexp_v2f64_v2i32_only_use_fract: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, 0 +; GFX6-NEXT: s_mov_b32 s5, 0x7ff00000 +; GFX6-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] +; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX6-NEXT: v_frexp_mant_f64_e32 v[4:5], v[2:3] +; GFX6-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] +; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX6-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
test_frexp_v2f64_v2i32_only_use_fract: ; GFX8: ; %bb.0: @@ -2011,21 +2005,6 @@ define <2 x double> @test_frexp_v2f64_v2i32_only_use_fract(<2 x double> %a) { ; GFX12-NEXT: v_frexp_mant_f64_e32 v[0:1], v[0:1] ; GFX12-NEXT: v_frexp_mant_f64_e32 v[2:3], v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] -; -; GFX6-GISEL-LABEL: test_frexp_v2f64_v2i32_only_use_fract: -; GFX6-GISEL: ; %bb.0: -; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-GISEL-NEXT: s_mov_b32 s4, 0 -; GFX6-GISEL-NEXT: s_mov_b32 s5, 0x7ff00000 -; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[4:5], v[0:1] -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[0:1]|, s[4:5] -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX6-GISEL-NEXT: v_frexp_mant_f64_e32 v[4:5], v[2:3] -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e64 vcc, |v[2:3]|, s[4:5] -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31] %result = call { <2 x double>, <2 x i32> } @llvm.frexp.v2f64.v2i32(<2 x double> %a) %result.0 = extractvalue { <2 x double>, <2 x i32> } %result, 0 ret <2 x double> %result.0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll index 53660ffffa691..c6cf6f64db1eb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -8,7 +8,7 @@ ; SI-DAG: v_add_f64 ; SI-DAG: v_add_f64 -; SI-DAG: v_cmp_gt_f64_e32 +; SI-DAG: v_cmp_gt_f64_e64 ; SI: v_cndmask_b32 ; SI: v_cndmask_b32 ; SI: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 2500af1ae109f..355f77acfd302 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -9,33 +9,32 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_mov_b32 s5, 0xfffff ; SI-NEXT: 
s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_bfe_u32 s8, s3, 0xb0014 -; SI-NEXT: s_addk_i32 s8, 0xfc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s8 +; SI-NEXT: s_bfe_u32 s7, s3, 0xb0014 +; SI-NEXT: s_addk_i32 s7, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; SI-NEXT: s_and_b32 s8, s3, 0x80000000 ; SI-NEXT: s_andn2_b64 s[4:5], s[2:3], s[4:5] -; SI-NEXT: s_and_b32 s9, s3, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s8, 0 +; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s9, s5 -; SI-NEXT: s_cmp_gt_i32 s8, 51 +; SI-NEXT: s_cselect_b32 s5, s8, s5 +; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: s_cselect_b32 s8, s2, s4 ; SI-NEXT: s_cselect_b32 s9, s3, s5 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], -v[0:1] ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] -; SI-NEXT: s_brev_b32 s0, -2 -; SI-NEXT: s_and_b64 s[10:11], vcc, exec -; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_bfi_b32 v1, s0, v0, v1 +; SI-NEXT: v_bfi_b32 v1, s2, v0, v1 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], v[0:1] +; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -50,10 +49,9 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 { ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[2:3] ; CI-NEXT: s_mov_b32 s4, s0 ; CI-NEXT: v_add_f64 v[2:3], s[2:3], -v[0:1] -; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] +; CI-NEXT: v_cmp_ge_f64_e64 s[8:9], |v[2:3]|, 0.5 ; 
CI-NEXT: v_mov_b32_e32 v2, s3 -; CI-NEXT: s_and_b64 s[2:3], vcc, exec +; CI-NEXT: s_and_b64 s[2:3], s[8:9], exec ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v3, s0 ; CI-NEXT: v_bfi_b32 v3, s5, v3, v2 @@ -78,12 +76,13 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[2:3] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_movk_i32 s4, 0xfc01 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_brev_b32 s4, -2 +; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4 ; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; SI-NEXT: v_not_b32_e32 v5, v5 @@ -97,14 +96,13 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 -; SI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] -; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc -; SI-NEXT: v_bfi_b32 v3, s4, v2, v3 +; SI-NEXT: s_brev_b32 s2, -2 +; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; SI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc +; SI-NEXT: v_bfi_b32 v3, s2, v2, v3 ; SI-NEXT: v_mov_b32_e32 v2, v1 ; SI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] ; SI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; @@ -118,14 +116,13 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_mov_b64 s[4:5], s[2:3] ; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 +; CI-NEXT: 
v_mov_b32_e32 v8, 0x3ff00000 ; CI-NEXT: s_brev_b32 s2, -2 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3] ; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; CI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 -; CI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] -; CI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc +; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 +; CI-NEXT: v_cndmask_b32_e32 v2, 0, v8, vcc ; CI-NEXT: v_bfi_b32 v3, s2, v2, v3 ; CI-NEXT: v_mov_b32_e32 v2, v1 ; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3] @@ -164,37 +161,35 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] -; SI-NEXT: s_brev_b32 s3, -2 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 -; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 +; SI-NEXT: v_cmp_ge_f64_e64 s[14:15], |v[0:1]|, 0.5 +; SI-NEXT: s_brev_b32 s10, -2 +; SI-NEXT: s_and_b64 s[4:5], s[14:15], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] ; SI-NEXT: s_and_b32 s6, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s6, s5 -; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s4, s8, s4 ; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: v_add_f64 v[2:3], s[8:9], -v[2:3] ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_and_b32_e32 v3, 
0x7fffffff, v3 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] -; SI-NEXT: v_bfi_b32 v1, s3, v0, v1 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[2:3]|, 0.5 +; SI-NEXT: v_bfi_b32 v1, s10, v0, v1 +; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SI-NEXT: v_mov_b32_e32 v0, 0 -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[2:3], s[12:13], v[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_mov_b32_e32 v4, s9 -; SI-NEXT: v_bfi_b32 v1, s3, v1, v4 +; SI-NEXT: v_bfi_b32 v1, s10, v1, v4 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -213,16 +208,14 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in) ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[4:5], s[10:11], -v[2:3] ; CI-NEXT: v_mov_b32_e32 v1, s11 -; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 ; CI-NEXT: v_add_f64 v[4:5], s[8:9], -v[6:7] -; CI-NEXT: s_and_b64 s[4:5], vcc, exec -; CI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v8, s4 -; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5 ; CI-NEXT: v_bfi_b32 v1, s2, v8, v1 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1] ; CI-NEXT: v_mov_b32_e32 v1, s4 @@ -261,80 +254,76 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s17 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: 
v_cmp_ge_f64_e64 s[18:19], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 -; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 +; SI-NEXT: s_and_b64 s[4:5], s[18:19], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_bfe_u32 s3, s9, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[8:9], s[4:5] -; SI-NEXT: s_and_b32 s11, s9, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_and_b32 s10, s9, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 -; SI-NEXT: s_cselect_b32 s5, s11, s5 -; SI-NEXT: s_cmp_gt_i32 s10, 51 -; SI-NEXT: s_brev_b32 s3, -2 +; SI-NEXT: s_cselect_b32 s5, s10, s5 +; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_brev_b32 s18, -2 ; SI-NEXT: s_cselect_b32 s4, s8, s4 -; SI-NEXT: v_bfi_b32 v5, s3, v0, v1 +; SI-NEXT: v_bfi_b32 v5, s18, v0, v1 ; SI-NEXT: s_cselect_b32 s5, s9, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] ; SI-NEXT: v_add_f64 v[2:3], s[16:17], v[4:5] -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v6, s9 -; SI-NEXT: s_and_b64 s[10:11], vcc, exec -; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v5, s8 -; SI-NEXT: s_bfe_u32 s8, s15, 0xb0014 -; SI-NEXT: s_add_i32 s10, s8, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s10 +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: s_bfe_u32 s3, s15, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[8:9], s[6:7], s3 ; SI-NEXT: s_andn2_b64 s[8:9], s[14:15], s[8:9] -; SI-NEXT: 
s_and_b32 s11, s15, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_and_b32 s10, s15, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s8, 0, s8 -; SI-NEXT: s_cselect_b32 s9, s11, s9 -; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cselect_b32 s9, s10, s9 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s8, s14, s8 ; SI-NEXT: s_cselect_b32 s9, s15, s9 ; SI-NEXT: v_mov_b32_e32 v0, s8 ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] -; SI-NEXT: v_bfi_b32 v5, s3, v5, v6 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: v_bfi_b32 v5, s18, v5, v6 +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[4:5] -; SI-NEXT: s_and_b64 s[4:5], vcc, exec -; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; SI-NEXT: v_mov_b32_e32 v8, s4 -; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 -; SI-NEXT: s_add_i32 s10, s4, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s10 +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 +; SI-NEXT: v_mov_b32_e32 v8, s3 +; SI-NEXT: s_bfe_u32 s3, s13, 0xb0014 +; SI-NEXT: s_addk_i32 s3, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[6:7], s3 ; SI-NEXT: s_andn2_b64 s[4:5], s[12:13], s[4:5] ; SI-NEXT: s_and_b32 s6, s13, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s10, 0 +; SI-NEXT: s_cmp_lt_i32 s3, 0 ; SI-NEXT: s_cselect_b32 s4, 0, s4 ; SI-NEXT: s_cselect_b32 s5, s6, s5 -; SI-NEXT: s_cmp_gt_i32 s10, 51 +; SI-NEXT: s_cmp_gt_i32 s3, 51 ; SI-NEXT: s_cselect_b32 s5, s13, s5 ; SI-NEXT: s_cselect_b32 s4, s12, s4 ; SI-NEXT: v_mov_b32_e32 v6, s5 ; SI-NEXT: v_mov_b32_e32 v5, s4 ; SI-NEXT: v_add_f64 v[6:7], s[12:13], -v[5:6] ; SI-NEXT: v_mov_b32_e32 v9, s15 -; SI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] -; SI-NEXT: v_bfi_b32 v5, s3, v8, v9 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 +; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], 
|v[6:7]|, 0.5 +; SI-NEXT: v_bfi_b32 v5, s18, v8, v9 +; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_cselect_b32 s3, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[4:5] -; SI-NEXT: v_mov_b32_e32 v5, s6 +; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: v_mov_b32_e32 v8, s13 -; SI-NEXT: v_bfi_b32 v5, s3, v5, v8 +; SI-NEXT: v_bfi_b32 v5, s18, v5, v8 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[4:5] ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) @@ -354,35 +343,31 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in) ; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] ; CI-NEXT: v_mov_b32_e32 v5, s11 -; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 ; CI-NEXT: v_add_f64 v[2:3], s[8:9], -v[6:7] -; CI-NEXT: s_and_b64 s[4:5], vcc, exec -; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] ; CI-NEXT: v_mov_b32_e32 v8, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[2:3]|, 0.5 ; CI-NEXT: v_bfi_b32 v5, s2, v8, v5 -; CI-NEXT: s_and_b64 s[4:5], vcc, exec ; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] -; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] +; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 +; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] ; CI-NEXT: v_mov_b32_e32 v5, s4 ; CI-NEXT: v_mov_b32_e32 v10, s9 -; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] ; CI-NEXT: v_bfi_b32 v5, s2, v5, v10 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[0:1]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[10:11], s[12:13] -; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] ; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_add_f64 v[6:7], s[12:13], -v[10:11] -; 
CI-NEXT: s_and_b64 s[4:5], vcc, exec -; CI-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[6:7] ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_mov_b32_e32 v5, s4 +; CI-NEXT: v_cmp_ge_f64_e64 s[4:5], |v[6:7]|, 0.5 ; CI-NEXT: v_mov_b32_e32 v12, s15 -; CI-NEXT: s_and_b64 s[4:5], vcc, exec +; CI-NEXT: s_and_b64 s[4:5], s[4:5], exec ; CI-NEXT: v_bfi_b32 v5, s2, v5, v12 ; CI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; CI-NEXT: v_add_f64 v[6:7], v[8:9], v[4:5] @@ -423,10 +408,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s25 ; SI-NEXT: v_add_f64 v[0:1], s[10:11], -v[0:1] ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: v_cmp_ge_f64_e64 s[26:27], |v[0:1]|, 0.5 ; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], s[26:27], exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_bfe_u32 s4, s9, 0xb0014 @@ -446,10 +430,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_add_f64 v[0:1], s[8:9], -v[0:1] ; SI-NEXT: v_mov_b32_e32 v5, s9 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[2:3], s[24:25], v[8:9] -; SI-NEXT: s_and_b64 s[10:11], vcc, exec +; SI-NEXT: s_and_b64 s[10:11], s[10:11], exec ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v4, s8 ; SI-NEXT: s_bfe_u32 s8, s15, 0xb0014 @@ -467,10 +450,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v1, s9 ; SI-NEXT: v_add_f64 v[0:1], s[14:15], -v[0:1] ; SI-NEXT: v_bfi_b32 v9, s3, v4, v5 -; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, 
v[0:1] +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[0:1]|, 0.5 ; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v6, s4 ; SI-NEXT: s_bfe_u32 s4, s13, 0xb0014 @@ -488,11 +470,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: v_add_f64 v[4:5], s[12:13], -v[4:5] ; SI-NEXT: v_mov_b32_e32 v7, s15 -; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 ; SI-NEXT: v_bfi_b32 v9, s3, v6, v7 ; SI-NEXT: v_add_f64 v[6:7], s[8:9], v[8:9] -; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: s_and_b64 s[8:9], s[10:11], exec ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: s_bfe_u32 s8, s19, 0xb0014 @@ -510,11 +491,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: v_add_f64 v[4:5], s[18:19], -v[4:5] ; SI-NEXT: v_mov_b32_e32 v10, s13 -; SI-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[4:5]|, 0.5 ; SI-NEXT: v_bfi_b32 v9, s3, v9, v10 ; SI-NEXT: v_add_f64 v[4:5], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v12, s4 ; SI-NEXT: s_bfe_u32 s4, s17, 0xb0014 @@ -532,11 +512,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: v_add_f64 v[10:11], s[16:17], -v[9:10] ; SI-NEXT: v_mov_b32_e32 v13, s19 -; SI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11] +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5 ; SI-NEXT: v_bfi_b32 v9, s3, v12, v13 ; SI-NEXT: 
v_add_f64 v[12:13], s[8:9], v[8:9] -; SI-NEXT: s_and_b64 s[8:9], vcc, exec +; SI-NEXT: s_and_b64 s[8:9], s[10:11], exec ; SI-NEXT: s_cselect_b32 s8, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v14, s8 ; SI-NEXT: s_bfe_u32 s8, s23, 0xb0014 @@ -554,11 +533,10 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v9, s8 ; SI-NEXT: v_add_f64 v[10:11], s[22:23], -v[9:10] ; SI-NEXT: v_mov_b32_e32 v15, s17 -; SI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11] +; SI-NEXT: v_cmp_ge_f64_e64 s[10:11], |v[10:11]|, 0.5 ; SI-NEXT: v_bfi_b32 v9, s3, v14, v15 ; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[8:9] -; SI-NEXT: s_and_b64 s[4:5], vcc, exec +; SI-NEXT: s_and_b64 s[4:5], s[10:11], exec ; SI-NEXT: s_cselect_b32 s4, 0x3ff00000, 0 ; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: s_bfe_u32 s4, s21, 0xb0014 @@ -576,10 +554,9 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; SI-NEXT: v_mov_b32_e32 v14, s4 ; SI-NEXT: v_add_f64 v[14:15], s[20:21], -v[14:15] ; SI-NEXT: v_mov_b32_e32 v16, s23 -; SI-NEXT: v_and_b32_e32 v15, 0x7fffffff, v15 -; SI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[14:15] +; SI-NEXT: v_cmp_ge_f64_e64 s[6:7], |v[14:15]|, 0.5 ; SI-NEXT: v_bfi_b32 v9, s3, v9, v16 -; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_and_b64 s[6:7], s[6:7], exec ; SI-NEXT: s_cselect_b32 s6, 0x3ff00000, 0 ; SI-NEXT: v_add_f64 v[16:17], s[8:9], v[8:9] ; SI-NEXT: v_mov_b32_e32 v9, s6 @@ -598,95 +575,87 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in) ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x19 ; CI-NEXT: s_brev_b32 s6, -2 -; CI-NEXT: v_mov_b32_e32 v4, 0 +; CI-NEXT: v_mov_b32_e32 v12, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_trunc_f64_e32 v[0:1], s[10:11] -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[8:9] +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[8:9] ; CI-NEXT: v_add_f64 v[2:3], s[10:11], -v[0:1] -; CI-NEXT: v_add_f64 v[8:9], s[8:9], 
-v[6:7] -; CI-NEXT: v_and_b32_e32 v3, 0x7fffffff, v3 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[2:3] -; CI-NEXT: v_and_b32_e32 v9, 0x7fffffff, v9 -; CI-NEXT: s_and_b64 s[2:3], vcc, exec -; CI-NEXT: v_cmp_le_f64_e64 s[0:1], 0.5, v[8:9] -; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s11 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_trunc_f64_e32 v[8:9], s[14:15] -; CI-NEXT: v_bfi_b32 v5, s6, v2, v5 +; CI-NEXT: v_add_f64 v[6:7], s[8:9], -v[4:5] +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[2:3]|, 0.5 +; CI-NEXT: v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5 ; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[4:5] +; CI-NEXT: s_cselect_b32 s7, 0x3ff00000, 0 +; CI-NEXT: v_mov_b32_e32 v8, s11 +; CI-NEXT: s_and_b64 s[0:1], s[2:3], exec +; CI-NEXT: v_mov_b32_e32 v2, s7 +; CI-NEXT: v_trunc_f64_e32 v[6:7], s[14:15] +; CI-NEXT: v_bfi_b32 v13, s6, v2, v8 ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[8:9] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v10, s9 -; CI-NEXT: v_bfi_b32 v5, s6, v5, v10 -; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[0:1] -; CI-NEXT: v_add_f64 v[0:1], v[6:7], v[4:5] -; CI-NEXT: v_trunc_f64_e32 v[6:7], s[12:13] -; CI-NEXT: s_and_b64 s[0:1], vcc, exec -; CI-NEXT: v_add_f64 v[10:11], s[12:13], -v[6:7] +; CI-NEXT: v_add_f64 v[2:3], v[0:1], v[12:13] +; CI-NEXT: v_mov_b32_e32 v8, s0 +; CI-NEXT: v_mov_b32_e32 v9, s9 +; CI-NEXT: v_add_f64 v[0:1], s[14:15], -v[6:7] +; CI-NEXT: v_bfi_b32 v13, s6, v8, v9 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[0:1]|, 0.5 +; CI-NEXT: v_add_f64 v[0:1], v[4:5], v[12:13] +; CI-NEXT: v_trunc_f64_e32 v[4:5], s[12:13] +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_add_f64 v[8:9], s[12:13], -v[4:5] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[10:11] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v12, 
s15 -; CI-NEXT: s_and_b64 s[0:1], vcc, exec -; CI-NEXT: v_trunc_f64_e32 v[10:11], s[18:19] -; CI-NEXT: v_bfi_b32 v5, s6, v5, v12 +; CI-NEXT: v_mov_b32_e32 v10, s0 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[8:9]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[8:9], s[18:19] +; CI-NEXT: v_mov_b32_e32 v11, s15 +; CI-NEXT: v_bfi_b32 v13, s6, v10, v11 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_add_f64 v[10:11], s[18:19], -v[8:9] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[8:9], v[8:9], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[12:13] +; CI-NEXT: v_mov_b32_e32 v13, s0 ; CI-NEXT: v_mov_b32_e32 v14, s13 -; CI-NEXT: v_add_f64 v[12:13], s[18:19], -v[10:11] -; CI-NEXT: v_bfi_b32 v5, s6, v5, v14 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 +; CI-NEXT: v_bfi_b32 v13, s6, v13, v14 ; CI-NEXT: v_trunc_f64_e32 v[14:15], s[16:17] -; CI-NEXT: v_and_b32_e32 v13, 0x7fffffff, v13 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[12:13] -; CI-NEXT: v_add_f64 v[12:13], s[16:17], -v[14:15] -; CI-NEXT: s_and_b64 s[0:1], vcc, exec -; CI-NEXT: v_and_b32_e32 v13, 0x7fffffff, v13 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[12:13] +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_add_f64 v[6:7], v[6:7], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_add_f64 v[10:11], s[16:17], -v[14:15] +; CI-NEXT: v_add_f64 v[4:5], v[4:5], v[12:13] +; CI-NEXT: v_mov_b32_e32 v13, s0 ; CI-NEXT: v_mov_b32_e32 v16, s19 -; CI-NEXT: s_and_b64 s[0:1], vcc, exec -; CI-NEXT: v_bfi_b32 v5, s6, v5, v16 -; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 +; CI-NEXT: v_bfi_b32 v13, s6, v13, v16 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[10:11]|, 0.5 ; CI-NEXT: v_trunc_f64_e32 v[16:17], s[22:23] -; CI-NEXT: v_add_f64 v[12:13], v[10:11], v[4:5] -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: v_mov_b32_e32 v10, s17 -; CI-NEXT: v_bfi_b32 v5, s6, v5, v10 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: v_add_f64 v[18:19], 
s[22:23], -v[16:17] -; CI-NEXT: v_add_f64 v[10:11], v[14:15], v[4:5] -; CI-NEXT: v_trunc_f64_e32 v[14:15], s[20:21] -; CI-NEXT: v_and_b32_e32 v19, 0x7fffffff, v19 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[18:19] -; CI-NEXT: v_add_f64 v[18:19], s[20:21], -v[14:15] -; CI-NEXT: s_and_b64 s[0:1], vcc, exec -; CI-NEXT: v_and_b32_e32 v19, 0x7fffffff, v19 -; CI-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[18:19] ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_mov_b32_e32 v5, s0 -; CI-NEXT: s_and_b64 s[0:1], vcc, exec -; CI-NEXT: v_mov_b32_e32 v18, s23 +; CI-NEXT: v_add_f64 v[10:11], v[8:9], v[12:13] +; CI-NEXT: v_mov_b32_e32 v8, s0 +; CI-NEXT: v_mov_b32_e32 v9, s17 +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[18:19]|, 0.5 +; CI-NEXT: v_trunc_f64_e32 v[18:19], s[20:21] +; CI-NEXT: v_bfi_b32 v13, s6, v8, v9 +; CI-NEXT: v_add_f64 v[8:9], v[14:15], v[12:13] +; CI-NEXT: v_add_f64 v[13:14], s[20:21], -v[18:19] +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CI-NEXT: v_cmp_ge_f64_e64 s[0:1], |v[13:14]|, 0.5 +; CI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0 +; CI-NEXT: s_and_b64 s[0:1], s[0:1], exec ; CI-NEXT: s_cselect_b32 s0, 0x3ff00000, 0 -; CI-NEXT: v_bfi_b32 v5, s6, v5, v18 -; CI-NEXT: v_mov_b32_e32 v18, s0 +; CI-NEXT: v_mov_b32_e32 v13, s2 +; CI-NEXT: v_mov_b32_e32 v14, s23 +; CI-NEXT: v_mov_b32_e32 v20, s0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; CI-NEXT: v_mov_b32_e32 v19, s21 -; CI-NEXT: v_add_f64 v[16:17], v[16:17], v[4:5] -; CI-NEXT: v_bfi_b32 v5, s6, v18, v19 -; CI-NEXT: v_add_f64 v[14:15], v[14:15], v[4:5] +; CI-NEXT: v_bfi_b32 v13, s6, v13, v14 +; CI-NEXT: v_mov_b32_e32 v21, s21 +; CI-NEXT: v_add_f64 v[14:15], v[16:17], v[12:13] +; CI-NEXT: v_bfi_b32 v13, s6, v20, v21 +; CI-NEXT: v_add_f64 v[12:13], v[18:19], v[12:13] ; CI-NEXT: s_mov_b32 s3, 0xf000 ; CI-NEXT: s_mov_b32 s2, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:48 -; CI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32 -; CI-NEXT: 
buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16 +; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48 +; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32 +; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 ; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; CI-NEXT: s_endpgm %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll index 5e2412742ec69..8036e32f90eb0 100644 --- a/llvm/test/CodeGen/AMDGPU/lround.ll +++ b/llvm/test/CodeGen/AMDGPU/lround.ll @@ -101,8 +101,7 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x3ff00000 -; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] +; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v0, v1 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 @@ -130,9 +129,8 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, s4 ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v0, v1 ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -158,10 +156,9 @@ define i32 @intrinsic_lround_i32_f64(double %arg) { ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, vcc_lo +; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 0x3ff00000, s0 ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v0, v1 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -355,8 +352,7 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] +; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v4, v1 ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -401,9 +397,8 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 +; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4 ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] @@ -436,12 +431,12 @@ define i64 @intrinsic_lround_i64_f64(double %arg) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v5, 0x7fffffff, v5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0 ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -648,8 +643,7 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX9-SDAG-NEXT: s_brev_b32 s4, -2 ; GFX9-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX9-SDAG-NEXT: v_cmp_le_f64_e32 vcc, 0.5, v[4:5] +; GFX9-SDAG-NEXT: v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5 ; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v6, vcc ; GFX9-SDAG-NEXT: v_bfi_b32 v1, s4, v4, v1 ; GFX9-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] @@ -694,9 +688,8 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] ; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SDAG-NEXT: v_and_b32_e32 v5, 0x7fffffff, v5 -; GFX10-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo +; GFX10-SDAG-NEXT: v_cmp_ge_f64_e64 s4, |v[4:5]|, 0.5 +; 
GFX10-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s4 ; GFX10-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX10-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] ; GFX10-SDAG-NEXT: v_trunc_f64_e32 v[0:1], v[0:1] @@ -729,12 +722,12 @@ define i64 @intrinsic_llround_i64_f64(double %arg) { ; GFX11-SDAG: ; %bb.0: ; %entry ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-NEXT: v_trunc_f64_e32 v[2:3], v[0:1] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_add_f64 v[4:5], v[0:1], -v[2:3] -; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v5, 0x7fffffff, v5 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cmp_le_f64_e32 vcc_lo, 0.5, v[4:5] -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, vcc_lo +; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-SDAG-NEXT: v_cmp_ge_f64_e64 s0, |v[4:5]|, 0.5 +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 0x3ff00000, s0 ; GFX11-SDAG-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_add_f64 v[0:1], v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll index 3b9462cd690d5..59a1fe041bf90 100644 --- a/llvm/test/CodeGen/AMDGPU/roundeven.ll +++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll @@ -1125,18 +1125,16 @@ define double @v_roundeven_f64(double %x) { ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG_GFX6-NEXT: s_brev_b32 s6, -2 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0x43300000 -; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v4, v1 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0x43300000 +; SDAG_GFX6-NEXT: 
v_bfi_b32 v3, s6, v2, v1 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0 ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1 -; SDAG_GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] -; SDAG_GFX6-NEXT: v_and_b32_e32 v3, 0x7fffffff, v1 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, v0 +; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[0:1], v[2:3] ; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff -; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[6:7], -v[4:5] -; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[2:3] -; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3] +; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] +; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] ; ; SDAG_GFX7-LABEL: v_roundeven_f64: @@ -1217,10 +1215,9 @@ define double @v_roundeven_f64_fneg(double %x) { ; SDAG_GFX6-NEXT: v_mov_b32_e32 v2, 0 ; SDAG_GFX6-NEXT: v_add_f64 v[4:5], -v[0:1], v[2:3] ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1 -; SDAG_GFX6-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff ; SDAG_GFX6-NEXT: v_add_f64 v[2:3], v[4:5], -v[2:3] -; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1] +; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] @@ -1308,24 +1305,20 @@ define <2 x double> @v_roundeven_v2f64(<2 x double> %x) { ; SDAG_GFX6: ; %bb.0: ; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG_GFX6-NEXT: s_brev_b32 s6, -2 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v9, 0x43300000 -; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v9, v1 +; SDAG_GFX6-NEXT: v_mov_b32_e32 v8, 0x43300000 +; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v8, v1 ; SDAG_GFX6-NEXT: v_mov_b32_e32 v4, 0 ; SDAG_GFX6-NEXT: v_add_f64 v[6:7], v[0:1], v[4:5] ; SDAG_GFX6-NEXT: s_mov_b32 s4, -1 -; SDAG_GFX6-NEXT: v_add_f64 v[5:6], v[6:7], 
-v[4:5] -; SDAG_GFX6-NEXT: v_and_b32_e32 v8, 0x7fffffff, v1 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v7, v0 ; SDAG_GFX6-NEXT: s_mov_b32 s5, 0x432fffff -; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[7:8] +; SDAG_GFX6-NEXT: v_add_f64 v[5:6], v[6:7], -v[4:5] +; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[0:1]|, s[4:5] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc -; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v9, v3 +; SDAG_GFX6-NEXT: v_bfi_b32 v5, s6, v8, v3 ; SDAG_GFX6-NEXT: v_add_f64 v[7:8], v[2:3], v[4:5] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc ; SDAG_GFX6-NEXT: v_add_f64 v[4:5], v[7:8], -v[4:5] -; SDAG_GFX6-NEXT: v_and_b32_e32 v7, 0x7fffffff, v3 -; SDAG_GFX6-NEXT: v_mov_b32_e32 v6, v2 -; SDAG_GFX6-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[6:7] +; SDAG_GFX6-NEXT: v_cmp_gt_f64_e64 vcc, |v[2:3]|, s[4:5] ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SDAG_GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31] From 06528070fce04d580821b3448f1d5321e0a9a97b Mon Sep 17 00:00:00 2001 From: Vikram Hegde <115221833+vikramRH@users.noreply.github.com> Date: Fri, 18 Jul 2025 11:58:01 +0530 Subject: [PATCH 292/813] [CodeGen][NPM] Clear MachineFunctions without using PA (#148113) same as https://github.com/llvm/llvm-project/pull/139517 This replaces the InvalidateAnalysisPass pass. There are no cross-function analysis requirements right now, so clearing all analyses works for the last pass in the pipeline. Having the InvalidateAnalysisPass() is causing a problem with ModuleToCGSCCPassAdaptor by deleting machine functions for other functions and ending up with exactly one correctly compiled MF, with the rest being vanished. This is because ModuleToCGSCCPAdaptor propagates PassPA (received from the CGSCCToFunctionPassAdaptor that runs the actual codegen pipeline on MFs) to the next SCC. That causes MFA invalidation on functions in the next SCC. For us, PassPA happens to be returned from invalidate which abandons the MachineFunctionAnalysis. 
So while the first function runs through the pipeline normally, invalidate also deletes the functions in the next SCC before its pipeline is run. (this seems to be the intended mechanism of the CG adaptor to allow cross-SCC invalidations. Co-authored-by : Oke, Akshat <[Akshat.Oke@amd.com](mailto:Akshat.Oke@amd.com)> --- .../llvm/CodeGen/MachineFunctionAnalysis.h | 5 +++++ llvm/include/llvm/IR/PassManager.h | 16 ++++++++++++++++ llvm/include/llvm/Passes/CodeGenPassBuilder.h | 2 +- llvm/lib/CodeGen/MachineFunctionAnalysis.cpp | 6 ++++++ llvm/lib/Passes/PassRegistry.def | 1 + llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 6 +++--- llvm/test/tools/llc/new-pm/start-stop.ll | 2 +- 7 files changed, 33 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h b/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h index 98a60c987bbe3..1d954cf60c68c 100644 --- a/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h +++ b/llvm/include/llvm/CodeGen/MachineFunctionAnalysis.h @@ -46,6 +46,11 @@ class MachineFunctionAnalysis LLVM_ABI Result run(Function &F, FunctionAnalysisManager &FAM); }; +class FreeMachineFunctionPass : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + } // namespace llvm #endif // LLVM_CODEGEN_MachineFunctionAnalysis diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h index 4f44ae56eb3c7..ea8226c6e17ba 100644 --- a/llvm/include/llvm/IR/PassManager.h +++ b/llvm/include/llvm/IR/PassManager.h @@ -491,6 +491,22 @@ template class AnalysisManager { /// invalidate them, unless they are preserved by the PreservedAnalyses set. void invalidate(IRUnitT &IR, const PreservedAnalyses &PA); + /// Directly clear a cached analysis for an IR unit. + /// + /// Using invalidate() over this is preferred unless you are really + /// sure you want to *only* clear this analysis without asking if it is + /// invalid. 
+ template void clearAnalysis(IRUnitT &IR) { + AnalysisResultListT &ResultsList = AnalysisResultLists[&IR]; + AnalysisKey *ID = AnalysisT::ID(); + + auto I = + llvm::find_if(ResultsList, [&ID](auto &E) { return E.first == ID; }); + assert(I != ResultsList.end() && "Analysis must be available"); + ResultsList.erase(I); + AnalysisResults.erase({ID, &IR}); + } + private: /// Look up a registered analysis pass. PassConceptT &lookUpPass(AnalysisKey *ID) { diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h index a8176ebb776cf..b0360f1903c0e 100644 --- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -281,7 +281,7 @@ template class CodeGenPassBuilder { FunctionPassManager FPM; FPM.addPass(createFunctionToMachineFunctionPassAdaptor(std::move(MFPM))); - FPM.addPass(InvalidateAnalysisPass()); + FPM.addPass(FreeMachineFunctionPass()); if (this->PB.AddInCGSCCOrder) { MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor( createCGSCCToFunctionPassAdaptor(std::move(FPM)))); diff --git a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp index e7a4d6d61e211..116a919585d70 100644 --- a/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp @@ -45,3 +45,9 @@ MachineFunctionAnalysis::run(Function &F, FunctionAnalysisManager &FAM) { return Result(std::move(MF)); } + +PreservedAnalyses FreeMachineFunctionPass::run(Function &F, + FunctionAnalysisManager &FAM) { + FAM.clearAnalysis(F); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 9a943155aa19f..caa78b613b901 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -434,6 +434,7 @@ FUNCTION_PASS("extra-vector-passes", FUNCTION_PASS("fix-irreducible", FixIrreduciblePass()) FUNCTION_PASS("flatten-cfg", FlattenCFGPass()) 
FUNCTION_PASS("float2int", Float2IntPass()) +FUNCTION_PASS("free-machine-function", FreeMachineFunctionPass()) FUNCTION_PASS("gc-lowering", GCLoweringPass()) FUNCTION_PASS("guard-widening", GuardWideningPass()) FUNCTION_PASS("gvn-hoist", GVNHoistPass()) diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 243cb95d24e4e..50fa7ac2a19aa 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -8,11 +8,11 @@ ; RUN: | FileCheck -check-prefix=GCN-O3 %s -; GCN-O0: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,mac
hine-sanmd,stack-frame-layout,verify),invalidate)) +; GCN-O0: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu
-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) +; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu
-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-par
tial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),invalidate)) +; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-par
tial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/tools/llc/new-pm/start-stop.ll b/llvm/test/tools/llc/new-pm/start-stop.ll index 13d9663221115..e4c454900fd38 100644 --- a/llvm/test/tools/llc/new-pm/start-stop.ll +++ b/llvm/test/tools/llc/new-pm/start-stop.ll @@ -2,4 +2,4 @@ ; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -o /dev/null %s | FileCheck --match-full-lines %s --check-prefix=OBJ ; NULL: require,require,require,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify) -; OBJ: require,require,require,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify),PrintMIRPreparePass,function(machine-function(print),invalidate) +; OBJ: require,require,require,function(verify,mergeicmps,expand-memcmp,gc-lowering,verify),PrintMIRPreparePass,function(machine-function(print),free-machine-function) From 
1614c3b3c74b50dc6d5a7f359897bca221de931b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Jul 2025 15:31:50 +0900 Subject: [PATCH 293/813] AMDGPU: Always use AV spill pseudos on targets with AGPRs (#149099) This increases allocator freedom to inflate register classes to the AV class, we don't need to introduce a new restriction by basing the opcode on the current virtual register class. Ideally we would avoid this if we don't have any allocatable AGPRs for the function, but it probably doesn't make much difference in the end result if they are excluded from the final allocation order. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 110 +++------------- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 10 ++ .../AMDGPU/buffer-fat-pointers-memcpy.ll | 105 +++++---------- .../CodeGen/AMDGPU/inflate-av-remat-imm.mir | 38 +++--- ...-reg-class-snippet-copy-use-after-free.mir | 2 +- .../AMDGPU/regalloc-undef-copy-fold.mir | 22 ++-- llvm/test/CodeGen/AMDGPU/spill-agpr.mir | 120 +++++++++--------- ...-last-chance-recoloring-alloc-succeeds.mir | 22 ++-- 8 files changed, 159 insertions(+), 270 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9da8a1c8e8fb6..b1116974642c9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1625,41 +1625,6 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { } } -static unsigned getAGPRSpillSaveOpcode(unsigned Size) { - switch (Size) { - case 4: - return AMDGPU::SI_SPILL_A32_SAVE; - case 8: - return AMDGPU::SI_SPILL_A64_SAVE; - case 12: - return AMDGPU::SI_SPILL_A96_SAVE; - case 16: - return AMDGPU::SI_SPILL_A128_SAVE; - case 20: - return AMDGPU::SI_SPILL_A160_SAVE; - case 24: - return AMDGPU::SI_SPILL_A192_SAVE; - case 28: - return AMDGPU::SI_SPILL_A224_SAVE; - case 32: - return AMDGPU::SI_SPILL_A256_SAVE; - case 36: - return AMDGPU::SI_SPILL_A288_SAVE; - case 40: - return AMDGPU::SI_SPILL_A320_SAVE; - case 44: - return 
AMDGPU::SI_SPILL_A352_SAVE; - case 48: - return AMDGPU::SI_SPILL_A384_SAVE; - case 64: - return AMDGPU::SI_SPILL_A512_SAVE; - case 128: - return AMDGPU::SI_SPILL_A1024_SAVE; - default: - llvm_unreachable("unknown register size"); - } -} - static unsigned getAVSpillSaveOpcode(unsigned Size) { switch (Size) { case 4: @@ -1707,22 +1672,20 @@ static unsigned getWWMRegSpillSaveOpcode(unsigned Size, return AMDGPU::SI_SPILL_WWM_V32_SAVE; } -static unsigned getVectorRegSpillSaveOpcode(Register Reg, - const TargetRegisterClass *RC, - unsigned Size, - const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); +unsigned SIInstrInfo::getVectorRegSpillSaveOpcode( + Register Reg, const TargetRegisterClass *RC, unsigned Size, + const SIMachineFunctionInfo &MFI) const { + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); // Choose the right opcode if spilling a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass); - if (IsVectorSuperClass) + // TODO: Check if AGPRs are available + if (ST.hasMAIInsts()) return getAVSpillSaveOpcode(Size); - return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size) - : getVGPRSpillSaveOpcode(Size); + return getVGPRSpillSaveOpcode(Size); } void SIInstrInfo::storeRegToStackSlot( @@ -1770,8 +1733,8 @@ void SIInstrInfo::storeRegToStackSlot( return; } - unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC, - SpillSize, RI, *MFI); + unsigned Opcode = + getVectorRegSpillSaveOpcode(VReg ? 
VReg : SrcReg, RC, SpillSize, *MFI); MFI->setHasSpilledVGPRs(); BuildMI(MBB, MI, DL, get(Opcode)) @@ -1854,41 +1817,6 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { } } -static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { - switch (Size) { - case 4: - return AMDGPU::SI_SPILL_A32_RESTORE; - case 8: - return AMDGPU::SI_SPILL_A64_RESTORE; - case 12: - return AMDGPU::SI_SPILL_A96_RESTORE; - case 16: - return AMDGPU::SI_SPILL_A128_RESTORE; - case 20: - return AMDGPU::SI_SPILL_A160_RESTORE; - case 24: - return AMDGPU::SI_SPILL_A192_RESTORE; - case 28: - return AMDGPU::SI_SPILL_A224_RESTORE; - case 32: - return AMDGPU::SI_SPILL_A256_RESTORE; - case 36: - return AMDGPU::SI_SPILL_A288_RESTORE; - case 40: - return AMDGPU::SI_SPILL_A320_RESTORE; - case 44: - return AMDGPU::SI_SPILL_A352_RESTORE; - case 48: - return AMDGPU::SI_SPILL_A384_RESTORE; - case 64: - return AMDGPU::SI_SPILL_A512_RESTORE; - case 128: - return AMDGPU::SI_SPILL_A1024_RESTORE; - default: - llvm_unreachable("unknown register size"); - } -} - static unsigned getAVSpillRestoreOpcode(unsigned Size) { switch (Size) { case 4: @@ -1930,27 +1858,27 @@ static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, if (Size != 4) llvm_unreachable("unknown wwm register spill size"); - if (IsVectorSuperClass) + if (IsVectorSuperClass) // TODO: Always use this if there are AGPRs return AMDGPU::SI_SPILL_WWM_AV32_RESTORE; return AMDGPU::SI_SPILL_WWM_V32_RESTORE; } -static unsigned -getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, - unsigned Size, const SIRegisterInfo &TRI, - const SIMachineFunctionInfo &MFI) { - bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); +unsigned SIInstrInfo::getVectorRegSpillRestoreOpcode( + Register Reg, const TargetRegisterClass *RC, unsigned Size, + const SIMachineFunctionInfo &MFI) const { + bool IsVectorSuperClass = RI.isVectorSuperClass(RC); // Choose the right opcode if restoring a WWM register. 
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass); - if (IsVectorSuperClass) + // TODO: Check if AGPRs are available + if (ST.hasMAIInsts()) return getAVSpillRestoreOpcode(Size); - return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) - : getVGPRSpillRestoreOpcode(Size); + assert(!RI.isAGPRClass(RC)); + return getVGPRSpillRestoreOpcode(Size); } void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, @@ -1998,7 +1926,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, } unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC, - SpillSize, RI, *MFI); + SpillSize, *MFI); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex) // vaddr .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 3a48e6579238e..2764ed3d3f0b1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -33,6 +33,7 @@ class LiveVariables; class MachineDominatorTree; class MachineRegisterInfo; class RegScavenger; +class SIMachineFunctionInfo; class TargetRegisterClass; class ScheduleHazardRecognizer; @@ -287,6 +288,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool getConstValDefinedInReg(const MachineInstr &MI, const Register Reg, int64_t &ImmVal) const override; + unsigned getVectorRegSpillSaveOpcode(Register Reg, + const TargetRegisterClass *RC, + unsigned Size, + const SIMachineFunctionInfo &MFI) const; + unsigned + getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, + unsigned Size, + const SIMachineFunctionInfo &MFI) const; + void storeRegToStackSlot( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll 
b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll index c69e12731e10d..3c991cfb7a1aa 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll @@ -444,14 +444,6 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32 -; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 -; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 -; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v62, s[8:11], 0 offen offset:48 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v62, s[8:11], 0 offen offset:64 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v62, s[8:11], 0 offen offset:80 @@ -464,20 +456,15 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v62, s[8:11], 0 offen offset:192 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: s_nop 0 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240 +; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1 +; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1 +; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0 +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) 
+; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v63, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2) -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v63, s[4:7], 0 offen offset:32 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v63, s[4:7], 0 offen offset:48 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v63, s[4:7], 0 offen offset:64 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v63, s[4:7], 0 offen offset:80 @@ -490,10 +477,8 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v63, s[4:7], 0 offen offset:192 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v63, s[4:7], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v63, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse +; GISEL-GFX942-NEXT: scratch_load_dwordx4 v[2:5], off, off ; 16-byte Folded Reload +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 
offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB0_1 ; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split @@ -822,14 +807,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[4:7], 0 offen ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[4:7], 0 offen offset:16 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:32 -; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0 -; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 -; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[4:7], 0 offen offset:48 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[4:7], 0 offen offset:64 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[4:7], 0 offen offset:80 @@ -842,20 +819,16 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[4:7], 0 offen offset:192 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[4:7], 0 offen offset:208 ; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[4:7], 0 offen offset:224 -; SDAG-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[4:7], 0 offen offset:240 -; SDAG-GFX942-NEXT: s_nop 0 +; SDAG-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[4:7], 0 offen offset:240 +; SDAG-GFX942-NEXT: v_add_u32_e32 v62, s8, v0 +; SDAG-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 +; SDAG-GFX942-NEXT: s_and_b64 vcc, exec, vcc +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) +; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse +; SDAG-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill ; 
SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[12:15], 0 offen offset:16 -; SDAG-GFX942-NEXT: s_waitcnt vmcnt(2) -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse -; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:32 +; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[12:15], 0 offen offset:32 ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[12:15], 0 offen offset:48 ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[12:15], 0 offen offset:64 ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[12:15], 0 offen offset:80 @@ -868,10 +841,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[12:15], 0 offen offset:192 ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[12:15], 0 offen offset:208 ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[12:15], 0 offen offset:224 -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse -; SDAG-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse +; SDAG-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload +; SDAG-GFX942-NEXT: s_waitcnt vmcnt(0) ; SDAG-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[12:15], 0 offen offset:240 ; SDAG-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 ; SDAG-GFX942-NEXT: ; %bb.2: 
; %memcpy-split @@ -993,16 +964,6 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v1, s[8:11], 0 offen ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v1, s[8:11], 0 offen offset:16 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:32 -; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0 -; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 -; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1 -; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1 -; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a0, v13 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a1, v12 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a2, v11 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a3, v10 ; Reload Reuse ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[14:17], v1, s[8:11], 0 offen offset:48 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[18:21], v1, s[8:11], 0 offen offset:64 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[22:25], v1, s[8:11], 0 offen offset:80 @@ -1015,20 +976,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[50:53], v1, s[8:11], 0 offen offset:192 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v1, s[8:11], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v1, s[8:11], 0 offen offset:224 -; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v1, s[8:11], 0 offen offset:240 -; GISEL-GFX942-NEXT: s_nop 0 +; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v1, s[8:11], 0 offen offset:240 +; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s12, v0 +; GISEL-GFX942-NEXT: v_add_co_u32_e32 v0, vcc, 0x100, v0 +; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], vcc, -1 +; GISEL-GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], -1 +; GISEL-GFX942-NEXT: s_and_b64 vcc, s[2:3], exec +; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0) 
+; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v63, a3 ; Reload Reuse +; GISEL-GFX942-NEXT: scratch_store_dwordx3 off, a[0:2], off ; 12-byte Folded Spill ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[6:9], v62, s[4:7], 0 offen offset:16 -; GISEL-GFX942-NEXT: s_waitcnt vmcnt(2) -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a4, v13 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a0 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a1 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a2 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a3 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a5, v12 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a6, v11 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_write_b32 a7, v10 ; Reload Reuse -; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:32 +; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[10:13], v62, s[4:7], 0 offen offset:32 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[14:17], v62, s[4:7], 0 offen offset:48 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[18:21], v62, s[4:7], 0 offen offset:64 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[22:25], v62, s[4:7], 0 offen offset:80 @@ -1041,10 +1000,8 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[50:53], v62, s[4:7], 0 offen offset:192 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[54:57], v62, s[4:7], 0 offen offset:208 ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[58:61], v62, s[4:7], 0 offen offset:224 -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v5, a4 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v4, a5 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v3, a6 ; Reload Reuse -; GISEL-GFX942-NEXT: v_accvgpr_read_b32 v2, a7 ; Reload Reuse +; GISEL-GFX942-NEXT: scratch_load_dwordx3 v[2:4], off, off ; 12-byte Folded Reload +; GISEL-GFX942-NEXT: s_waitcnt 
vmcnt(0) ; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v62, s[4:7], 0 offen offset:240 ; GISEL-GFX942-NEXT: s_cbranch_vccnz .LBB1_1 ; GISEL-GFX942-NEXT: ; %bb.2: ; %memcpy-split diff --git a/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir b/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir index 2cc25d88347ee..c34c9749d553a 100644 --- a/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/inflate-av-remat-imm.mir @@ -17,24 +17,22 @@ body: | liveins: $vgpr0, $sgpr4_sgpr5 ; CHECK-LABEL: name: av_mov_b32_split - ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5 + ; CHECK: liveins: $agpr3, $agpr4, $vgpr0, $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $agpr0 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec ; CHECK-NEXT: renamable $agpr1 = V_ACCVGPR_WRITE_B32_e64 1, implicit $exec ; CHECK-NEXT: renamable $agpr2 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - ; CHECK-NEXT: renamable $agpr0 = V_ACCVGPR_WRITE_B32_e64 3, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - ; CHECK-NEXT: renamable $agpr0 = V_ACCVGPR_WRITE_B32_e64 4, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 3, implicit $exec + ; CHECK-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 4, implicit $exec + ; CHECK-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr2 - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 - ; CHECK-NEXT: $agpr0 = 
V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 + ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr0 %0:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec %1:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec %2:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec @@ -68,29 +66,25 @@ body: | liveins: $vgpr0, $sgpr4_sgpr5 ; CHECK-LABEL: name: v_mov_b32_split - ; CHECK: liveins: $vgpr0, $vgpr3, $vgpr4, $vgpr5, $sgpr4_sgpr5 + ; CHECK: liveins: $agpr3, $agpr4, $vgpr0, $sgpr4_sgpr5 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 1, implicit $exec ; CHECK-NEXT: renamable $vgpr2 = V_MOV_B32_e32 2, implicit $exec ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; CHECK-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; CHECK-NEXT: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 3, implicit $exec - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec + ; CHECK-NEXT: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 4, implicit $exec - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec + ; CHECK-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed 
$vgpr0, implicit $exec ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr2 - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 - ; CHECK-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec - ; CHECK-NEXT: S_NOP 0, implicit killed renamable $agpr0 + ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec + ; CHECK-NEXT: S_NOP 0, implicit killed renamable $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 1, implicit $exec %2:vgpr_32 = V_MOV_B32_e32 2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir index c1e0d0716acae..11de6c8d52d59 100644 --- a/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir +++ b/llvm/test/CodeGen/AMDGPU/inflated-reg-class-snippet-copy-use-after-free.mir @@ -27,7 +27,7 @@ # CHECK-LABEL: name: inflated_reg_class_copy_use_after_free # CHECK: S_NOP 0, implicit-def [[ORIG_REG:%[0-9]+]].sub0_sub1_sub2_sub3 # CHECK-NEXT: SI_SPILL_AV512_SAVE [[ORIG_REG]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) -# CHECK-NEXT: [[RESTORE0:%[0-9]+]]:vreg_512_align2 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) +# CHECK-NEXT: [[RESTORE0:%[0-9]+]]:vreg_512_align2 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) # CHECK-NEXT: early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = 
V_MFMA_F32_16X16X1F32_vgprcd_e64 undef %3:vgpr_32, undef %3:vgpr_32, [[RESTORE0]], 0, 0, 0, implicit $mode, implicit $exec, implicit $mode, implicit $exec # CHECK-NEXT: undef [[SPLIT0:%[0-9]+]].sub2_sub3:av_512_align2 = COPY undef $vgpr2_vgpr3 { # CHECK-NEXT: internal [[SPLIT0]].sub0:av_512_align2 = COPY undef $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir b/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir index b416c96d74d98..d27b4eaff1ed9 100644 --- a/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/regalloc-undef-copy-fold.mir @@ -46,20 +46,20 @@ body: | ; CHECK-NEXT: SI_SPILL_AV256_SAVE %1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.3, align 4, addrspace 5) ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_512 = COPY %10 ; CHECK-NEXT: SI_SPILL_V512_SAVE [[COPY]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vreg_512 = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; CHECK-NEXT: SI_SPILL_V512_SAVE [[COPY1]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:av_512 = COPY $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; CHECK-NEXT: SI_SPILL_AV512_SAVE [[COPY1]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.6, align 4, addrspace 5) ; CHECK-NEXT: INLINEASM &"; clobber", 1 /* sideeffect attdialect */, 10 /* regdef */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 - ; CHECK-NEXT: [[SI_SPILL_V512_RESTORE:%[0-9]+]]:vreg_512 = SI_SPILL_V512_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s512) from 
%stack.6, align 4, addrspace 5) - ; CHECK-NEXT: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = COPY [[SI_SPILL_V512_RESTORE]] - ; CHECK-NEXT: [[SI_SPILL_V512_RESTORE1:%[0-9]+]]:vreg_512 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V512_SAVE [[SI_SPILL_V512_RESTORE1]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_AV256_RESTORE:%[0-9]+]]:vreg_256 = SI_SPILL_AV256_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.3, align 4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V256_SAVE [[SI_SPILL_AV256_RESTORE]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV512_RESTORE:%[0-9]+]]:av_512 = SI_SPILL_AV512_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.6, align 4, addrspace 5) + ; CHECK-NEXT: $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 = COPY [[SI_SPILL_AV512_RESTORE]] + ; CHECK-NEXT: [[SI_SPILL_V512_RESTORE:%[0-9]+]]:vreg_512 = SI_SPILL_V512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV512_SAVE [[SI_SPILL_V512_RESTORE]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV256_RESTORE:%[0-9]+]]:av_256 = SI_SPILL_AV256_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.3, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV256_SAVE [[SI_SPILL_AV256_RESTORE]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.5, align 4, addrspace 5) ; CHECK-NEXT: [[SI_SPILL_AV160_RESTORE:%[0-9]+]]:vreg_160 = SI_SPILL_AV160_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.1, align 4, addrspace 5) ; 
CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_AV512_RESTORE:%[0-9]+]]:av_512 = SI_SPILL_AV512_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V256_RESTORE:%[0-9]+]]:vreg_256 = SI_SPILL_V256_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.5, align 4, addrspace 5) - ; CHECK-NEXT: INLINEASM &"; use $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 9 /* reguse */, [[SI_SPILL_AV512_RESTORE]], 9 /* reguse */, [[SI_SPILL_V256_RESTORE]], 9 /* reguse */, [[SI_SPILL_AV160_RESTORE]], 9 /* reguse */, undef $vgpr0_vgpr1_vgpr2_vgpr3, 9 /* reguse */, $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 + ; CHECK-NEXT: [[SI_SPILL_AV512_RESTORE1:%[0-9]+]]:av_512 = SI_SPILL_AV512_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_AV256_RESTORE1:%[0-9]+]]:av_256 = SI_SPILL_AV256_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.5, align 4, addrspace 5) + ; CHECK-NEXT: INLINEASM &"; use $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 9 /* reguse */, [[SI_SPILL_AV512_RESTORE1]], 9 /* reguse */, [[SI_SPILL_AV256_RESTORE1]], 9 /* reguse */, [[SI_SPILL_AV160_RESTORE]], 9 /* reguse */, undef $vgpr0_vgpr1_vgpr2_vgpr3, 9 /* reguse */, $agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16 ; CHECK-NEXT: SI_RETURN INLINEASM &"; def $0 $1 $2 $3 $4", 1 /* sideeffect attdialect */, 10, def %22:vreg_512, 10, def %25:vreg_256, 10, def %28:vreg_160, 10, def $vgpr0_vgpr1_vgpr2_vgpr3, 10, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 %30:av_128 = COPY undef $vgpr0_vgpr1_vgpr2_vgpr3 diff --git 
a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir index 8e6da4bf92ee0..3f6956b83ae92 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.mir @@ -18,9 +18,9 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX908-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX908-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -29,8 +29,8 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GFX908-SPILLED-NEXT: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0 = SI_SPILL_AV32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr1 = SI_SPILL_AV32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr32 @@ -62,9 +62,9 @@ body: | ; GFX90A-SPILLED-NEXT: 
successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -73,8 +73,8 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; GFX90A-SPILLED-NEXT: $agpr1 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0 = SI_SPILL_AV32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr1 = SI_SPILL_AV32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0, implicit killed renamable $agpr1 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr32 @@ -124,7 +124,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1 - ; GFX908-SPILLED-NEXT: SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, 
addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -133,7 +133,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr64 @@ -164,7 +164,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV64_SAVE killed $agpr0_agpr1, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -173,7 +173,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_A64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr64 @@ 
-222,14 +222,14 @@ body: | ; GFX908-SPILLED-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, 
$vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX908-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: ; GFX908-SPILLED-NEXT: successors: %bb.2(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0 = SI_SPILL_AV32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX908-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 ; GFX908-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -288,14 +288,14 @@ body: | ; GFX90A-SPILLED-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, 
$vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251_vgpr252_vgpr253_vgpr254_vgpr255 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV32_SAVE killed $agpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, 
addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: ; GFX90A-SPILLED-NEXT: successors: %bb.2(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0 = SI_SPILL_A32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0 = SI_SPILL_AV32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit undef $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 @@ -385,7 +385,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2 - ; GFX908-SPILLED-NEXT: SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s96) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s96) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -394,7 +394,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, 
addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_AV96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr96 @@ -427,7 +427,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s96) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV96_SAVE killed $agpr0_agpr1_agpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s96) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -436,7 +436,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_A96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2 = SI_SPILL_AV96_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s96) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr96 @@ -486,7 +486,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3 - ; GFX908-SPILLED-NEXT: SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, 
align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -495,7 +495,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_AV128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr128 @@ -530,7 +530,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV128_SAVE killed $agpr0_agpr1_agpr2_agpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -539,7 +539,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_A128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3 = SI_SPILL_AV128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr128 @@ -591,7 +591,7 @@ body: | ; 
GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4 - ; GFX908-SPILLED-NEXT: SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -600,7 +600,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_AV160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr160 @@ -637,7 +637,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV160_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s160) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -646,7 +646,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ 
$}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_A160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4 = SI_SPILL_AV160_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s160) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr160 @@ -700,7 +700,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; GFX908-SPILLED-NEXT: SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store (s192) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store (s192) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -709,7 +709,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr192 @@ -748,7 +748,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store (s192) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV192_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5, %stack.0, $sgpr32, 0, implicit $exec :: (store (s192) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -757,7 +757,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_A192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = SI_SPILL_AV192_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s192) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr192 @@ -813,7 +813,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908-SPILLED-NEXT: SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -822,7 +822,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; 
GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_AV256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr256 @@ -865,7 +865,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV256_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, %stack.0, $sgpr32, 0, implicit $exec :: (store (s256) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -874,7 +874,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_A256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = SI_SPILL_AV256_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s256) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr256 @@ -934,7 +934,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; 
GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 - ; GFX908-SPILLED-NEXT: SI_SPILL_A288_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s288) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV288_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s288) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -943,7 +943,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_A288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_AV288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr288 @@ -988,7 +988,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A288_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s288) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV288_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s288) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, 
implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -997,7 +997,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_A288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 = SI_SPILL_AV288_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s288) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr288 @@ -1059,7 +1059,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 - ; GFX908-SPILLED-NEXT: SI_SPILL_A320_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, %stack.0, $sgpr32, 0, implicit $exec :: (store (s320) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV320_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, %stack.0, $sgpr32, 0, implicit $exec :: (store (s320) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -1068,7 +1068,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_A320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_AV320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) 
from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr320 @@ -1115,7 +1115,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A320_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, %stack.0, $sgpr32, 0, implicit $exec :: (store (s320) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV320_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9, %stack.0, $sgpr32, 0, implicit $exec :: (store (s320) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -1124,7 +1124,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_A320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 = SI_SPILL_AV320_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s320) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr320 @@ -1188,7 +1188,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 - ; GFX908-SPILLED-NEXT: SI_SPILL_A352_SAVE killed 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, %stack.0, $sgpr32, 0, implicit $exec :: (store (s352) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV352_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, %stack.0, $sgpr32, 0, implicit $exec :: (store (s352) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -1197,7 +1197,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_A352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_AV352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr352 @@ -1246,7 +1246,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A352_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, %stack.0, $sgpr32, 0, implicit $exec :: (store (s352) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV352_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10, %stack.0, $sgpr32, 0, implicit $exec :: (store (s352) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ 
-1255,7 +1255,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_A352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 = SI_SPILL_AV352_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s352) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr352 @@ -1321,7 +1321,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 - ; GFX908-SPILLED-NEXT: SI_SPILL_A384_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, %stack.0, $sgpr32, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV384_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, %stack.0, $sgpr32, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -1330,7 +1330,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_A384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_AV384_RESTORE %stack.0, $sgpr32, 0, 
implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr384 @@ -1381,7 +1381,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A384_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, %stack.0, $sgpr32, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV384_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11, %stack.0, $sgpr32, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -1390,7 +1390,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_A384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 = SI_SPILL_AV384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s384) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr384 @@ -1458,7 +1458,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908-SPILLED-NEXT: SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: SI_SPILL_AV512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -1467,7 +1467,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr512 @@ -1526,7 +1526,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit 
$exec :: (store (s512) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV512_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, %stack.0, $sgpr32, 0, implicit $exec :: (store (s512) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -1535,7 +1535,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_A512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = SI_SPILL_AV512_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s512) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr512 @@ -1611,7 +1611,7 @@ body: | ; GFX908-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: S_NOP 0, implicit-def renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; GFX908-SPILLED-NEXT: SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; 
GFX908-SPILLED-NEXT: SI_SPILL_AV1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.1: @@ -1620,7 +1620,7 @@ body: | ; GFX908-SPILLED-NEXT: S_NOP 1 ; GFX908-SPILLED-NEXT: {{ $}} ; GFX908-SPILLED-NEXT: bb.2: - ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; GFX908-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_AV1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) ; GFX908-SPILLED-NEXT: S_NOP 0, implicit killed renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; ; GFX908-EXPANDED-LABEL: name: spill_restore_agpr1024 @@ -1711,7 +1711,7 @@ body: | ; GFX90A-SPILLED-NEXT: successors: %bb.1(0x80000000) ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit-def renamable 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; GFX90A-SPILLED-NEXT: SI_SPILL_A1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: SI_SPILL_AV1024_SAVE killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s1024) into %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.1: @@ -1720,7 +1720,7 @@ body: | ; GFX90A-SPILLED-NEXT: S_NOP 1 ; GFX90A-SPILLED-NEXT: {{ $}} ; GFX90A-SPILLED-NEXT: bb.2: - ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_A1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) + ; GFX90A-SPILLED-NEXT: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = SI_SPILL_AV1024_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s1024) from %stack.0, align 4, addrspace 5) ; GFX90A-SPILLED-NEXT: S_NOP 0, implicit killed renamable 
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; ; GFX90A-EXPANDED-LABEL: name: spill_restore_agpr1024 diff --git a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir index 831570800d06c..6966c3d8b6d6a 100644 --- a/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir +++ b/llvm/test/CodeGen/AMDGPU/swdev502267-use-after-free-last-chance-recoloring-alloc-succeeds.mir @@ -34,26 +34,26 @@ body: | ; CHECK-NEXT: liveins: $vgpr0_vgpr1_vgpr2_vgpr3:0x00000000000000FF, $vgpr4_vgpr5_vgpr6_vgpr7:0x00000000000000FF ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: SI_SPILL_AV128_SAVE killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = V_FMA_F32_e64 0, $vgpr6, 0, $vgpr6, 0, killed $vgpr2, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_V128_SAVE $vgpr4_vgpr5_vgpr6_vgpr7, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV128_SAVE $vgpr4_vgpr5_vgpr6_vgpr7, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = V_TRUNC_F32_e32 killed $vgpr0, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into 
%stack.3, addrspace 5) + ; CHECK-NEXT: SI_SPILL_AV32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr5 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr7, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr0 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: renamable $vgpr0 = SI_SPILL_AV32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; CHECK-NEXT: renamable $vgpr9 = COPY killed renamable $vgpr5 - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr2_vgpr3_vgpr4_vgpr5 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr8 = nofpexcept V_FMA_F32_e64 1, killed $vgpr0, 0, killed $vgpr6, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: renamable $vgpr2_vgpr3 = COPY killed renamable $vgpr8_vgpr9 ; CHECK-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; CHECK-NEXT: renamable 
$vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: renamable $vgpr0 = nofpexcept V_DIV_FIXUP_F32_e64 0, killed $vgpr0, 0, killed $vgpr4, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) - ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SI_SPILL_AV128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5) + ; CHECK-NEXT: renamable $vgpr6_vgpr7_vgpr8_vgpr9 = SI_SPILL_AV128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5) ; CHECK-NEXT: dead renamable $vgpr1 = V_FMA_F32_e64 0, killed $vgpr5, 0, $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec ; CHECK-NEXT: dead renamable $vgpr4_vgpr5_vgpr6_vgpr7 = SCRATCH_LOAD_DWORDX4_SADDR %stack.0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load (s128), addrspace 5) ; CHECK-NEXT: renamable $vgpr4_vgpr5 = IMPLICIT_DEF From c435cd173059863b44262ace75e0f381bbc6cb86 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Fri, 18 Jul 2025 08:26:11 +0200 Subject: [PATCH 294/813] [SimplifyCFG] Cache unique 
predecessors in `simplifyDuplicateSwitchArms` Avoid repeatedly querying `getUniquePredecessor` for already-visited switch successors so as not to incur quadratic runtime. Fixes: https://github.com/llvm/llvm-project/issues/147239. --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 30 ++++++++------ .../Transforms/SimplifyCFG/switch-dup-bbs.ll | 41 +++++++++++++++++++ .../SimplifyCFG/switch-range-to-icmp.ll | 2 +- 3 files changed, 59 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index a75f29000ca18..75c96503d556d 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7493,7 +7493,7 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, SmallPtrSet Phis; SmallPtrSet Seen; DenseMap> PhiPredIVs; - DenseMap> BBToSuccessorIndexes; + DenseMap> BBToSuccessorIndexes; SmallVector Cases; Cases.reserve(SI->getNumSuccessors()); @@ -7505,12 +7505,6 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, if (BB->size() != 1) continue; - // FIXME: This case needs some extra care because the terminators other than - // SI need to be updated. For now, consider only backedges to the SI. - if (BB->hasNPredecessorsOrMore(4) || - BB->getUniquePredecessor() != SI->getParent()) - continue; - // FIXME: Relax that the terminator is a BranchInst by checking for equality // on other kinds of terminators. We decide to only support unconditional // branches for now for compile time reasons. @@ -7518,14 +7512,24 @@ bool SimplifyCFGOpt::simplifyDuplicateSwitchArms(SwitchInst *SI, if (!BI || BI->isConditional()) continue; - if (Seen.insert(BB).second) { - // Keep track of which PHIs we need as keys in PhiPredIVs below. - for (BasicBlock *Succ : BI->successors()) - Phis.insert_range(llvm::make_pointer_range(Succ->phis())); - // Add the successor only if not previously visited. 
- Cases.emplace_back(SwitchSuccWrapper{BB, &PhiPredIVs}); + if (!Seen.insert(BB).second) { + auto It = BBToSuccessorIndexes.find(BB); + if (It != BBToSuccessorIndexes.end()) + It->second.emplace_back(I); + continue; } + // FIXME: This case needs some extra care because the terminators other than + // SI need to be updated. For now, consider only backedges to the SI. + if (BB->getUniquePredecessor() != SI->getParent()) + continue; + + // Keep track of which PHIs we need as keys in PhiPredIVs below. + for (BasicBlock *Succ : BI->successors()) + Phis.insert_range(llvm::make_pointer_range(Succ->phis())); + + // Add the successor only if not previously visited. + Cases.emplace_back(SwitchSuccWrapper{BB, &PhiPredIVs}); BBToSuccessorIndexes[BB].emplace_back(I); } diff --git a/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll b/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll index 32581bbf8f141..d2d917de11897 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-dup-bbs.ll @@ -199,3 +199,44 @@ exit: %ret = phi i64 [ 0, %default ], [ 0, %bb1 ], [ 1, %entry ], [ 1, %bb2 ] ret i64 %ret } + +define i32 @switch_dup_unbounded_predecessors(i32 %val) { +; SIMPLIFY-CFG-LABEL: define i32 @switch_dup_unbounded_predecessors( +; SIMPLIFY-CFG-SAME: i32 [[VAL:%.*]]) { +; SIMPLIFY-CFG-NEXT: [[ENTRY:.*]]: +; SIMPLIFY-CFG-NEXT: switch i32 [[VAL]], label %[[EXIT:.*]] [ +; SIMPLIFY-CFG-NEXT: i32 99, label %[[BB1:.*]] +; SIMPLIFY-CFG-NEXT: i32 115, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: i32 102, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: i32 70, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: i32 101, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: i32 69, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: i32 103, label %[[BB1]] +; SIMPLIFY-CFG-NEXT: ] +; SIMPLIFY-CFG: [[BB1]]: +; SIMPLIFY-CFG-NEXT: br label %[[EXIT]] +; SIMPLIFY-CFG: [[EXIT]]: +; SIMPLIFY-CFG-NEXT: [[PHI:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ 1, %[[BB1]] ] +; SIMPLIFY-CFG-NEXT: ret i32 [[PHI]] +; +entry: + 
switch i32 %val, label %exit [ + i32 99, label %bb1 + i32 115, label %bb1 + i32 102, label %bb2 + i32 70, label %bb2 + i32 101, label %bb2 + i32 69, label %bb2 + i32 103, label %bb2 + ] + +bb1: + br label %exit + +bb2: + br label %exit + +exit: + %phi = phi i32 [ 0, %entry ], [ 1, %bb1 ], [ 1, %bb2 ] + ret i32 %phi +} diff --git a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll index 4136f33983a2b..8f2ae2d054f1e 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-range-to-icmp.ll @@ -149,7 +149,7 @@ unreach2: define void @pr53208_single_reachable_dest(i8 %sw, ptr %p0) { ; CHECK-LABEL: @pr53208_single_reachable_dest( -; CHECK-NEXT: group2: +; CHECK-NEXT: exit: ; CHECK-NEXT: call void @bar(ptr [[P0:%.*]]) ; CHECK-NEXT: ret void ; From 176ae32de060d8c4767904bf16fbde3faa59b60a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Jul 2025 15:34:47 +0900 Subject: [PATCH 295/813] AMDGPU: Fix introducing use of killed vgpr in gfx908 agpr copy (#149291) When searching for an existing VGPR source for an AGPR to AGPR copy on gfx908, this wasn't verifying the vgpr wasn't killed by other prior uses. 
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +- llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir | 83 +++++++++++++++++++++++ 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index b1116974642c9..c8935f0cb6034 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -687,7 +687,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, if (!SafeToPropagate) break; - DefOp.setIsKill(false); + for (auto I = Def; I != MI; ++I) + I->clearRegisterKills(DefOp.getReg(), &RI); } MachineInstrBuilder Builder = diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir index 2bd1b8bf3f3f6..d22a4b978980f 100644 --- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -45,6 +45,9 @@ define amdgpu_kernel void @copy_agpr_to_agpr_tuple() #0 { ret void } define amdgpu_kernel void @copy_agpr_to_agpr_tuple_kill() #0 { ret void } + define amdgpu_kernel void @look_for_vgpr_killed() #0 { ret void } + define amdgpu_kernel void @look_for_vgpr_killed_tuple() #0 { ret void } + attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } ... @@ -1517,3 +1520,83 @@ body: | renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ... 
+ +# Make sure the expansion of the a-to-a copy doesn't introduce a use +# after kill of the source vgpr +--- +name: look_for_vgpr_killed +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0 + + ; GFX908-LABEL: name: look_for_vgpr_killed + ; GFX908: liveins: $agpr0 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-NEXT: S_NOP 0, implicit $vgpr0 + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; + ; GFX90A-LABEL: name: look_for_vgpr_killed + ; GFX90A: liveins: $agpr0 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX90A-NEXT: S_NOP 0, implicit killed $vgpr0 + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec + ; + ; GFX942-LABEL: name: look_for_vgpr_killed + ; GFX942: liveins: $agpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX942-NEXT: S_NOP 0, implicit killed $vgpr0 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + $agpr0 = COPY $vgpr0 + S_NOP 0, implicit killed $vgpr0 + $agpr1 = COPY $agpr0 + +... 
+ +--- +name: look_for_vgpr_killed_tuple +tracksRegLiveness: true +body: | + bb.0: + liveins: $agpr0 + + ; GFX908-LABEL: name: look_for_vgpr_killed_tuple + ; GFX908: liveins: $agpr0 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 + ; GFX908-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GFX908-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908-NEXT: S_NOP 0, implicit $vgpr0_vgpr1 + ; GFX908-NEXT: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; + ; GFX90A-LABEL: name: look_for_vgpr_killed_tuple + ; GFX90A: liveins: $agpr0 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 + ; GFX90A-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GFX90A-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX90A-NEXT: S_NOP 0, implicit killed $vgpr0_vgpr1 + ; GFX90A-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec + ; + ; GFX942-LABEL: name: look_for_vgpr_killed_tuple + ; GFX942: liveins: $agpr0 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 1, implicit $exec + ; GFX942-NEXT: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX942-NEXT: S_NOP 0, implicit killed $vgpr0_vgpr1 + ; GFX942-NEXT: $agpr1 = V_ACCVGPR_MOV_B32 $agpr0, implicit $exec + $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 + $vgpr1 = V_MOV_B32_e32 1, implicit $exec + $agpr0 = COPY $vgpr0 + S_NOP 0, implicit killed $vgpr0_vgpr1 + $agpr1 = COPY $agpr0 + +... From 90f733ce6eaea6930c31d7aa320e18a5ef00ac75 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 18 Jul 2025 07:39:28 +0100 Subject: [PATCH 296/813] [LoopUnroll] Add tests for unrolling loops with reductions. Add tests for unrolling loops with reductions. In some cases, multiple parallel reduction phis could be retained to improve performance. 
--- .../LoopUnroll/AArch64/apple-unrolling.ll | 315 +++++++++++++ .../LoopUnroll/partial-unroll-reductions.ll | 446 ++++++++++++++++++ .../LoopUnroll/runtime-unroll-reductions.ll | 238 ++++++++++ 3 files changed, 999 insertions(+) create mode 100644 llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll create mode 100644 llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll index 1a091e847ca34..0b78beea54aa9 100644 --- a/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll +++ b/llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll @@ -578,8 +578,323 @@ loop.latch: exit: ret void } + +define i32 @test_add_reduction_unroll_partial(ptr %a, i64 noundef %n) { +; APPLE-LABEL: define i32 @test_add_reduction_unroll_partial( +; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: br label %[[LOOP:.*]] +; APPLE: [[LOOP]]: +; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE: [[EXIT]]: +; APPLE-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ] +; APPLE-NEXT: ret i32 [[BIN_RDX2]] +; +; OTHER-LABEL: define i32 @test_add_reduction_unroll_partial( +; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; OTHER-NEXT: [[ENTRY:.*]]: +; OTHER-NEXT: br label %[[LOOP:.*]] +; OTHER: [[LOOP]]: +; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], 
[ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; OTHER-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; OTHER-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; OTHER-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; OTHER-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; OTHER-NEXT: [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP1]] +; OTHER-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; OTHER-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] +; OTHER-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A_2]], align 2 +; OTHER-NEXT: [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP2]] +; OTHER-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; OTHER-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] +; OTHER-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_3]], align 2 +; OTHER-NEXT: [[RDX_NEXT_3]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP3]] +; OTHER-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; OTHER-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 1024 +; OTHER-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]] +; OTHER: [[EXIT]]: +; OTHER-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: ret i32 [[BIN_RDX2]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %rdx.next = add nuw nsw i32 %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1024 + br i1 %ec, label %exit, label %loop + +exit: + %res = phi i32 [ %rdx.next, %loop ] + ret i32 %res +} + +declare i1 @cond() + 
+define i32 @test_add_reduction_multi_block(ptr %a, i64 noundef %n) { +; APPLE-LABEL: define i32 @test_add_reduction_multi_block( +; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: br label %[[LOOP:.*]] +; APPLE: [[LOOP]]: +; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP_LATCH]] ] +; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; APPLE-NEXT: [[C:%.*]] = call i1 @cond() +; APPLE-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; APPLE: [[THEN]]: +; APPLE-NEXT: store i32 0, ptr [[GEP_A]], align 4 +; APPLE-NEXT: br label %[[LOOP_LATCH]] +; APPLE: [[LOOP_LATCH]]: +; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE: [[EXIT]]: +; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP_LATCH]] ] +; APPLE-NEXT: ret i32 [[RES]] +; +; OTHER-LABEL: define i32 @test_add_reduction_multi_block( +; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; OTHER-NEXT: [[ENTRY:.*]]: +; OTHER-NEXT: br label %[[LOOP:.*]] +; OTHER: [[LOOP]]: +; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP_LATCH]] ] +; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; OTHER-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; OTHER-NEXT: [[C:%.*]] = call i1 @cond() +; OTHER-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; OTHER: [[THEN]]: +; OTHER-NEXT: store i32 0, ptr [[GEP_A]], align 4 +; OTHER-NEXT: br label 
%[[LOOP_LATCH]] +; OTHER: [[LOOP_LATCH]]: +; OTHER-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; OTHER-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; OTHER-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; OTHER-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; OTHER: [[EXIT]]: +; OTHER-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP_LATCH]] ] +; OTHER-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop.latch ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %c = call i1 @cond() + br i1 %c, label %then, label %loop.latch + +then: + store i32 0, ptr %gep.a + br label %loop.latch + +loop.latch: + %rdx.next = add nuw nsw i32 %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1024 + br i1 %ec, label %exit, label %loop + +exit: + %res = phi i32 [ %rdx.next, %loop.latch ] + ret i32 %res +} + +define i32 @test_add_and_mul_reduction_unroll_partial(ptr %a, i64 noundef %n) { +; APPLE-LABEL: define i32 @test_add_and_mul_reduction_unroll_partial( +; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: br label %[[LOOP:.*]] +; APPLE: [[LOOP]]: +; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; APPLE-NEXT: [[RDX_2_NEXT]] = mul i32 [[RDX_2]], [[TMP0]] +; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; APPLE-NEXT: br i1 [[EC]], 
label %[[EXIT:.*]], label %[[LOOP]] +; APPLE: [[EXIT]]: +; APPLE-NEXT: [[BIN_RDX3:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ] +; APPLE-NEXT: [[RES_2:%.*]] = phi i32 [ [[RDX_2_NEXT]], %[[LOOP]] ] +; APPLE-NEXT: [[SUM:%.*]] = add i32 [[BIN_RDX3]], [[RES_2]] +; APPLE-NEXT: ret i32 [[SUM]] +; +; OTHER-LABEL: define i32 @test_add_and_mul_reduction_unroll_partial( +; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; OTHER-NEXT: [[ENTRY:.*]]: +; OTHER-NEXT: br label %[[LOOP:.*]] +; OTHER: [[LOOP]]: +; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT_1:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; OTHER-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2 +; OTHER-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP0]] +; OTHER-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[TMP0]] +; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; OTHER-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; OTHER-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; OTHER-NEXT: [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP1]] +; OTHER-NEXT: [[RDX_2_NEXT_1]] = mul i32 [[RDX_2_NEXT]], [[TMP1]] +; OTHER-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; OTHER-NEXT: [[EC_1:%.*]] = icmp eq i64 [[IV_NEXT_1]], 1024 +; OTHER-NEXT: br i1 [[EC_1]], label %[[EXIT:.*]], label %[[LOOP]] +; OTHER: [[EXIT]]: +; OTHER-NEXT: [[BIN_RDX:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; OTHER-NEXT: [[RES_2:%.*]] = phi i32 [ [[RDX_2_NEXT_1]], %[[LOOP]] ] +; OTHER-NEXT: [[SUM:%.*]] = add i32 [[BIN_RDX]], [[RES_2]] +; OTHER-NEXT: ret i32 [[SUM]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] 
+ %rdx.2 = phi i32 [ 0, %entry ], [ %rdx.2.next, %loop ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %rdx.next = add nuw nsw i32 %rdx, %1 + %rdx.2.next = mul i32 %rdx.2, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1024 + br i1 %ec, label %exit, label %loop + +exit: + %res.1 = phi i32 [ %rdx.next, %loop ] + %res.2 = phi i32 [ %rdx.2.next, %loop ] + %sum = add i32 %res.1, %res.2 + ret i32 %sum +} + + +define i32 @test_add_reduction_runtime(ptr %a, i64 noundef %n) { +; APPLE-LABEL: define i32 @test_add_reduction_runtime( +; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; APPLE-NEXT: [[ENTRY:.*]]: +; APPLE-NEXT: br label %[[LOOP:.*]] +; APPLE: [[LOOP]]: +; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP]] ] +; APPLE-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]] +; APPLE-NEXT: [[TMP6:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2 +; APPLE-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP6]] +; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1 +; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]] +; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]] +; APPLE: [[EXIT]]: +; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP]] ] +; APPLE-NEXT: ret i32 [[RES]] +; +; OTHER-LABEL: define i32 @test_add_reduction_runtime( +; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] { +; OTHER-NEXT: [[ENTRY:.*]]: +; OTHER-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; OTHER-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 3 +; OTHER-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3 +; OTHER-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; OTHER: [[ENTRY_NEW]]: +; OTHER-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; 
OTHER-NEXT: br label %[[LOOP:.*]] +; OTHER: [[LOOP]]: +; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[LOOP]] ] +; OTHER-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; OTHER-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2 +; OTHER-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]] +; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; OTHER-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; OTHER-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; OTHER-NEXT: [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]] +; OTHER-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; OTHER-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] +; OTHER-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_2]], align 2 +; OTHER-NEXT: [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP4]] +; OTHER-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; OTHER-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] +; OTHER-NEXT: [[TMP5:%.*]] = load i32, ptr [[GEP_A_3]], align 2 +; OTHER-NEXT: [[RDX_NEXT_3]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP5]] +; OTHER-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; OTHER-NEXT: [[NITER_NEXT_3]] = add i64 [[NITER]], 4 +; OTHER-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]] +; OTHER-NEXT: br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]] +; OTHER: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; OTHER-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; OTHER-NEXT: br label 
%[[EXIT_UNR_LCSSA]] +; OTHER: [[EXIT_UNR_LCSSA]]: +; OTHER-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; OTHER-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; OTHER-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; OTHER-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; OTHER-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; OTHER: [[LOOP_EPIL_PREHEADER]]: +; OTHER-NEXT: br label %[[LOOP_EPIL:.*]] +; OTHER: [[LOOP_EPIL]]: +; OTHER-NEXT: [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ] +; OTHER-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ [[RDX_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ] +; OTHER-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ] +; OTHER-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]] +; OTHER-NEXT: [[TMP6:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2 +; OTHER-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP6]] +; OTHER-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1 +; OTHER-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]] +; OTHER-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 +; OTHER-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] +; OTHER-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]] +; OTHER: [[EXIT_EPILOG_LCSSA]]: +; OTHER-NEXT: [[RES_PH1:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; OTHER-NEXT: br label %[[EXIT]] +; OTHER: [[EXIT]]: +; OTHER-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RES_PH1]], %[[EXIT_EPILOG_LCSSA]] ] +; OTHER-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = 
phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %rdx.next = add nuw nsw i32 %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + %res = phi i32 [ %rdx.next, %loop ] + ret i32 %res +} ;. ; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} ; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"} ; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} ;. +; OTHER: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; OTHER: [[META1]] = !{!"llvm.loop.unroll.disable"} +;. diff --git a/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll new file mode 100644 index 0000000000000..953dc278b6644 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll @@ -0,0 +1,446 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-unroll -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s + +define i32 @test_add(ptr %src, i64 %n, i32 %start) { +; CHECK-LABEL: define i32 @test_add( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 
[[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32 , ptr %gep.src, align 1 + %rdx.next = add i32 %rdx, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret i32 %rdx.next +} + +define i32 @test_add_tc_not_multiple_of_4(ptr %src, i64 %n, i32 %start) { +; CHECK-LABEL: define i32 @test_add_tc_not_multiple_of_4( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP_1:.*]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; 
CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 1001 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP_1]], label %[[EXIT:.*]] +; CHECK: [[LOOP_1]]: +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_12:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_12:%.*]] = load i32, ptr [[GEP_SRC_12]], align 1 +; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_12]] +; CHECK-NEXT: br label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32 , ptr %gep.src, align 1 + %rdx.next = add i32 %rdx, %l + %ec = icmp ne i64 %iv.next, 1001 + br i1 %ec, label %loop, label %exit + +exit: + ret i32 %rdx.next +} + +define i32 @test_add_rdx_used_in_loop(ptr %src, i64 %n, i32 %start) { +; CHECK-LABEL: define i32 @test_add_rdx_used_in_loop( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, 
%[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_24:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: store i32 [[RDX_NEXT]], ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: store i32 [[RDX_NEXT_1]], ptr [[GEP_SRC_1]], align 4 +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: store i32 [[RDX_NEXT_2]], ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_NEXT_24]] = add i32 [[RDX_NEXT_2]], [[L_24]] +; CHECK-NEXT: store i32 [[RDX_NEXT_24]], ptr [[GEP_SRC_24]], align 4 +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_24]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + 
%l = load i32 , ptr %gep.src, align 1 + %rdx.next = add i32 %rdx, %l + store i32 %rdx.next, ptr %gep.src + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret i32 %rdx.next +} + +define i32 @test_add_phi_used_outside_loop(ptr %src, i64 %n, i32 %start) { +; CHECK-LABEL: define i32 @test_add_phi_used_outside_loop( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_2]], %[[LOOP]] ] +; 
CHECK-NEXT: ret i32 [[RDX_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32 , ptr %gep.src, align 1 + %rdx.next = add i32 %rdx, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret i32 %rdx +} + +define i32 @test_add_and_mul_reduction(ptr %src, i64 %n, i32 %start) { +; CHECK-LABEL: define i32 @test_add_and_mul_reduction( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_2_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_1_NEXT:%.*]] = add i32 [[RDX_1]], [[L]] +; CHECK-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_1_2:%.*]] = add i32 [[RDX_1_NEXT]], [[L_1]] +; CHECK-NEXT: [[RDX_2_2:%.*]] = mul i32 [[RDX_2_NEXT]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_1_NEXT_2:%.*]] = add i32 [[RDX_1_2]], [[L_2]] +; CHECK-NEXT: [[RDX_2_NEXT_2:%.*]] = mul i32 
[[RDX_2_2]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_1_NEXT_3]] = add i32 [[RDX_1_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[RDX_2_NEXT_3]] = mul i32 [[RDX_2_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_1_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[BIN_RDX5:%.*]] = phi i32 [ [[RDX_2_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: [[RES:%.*]] = add i32 [[RDX_1_NEXT_LCSSA]], [[BIN_RDX5]] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx.1 = phi i32 [ %start, %entry ], [ %rdx.1.next, %loop ] + %rdx.2 = phi i32 [ %start, %entry ], [ %rdx.2.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32 , ptr %gep.src, align 1 + %rdx.1.next = add i32 %rdx.1, %l + %rdx.2.next = mul i32 %rdx.2, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + %res = add i32 %rdx.1.next, %rdx.2.next + ret i32 %res +} + +define float @test_fadd_no_fmfs(ptr %src, i64 %n, float %start) { +; CHECK-LABEL: define float @test_fadd_no_fmfs( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], float [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi float [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: 
[[RDX_NEXT:%.*]] = fadd float [[RDX]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = fadd float [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = fadd float [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load float, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_NEXT_3]] = fadd float [[RDX_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi float [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi float [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 1 + %rdx.next = fadd float %rdx, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret float %rdx.next +} + +define float @test_fadd_with_ressaoc(ptr %src, i64 %n, float %start) { +; CHECK-LABEL: define float @test_fadd_with_ressaoc( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], float [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi float [ 
[[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = fadd float [[RDX]], [[L]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = fadd float [[RDX_NEXT]], [[L_1]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load float, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = fadd float [[RDX_NEXT_1]], [[L_2]] +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr float, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load float, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_NEXT_3]] = fadd float [[RDX_NEXT_2]], [[L_24]] +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi float [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: ret float [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi float [ %start, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 1 + %rdx.next = fadd float %rdx, %l + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret float %rdx.next +} +define i32 @test_smin(ptr %src, i64 %n) { +; CHECK-LABEL: define i32 @test_smin( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; 
CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MIN:%.*]] = phi i32 [ 1000, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = call i32 @llvm.smin.i32(i32 [[MIN]], i32 [[L]]) +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = call i32 @llvm.smin.i32(i32 [[RDX_NEXT]], i32 [[L_1]]) +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = call i32 @llvm.smin.i32(i32 [[RDX_NEXT_1]], i32 [[L_2]]) +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[RDX_NEXT_3]] = call i32 @llvm.smin.i32(i32 [[RDX_NEXT_2]], i32 [[L_24]]) +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %min = phi i32 [ 1000, %entry ], [ %rdx.next, %loop ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i32, ptr %src, i64 %iv + %l = load i32 , ptr %gep.src, align 1 + %rdx.next = call 
i32 @llvm.smin(i32 %min, i32 %l) + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret i32 %rdx.next +} + +define i64 @test_any_of_reduction(ptr %src, i64 %n) { +; CHECK-LABEL: define i64 @test_any_of_reduction( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ANY_OF_RDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = select i1 [[C]], i64 [[ANY_OF_RDX]], i64 0 +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1 +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8 [[L_1]], 0 +; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = select i1 [[C_1]], i64 [[RDX_NEXT]], i64 0 +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[L_2:%.*]] = load i8, ptr [[GEP_SRC_2]], align 1 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq i8 [[L_2]], 0 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = select i1 [[C_2]], i64 [[RDX_NEXT_1]], i64 0 +; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[L_24:%.*]] = load i8, ptr [[GEP_SRC_24]], align 1 +; CHECK-NEXT: [[C_24:%.*]] = icmp eq i8 [[L_24]], 0 +; CHECK-NEXT: [[RDX_NEXT_3]] = select i1 [[C_24]], i64 [[RDX_NEXT_2]], i64 0 +; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000 +; CHECK-NEXT: br i1 [[EC_3]], 
label %[[LOOP]], label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i64 [ [[RDX_NEXT_3]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[RDX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] + %any.of.rdx = phi i64 [ %rdx.next, %loop ], [ 0, %entry ] + %iv.next = add i64 %iv, 1 + %gep.src = getelementptr i8, ptr %src, i64 %iv + %l = load i8, ptr %gep.src, align 1 + %c = icmp eq i8 %l, 0 + %rdx.next = select i1 %c, i64 %any.of.rdx, i64 0 + %ec = icmp ne i64 %iv.next, 1000 + br i1 %ec, label %loop, label %exit + +exit: + ret i64 %rdx.next +} diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll new file mode 100644 index 0000000000000..89f06ad373aa9 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-reductions.ll @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-unroll -S %s | FileCheck %s + +define i32 @test_add_reduction(ptr %a, i64 %n) { +; CHECK-LABEL: define i32 @test_add_reduction( +; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1 +; CHECK-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; 
CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; CHECK-NEXT: [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]] +; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: br label %[[EXIT_UNR_LCSSA]] +; CHECK: [[EXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_UNR]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2 +; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = add nuw nsw i32 [[RDX_UNR]], [[TMP4]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], 
%[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %rdx.next = add nuw nsw i32 %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop, !llvm.loop !0 + +exit: + %res = phi i32 [ %rdx.next, %loop ] + ret i32 %res +} + +define i32 @test_add_reduction_constant_op(ptr %a, i64 %n) { +; CHECK-LABEL: define i32 @test_add_reduction_constant_op( +; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1 +; CHECK-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_NEXT_1]] = add nuw nsw i32 [[RDX]], 2 +; CHECK-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_1]], %[[LOOP]] ] +; 
CHECK-NEXT: br label %[[EXIT_UNR_LCSSA]] +; CHECK: [[EXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK-NEXT: [[RDX_NEXT_EPIL:%.*]] = add nuw nsw i32 [[RDX_UNR]], 1 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] + %rdx.next = add nuw nsw i32 %rdx, 1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop, !llvm.loop !0 + +exit: + %res = phi i32 [ %rdx.next, %loop ] + ret i32 %res +} + +define i32 @test_add_reduction_8x_unroll(ptr %a, i64 %n) { +; CHECK-LABEL: define i32 @test_add_reduction_8x_unroll( +; CHECK-SAME: ptr [[A:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 7 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 7 +; CHECK-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]] +; CHECK: [[ENTRY_NEW]]: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_7:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_7:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i64 [ 0, 
%[[ENTRY_NEW]] ], [ [[NITER_NEXT_7:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A]], align 2 +; CHECK-NEXT: [[RDX_NEXT:%.*]] = add nuw nsw i32 [[RDX]], [[TMP2]] +; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]] +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2 +; CHECK-NEXT: [[RDX_2:%.*]] = add nuw nsw i32 [[RDX_NEXT]], [[TMP3]] +; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2 +; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_2]], align 2 +; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add nuw nsw i32 [[RDX_2]], [[TMP4]] +; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3 +; CHECK-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[GEP_A_3]], align 2 +; CHECK-NEXT: [[RDX_4:%.*]] = add nuw nsw i32 [[RDX_NEXT_2]], [[TMP5]] +; CHECK-NEXT: [[IV_NEXT_3:%.*]] = add nuw nsw i64 [[IV]], 4 +; CHECK-NEXT: [[GEP_A_4:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_3]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[GEP_A_4]], align 2 +; CHECK-NEXT: [[RDX_NEXT_4:%.*]] = add nuw nsw i32 [[RDX_4]], [[TMP6]] +; CHECK-NEXT: [[IV_NEXT_4:%.*]] = add nuw nsw i64 [[IV]], 5 +; CHECK-NEXT: [[GEP_A_5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_4]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[GEP_A_5]], align 2 +; CHECK-NEXT: [[RDX_6:%.*]] = add nuw nsw i32 [[RDX_NEXT_4]], [[TMP7]] +; CHECK-NEXT: [[IV_NEXT_5:%.*]] = add nuw nsw i64 [[IV]], 6 +; CHECK-NEXT: [[GEP_A_6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_5]] +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[GEP_A_6]], align 2 +; CHECK-NEXT: [[RDX_NEXT_6:%.*]] = add nuw nsw i32 
[[RDX_6]], [[TMP8]] +; CHECK-NEXT: [[IV_NEXT_6:%.*]] = add nuw nsw i64 [[IV]], 7 +; CHECK-NEXT: [[GEP_A_7:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_6]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[GEP_A_7]], align 2 +; CHECK-NEXT: [[RDX_NEXT_7]] = add nuw nsw i32 [[RDX_NEXT_6]], [[TMP9]] +; CHECK-NEXT: [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8 +; CHECK-NEXT: [[NITER_NEXT_7]] = add i64 [[NITER]], 8 +; CHECK-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NEXT_7]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_7]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT_UNR_LCSSA_LOOPEXIT]]: +; CHECK-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_7]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_7]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_7]], %[[LOOP]] ] +; CHECK-NEXT: br label %[[EXIT_UNR_LCSSA]] +; CHECK: [[EXIT_UNR_LCSSA]]: +; CHECK-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[RES_PH_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_EPIL_PREHEADER]]: +; CHECK-NEXT: br label %[[LOOP_EPIL:.*]] +; CHECK: [[LOOP_EPIL]]: +; CHECK-NEXT: [[IV_EPIL:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ [[RDX_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 
[[IV_EPIL]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2 +; CHECK-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP10]] +; CHECK-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1 +; CHECK-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]] +; CHECK-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1 +; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]] +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[EXIT_EPILOG_LCSSA]]: +; CHECK-NEXT: [[RES_PH1:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RES_PH1]], %[[EXIT_EPILOG_LCSSA]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ] + %gep.a = getelementptr inbounds nuw i32, ptr %a, i64 %iv + %1 = load i32, ptr %gep.a, align 2 + %rdx.next = add nuw nsw i32 %rdx, %1 + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop, !llvm.loop !2 + +exit: + %res = phi i32 [ %rdx.next, %loop ] + ret i32 %res +} + + + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.unroll.count", i32 2} + +!2 = distinct !{!2, !3} +!3 = !{!"llvm.loop.unroll.count", i32 8} + +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"} +; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} +;. From 5bac67d9213da8afa0e35199395774ca3c7daa39 Mon Sep 17 00:00:00 2001 From: Diana Picus Date: Fri, 18 Jul 2025 09:22:37 +0200 Subject: [PATCH 297/813] [AMDGPU] Use SIRegisterInfo to compute used registers. 
NFCI (#149051) Simplify the code in AMDGPUResourceUsageAnalysis to rely more on the TargetRegisterInfo for computing the number of used SGPRs and AGPRs. This is a preliminary refactoring split out from #144855. (While we could technically use TRI to compute the used number of VGPRs at this point too, I'm leaving some of the original code in since for VGPRs we're going to introduce some special cases). --- .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 234 ++---------------- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 8 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 6 +- 3 files changed, 31 insertions(+), 217 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 46027b8890234..8101c68986241 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -167,77 +167,39 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI); + Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass, + /*IncludeCalls=*/false); + if (ST.hasMAIInsts()) + Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass, + /*IncludeCalls=*/false); // If there are no calls, MachineRegisterInfo can tell us the used register // count easily. // A tail call isn't considered a call for MachineFrameInfo's purposes. 
if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { - Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass); - Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass); - if (ST.hasMAIInsts()) - Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass); + Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass, + /*IncludeCalls=*/false); return Info; } int32_t MaxVGPR = -1; - int32_t MaxAGPR = -1; - int32_t MaxSGPR = -1; Info.CalleeSegmentSize = 0; for (const MachineBasicBlock &MBB : MF) { for (const MachineInstr &MI : MBB) { - // TODO: Check regmasks? Do they occur anywhere except calls? - for (const MachineOperand &MO : MI.operands()) { - unsigned Width = 0; - bool IsSGPR = false; - bool IsAGPR = false; + for (unsigned I = 0; I < MI.getNumOperands(); ++I) { + const MachineOperand &MO = MI.getOperand(I); if (!MO.isReg()) continue; Register Reg = MO.getReg(); switch (Reg) { - case AMDGPU::EXEC: - case AMDGPU::EXEC_LO: - case AMDGPU::EXEC_HI: - case AMDGPU::SCC: - case AMDGPU::M0: - case AMDGPU::M0_LO16: - case AMDGPU::M0_HI16: - case AMDGPU::SRC_SHARED_BASE_LO: - case AMDGPU::SRC_SHARED_BASE: - case AMDGPU::SRC_SHARED_LIMIT_LO: - case AMDGPU::SRC_SHARED_LIMIT: - case AMDGPU::SRC_PRIVATE_BASE_LO: - case AMDGPU::SRC_PRIVATE_BASE: - case AMDGPU::SRC_PRIVATE_LIMIT_LO: - case AMDGPU::SRC_PRIVATE_LIMIT: - case AMDGPU::SRC_POPS_EXITING_WAVE_ID: - case AMDGPU::SGPR_NULL: - case AMDGPU::SGPR_NULL64: - case AMDGPU::MODE: - continue; - case AMDGPU::NoRegister: assert(MI.isDebugInstr() && "Instruction uses invalid noreg register"); continue; - case AMDGPU::VCC: - case AMDGPU::VCC_LO: - case AMDGPU::VCC_HI: - case AMDGPU::VCC_LO_LO16: - case AMDGPU::VCC_LO_HI16: - case AMDGPU::VCC_HI_LO16: - case AMDGPU::VCC_HI_HI16: - Info.UsesVCC = true; - continue; - - case AMDGPU::FLAT_SCR: - case AMDGPU::FLAT_SCR_LO: - case AMDGPU::FLAT_SCR_HI: - continue; - case AMDGPU::XNACK_MASK: case AMDGPU::XNACK_MASK_LO: case 
AMDGPU::XNACK_MASK_HI: @@ -267,170 +229,22 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( break; } - if (AMDGPU::SGPR_32RegClass.contains(Reg) || - AMDGPU::SGPR_LO16RegClass.contains(Reg) || - AMDGPU::SGPR_HI16RegClass.contains(Reg)) { - IsSGPR = true; - Width = 1; - } else if (AMDGPU::VGPR_32RegClass.contains(Reg) || - AMDGPU::VGPR_16RegClass.contains(Reg)) { - IsSGPR = false; - Width = 1; - } else if (AMDGPU::AGPR_32RegClass.contains(Reg) || - AMDGPU::AGPR_LO16RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 1; - } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) { - IsSGPR = true; - Width = 2; - } else if (AMDGPU::VReg_64RegClass.contains(Reg)) { - IsSGPR = false; - Width = 2; - } else if (AMDGPU::AReg_64RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 2; - } else if (AMDGPU::VReg_96RegClass.contains(Reg)) { - IsSGPR = false; - Width = 3; - } else if (AMDGPU::SReg_96RegClass.contains(Reg)) { - IsSGPR = true; - Width = 3; - } else if (AMDGPU::AReg_96RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 3; - } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) { - IsSGPR = true; - Width = 4; - } else if (AMDGPU::VReg_128RegClass.contains(Reg)) { - IsSGPR = false; - Width = 4; - } else if (AMDGPU::AReg_128RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 4; - } else if (AMDGPU::VReg_160RegClass.contains(Reg)) { - IsSGPR = false; - Width = 5; - } else if (AMDGPU::SReg_160RegClass.contains(Reg)) { - IsSGPR = true; - Width = 5; - } else if (AMDGPU::AReg_160RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 5; - } else if (AMDGPU::VReg_192RegClass.contains(Reg)) { - IsSGPR = false; - Width = 6; - } else if (AMDGPU::SReg_192RegClass.contains(Reg)) { - IsSGPR = true; - Width = 6; - } else if (AMDGPU::AReg_192RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 6; - } else if (AMDGPU::VReg_224RegClass.contains(Reg)) { - IsSGPR = false; - Width = 
7; - } else if (AMDGPU::SReg_224RegClass.contains(Reg)) { - IsSGPR = true; - Width = 7; - } else if (AMDGPU::AReg_224RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 7; - } else if (AMDGPU::SReg_256RegClass.contains(Reg)) { - IsSGPR = true; - Width = 8; - } else if (AMDGPU::VReg_256RegClass.contains(Reg)) { - IsSGPR = false; - Width = 8; - } else if (AMDGPU::AReg_256RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 8; - } else if (AMDGPU::VReg_288RegClass.contains(Reg)) { - IsSGPR = false; - Width = 9; - } else if (AMDGPU::SReg_288RegClass.contains(Reg)) { - IsSGPR = true; - Width = 9; - } else if (AMDGPU::AReg_288RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 9; - } else if (AMDGPU::VReg_320RegClass.contains(Reg)) { - IsSGPR = false; - Width = 10; - } else if (AMDGPU::SReg_320RegClass.contains(Reg)) { - IsSGPR = true; - Width = 10; - } else if (AMDGPU::AReg_320RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 10; - } else if (AMDGPU::VReg_352RegClass.contains(Reg)) { - IsSGPR = false; - Width = 11; - } else if (AMDGPU::SReg_352RegClass.contains(Reg)) { - IsSGPR = true; - Width = 11; - } else if (AMDGPU::AReg_352RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 11; - } else if (AMDGPU::VReg_384RegClass.contains(Reg)) { - IsSGPR = false; - Width = 12; - } else if (AMDGPU::SReg_384RegClass.contains(Reg)) { - IsSGPR = true; - Width = 12; - } else if (AMDGPU::AReg_384RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 12; - } else if (AMDGPU::SReg_512RegClass.contains(Reg)) { - IsSGPR = true; - Width = 16; - } else if (AMDGPU::VReg_512RegClass.contains(Reg)) { - IsSGPR = false; - Width = 16; - } else if (AMDGPU::AReg_512RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 16; - } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) { - IsSGPR = true; - Width = 32; - } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) { - IsSGPR 
= false; - Width = 32; - } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) { - IsSGPR = false; - IsAGPR = true; - Width = 32; - } else { - // We only expect TTMP registers or registers that do not belong to - // any RC. - assert((AMDGPU::TTMP_32RegClass.contains(Reg) || - AMDGPU::TTMP_64RegClass.contains(Reg) || - AMDGPU::TTMP_128RegClass.contains(Reg) || - AMDGPU::TTMP_256RegClass.contains(Reg) || - AMDGPU::TTMP_512RegClass.contains(Reg) || - !TRI.getPhysRegBaseClass(Reg)) && - "Unknown register class"); - } + const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(Reg); + assert((!RC || TRI.isVGPRClass(RC) || TRI.isSGPRClass(RC) || + TRI.isAGPRClass(RC) || AMDGPU::TTMP_32RegClass.contains(Reg) || + AMDGPU::TTMP_64RegClass.contains(Reg) || + AMDGPU::TTMP_128RegClass.contains(Reg) || + AMDGPU::TTMP_256RegClass.contains(Reg) || + AMDGPU::TTMP_512RegClass.contains(Reg)) && + "Unknown register class"); + + if (!RC || !TRI.isVGPRClass(RC)) + continue; + + unsigned Width = divideCeil(TRI.getRegSizeInBits(*RC), 32); unsigned HWReg = TRI.getHWRegIndex(Reg); int MaxUsed = HWReg + Width - 1; - if (IsSGPR) { - MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR; - } else if (IsAGPR) { - MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR; - } else { - MaxVGPR = MaxUsed > MaxVGPR ? 
MaxUsed : MaxVGPR; - } + MaxVGPR = std::max(MaxUsed, MaxVGPR); } if (MI.isCall()) { @@ -492,9 +306,7 @@ AMDGPUResourceUsageAnalysisImpl::analyzeResourceUsage( } } - Info.NumExplicitSGPR = MaxSGPR + 1; Info.NumVGPR = MaxVGPR + 1; - Info.NumAGPR = MaxAGPR + 1; return Info; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 9173041a7bccd..fa2b8db6ba55a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -4052,11 +4052,11 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, return 0; } -unsigned -SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, - const TargetRegisterClass &RC) const { +unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI, + const TargetRegisterClass &RC, + bool IncludeCalls) const { for (MCPhysReg Reg : reverse(RC.getRegisters())) - if (MRI.isPhysRegUsed(Reg)) + if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls)) return getHWRegIndex(Reg) + 1; return 0; } diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 06a7a17b0246b..0008e5f8cf3b4 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -486,9 +486,11 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { unsigned SubReg) const; // \returns a number of registers of a given \p RC used in a function. - // Does not go inside function calls. + // Does not go inside function calls. If \p IncludeCalls is true, it will + // include registers that may be clobbered by calls. unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, - const TargetRegisterClass &RC) const; + const TargetRegisterClass &RC, + bool IncludeCalls = true) const; std::optional getVRegFlagValue(StringRef Name) const override { return Name == "WWM_REG" ? 
AMDGPU::VirtRegFlag::WWM_REG From beec840822867079b829f35cbd4b360aa8971438 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Fri, 18 Jul 2025 15:23:32 +0800 Subject: [PATCH 298/813] [Clang] Ensure correct parameters are in the scope for constraint equivalence checking (#149264) This is another case where untransformed constraint expressions led to inconsistent transforms. We did fix some of those issues by looking at parent scopes, however the parent instantiation scope is not always available because we could also reach here after the parents get instantiated. Fixes #146614 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaConcept.cpp | 5 +++++ .../test/SemaTemplate/concepts-using-decl.cpp | 21 +++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index fcd3887ec7a09..6f55d14fd0aa5 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -808,6 +808,7 @@ Bug Fixes in This Version nested scopes. (#GH147495) - Fixed a failed assertion with an operator call expression which comes from a macro expansion when performing analysis for nullability attributes. (#GH138371) +- Fixed a concept equivalent checking crash due to untransformed constraint expressions. 
(#GH146614) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 834417f8e15ac..5205ca0bca6fa 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -925,7 +925,12 @@ static const Expr *SubstituteConstraintExpressionWithoutSatisfaction( ND && ND->isFunctionOrFunctionTemplate()) { ScopeForParameters.emplace(S, /*CombineWithOuterScope=*/true); const FunctionDecl *FD = ND->getAsFunction(); + if (FunctionTemplateDecl *Template = FD->getDescribedFunctionTemplate(); + Template && Template->getInstantiatedFromMemberTemplate()) + FD = Template->getInstantiatedFromMemberTemplate()->getTemplatedDecl(); for (auto *PVD : FD->parameters()) { + if (ScopeForParameters->getInstantiationOfIfExists(PVD)) + continue; if (!PVD->isParameterPack()) { ScopeForParameters->InstantiatedLocal(PVD, PVD); continue; diff --git a/clang/test/SemaTemplate/concepts-using-decl.cpp b/clang/test/SemaTemplate/concepts-using-decl.cpp index fca69dea5c88f..41f7b6d2f8faa 100644 --- a/clang/test/SemaTemplate/concepts-using-decl.cpp +++ b/clang/test/SemaTemplate/concepts-using-decl.cpp @@ -176,3 +176,24 @@ void func() { f.foo<10, 10>(); // expected-error {{no matching member function for call to 'foo'}} } } // namespace heads_without_concepts. + +namespace GH146614 { + +template +struct base { + template + void foo(A x) + requires (requires{x;}) + {} +}; + + +struct child : base { + using base::foo; + template + void foo(A x) + requires (false) + {} +}; + +} From 2a1869b9815c1f59af9eae91a3ef7e7d78e8f4f2 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Fri, 18 Jul 2025 08:25:10 +0100 Subject: [PATCH 299/813] [DebugInfo] Shave even more users of DbgVariableIntrinsic from LLVM (#149136) At this stage I'm just opportunistically deleting any code using debug-intrinsic types, largely adjacent to calls to findDbgUsers. I'll get to deleting that in probably one or more two commits. 
--- llvm/include/llvm/IR/DebugInfo.h | 5 +- llvm/include/llvm/IR/DebugInfoMetadata.h | 3 - llvm/lib/IR/DebugInfo.cpp | 25 +++---- llvm/lib/IR/DebugInfoMetadata.cpp | 9 --- .../InstCombine/InstCombineInternal.h | 3 - .../InstCombine/InstructionCombining.cpp | 67 +---------------- .../Scalar/ConstraintElimination.cpp | 1 + llvm/lib/Transforms/Utils/CodeExtractor.cpp | 8 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 3 +- llvm/lib/Transforms/Utils/Local.cpp | 73 ++----------------- .../Utils/PromoteMemoryToRegister.cpp | 49 ++++--------- .../Transforms/Utils/DebugifyTest.cpp | 9 +-- llvm/unittests/Transforms/Utils/LocalTest.cpp | 56 -------------- 13 files changed, 41 insertions(+), 270 deletions(-) diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h index 77cee875f16e7..f8241a3cdf160 100644 --- a/llvm/include/llvm/IR/DebugInfo.h +++ b/llvm/include/llvm/IR/DebugInfo.h @@ -115,8 +115,7 @@ class DebugInfoFinder { LLVM_ABI void processVariable(DILocalVariable *DVI); /// Process debug info location. LLVM_ABI void processLocation(const Module &M, const DILocation *Loc); - /// Process a DbgRecord (e.g, treat a DbgVariableRecord like a - /// DbgVariableIntrinsic). + /// Process a DbgRecord. LLVM_ABI void processDbgRecord(const Module &M, const DbgRecord &DR); /// Process subprogram. 
@@ -290,8 +289,6 @@ struct VarRecord { DILocalVariable *Var; DILocation *DL; - VarRecord(DbgVariableIntrinsic *DVI) - : Var(DVI->getVariable()), DL(getDebugValueLoc(DVI)) {} VarRecord(DbgVariableRecord *DVR) : Var(DVR->getVariable()), DL(getDebugValueLoc(DVR)) {} VarRecord(DILocalVariable *Var, DILocation *DL) : Var(Var), DL(DL) {} diff --git a/llvm/include/llvm/IR/DebugInfoMetadata.h b/llvm/include/llvm/IR/DebugInfoMetadata.h index 9345f95015301..f1f0c18949c35 100644 --- a/llvm/include/llvm/IR/DebugInfoMetadata.h +++ b/llvm/include/llvm/IR/DebugInfoMetadata.h @@ -66,7 +66,6 @@ namespace dwarf { enum Tag : uint16_t; } -class DbgVariableIntrinsic; class DbgVariableRecord; LLVM_ABI extern cl::opt EnableFSDiscriminator; @@ -4613,7 +4612,6 @@ class DebugVariable { LLVM_ABI static const FragmentInfo DefaultFragment; public: - LLVM_ABI DebugVariable(const DbgVariableIntrinsic *DII); LLVM_ABI DebugVariable(const DbgVariableRecord *DVR); DebugVariable(const DILocalVariable *Var, @@ -4681,7 +4679,6 @@ template <> struct DenseMapInfo { /// information). class DebugVariableAggregate : public DebugVariable { public: - LLVM_ABI DebugVariableAggregate(const DbgVariableIntrinsic *DVI); DebugVariableAggregate(const DebugVariable &V) : DebugVariable(V.getVariable(), std::nullopt, V.getInlinedAt()) {} }; diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 84a56058de834..8fb33c30e5cac 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2288,39 +2288,36 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { // Collect a map of {backing storage : dbg.declares} (currently "backing // storage" is limited to Allocas). We'll use this to find dbg.declares to // delete after running `trackAssignments`. - DenseMap> DbgDeclares; DenseMap> DVRDeclares; // Create another similar map of {storage : variables} that we'll pass to // trackAssignments. 
StorageToVarsMap Vars; - auto ProcessDeclare = [&](auto *Declare, auto &DeclareList) { + auto ProcessDeclare = [&](DbgVariableRecord &Declare) { // FIXME: trackAssignments doesn't let you specify any modifiers to the // variable (e.g. fragment) or location (e.g. offset), so we have to // leave dbg.declares with non-empty expressions in place. - if (Declare->getExpression()->getNumElements() != 0) + if (Declare.getExpression()->getNumElements() != 0) return; - if (!Declare->getAddress()) + if (!Declare.getAddress()) return; if (AllocaInst *Alloca = - dyn_cast(Declare->getAddress()->stripPointerCasts())) { + dyn_cast(Declare.getAddress()->stripPointerCasts())) { // FIXME: Skip VLAs for now (let these variables use dbg.declares). if (!Alloca->isStaticAlloca()) return; // Similarly, skip scalable vectors (use dbg.declares instead). if (auto Sz = Alloca->getAllocationSize(*DL); Sz && Sz->isScalable()) return; - DeclareList[Alloca].insert(Declare); - Vars[Alloca].insert(VarRecord(Declare)); + DVRDeclares[Alloca].insert(&Declare); + Vars[Alloca].insert(VarRecord(&Declare)); } }; for (auto &BB : F) { for (auto &I : BB) { for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { if (DVR.isDbgDeclare()) - ProcessDeclare(&DVR, DVRDeclares); + ProcessDeclare(DVR); } - if (DbgDeclareInst *DDI = dyn_cast(&I)) - ProcessDeclare(DDI, DbgDeclares); } } @@ -2336,8 +2333,8 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { trackAssignments(F.begin(), F.end(), Vars, *DL); // Delete dbg.declares for variables now tracked with assignment tracking. - auto DeleteSubsumedDeclare = [&](const auto &Markers, auto &Declares) { - (void)Markers; + for (auto &[Insts, Declares] : DVRDeclares) { + auto Markers = at::getDVRAssignmentMarkers(Insts); for (auto *Declare : Declares) { // Assert that the alloca that Declare uses is now linked to a dbg.assign // describing the same variable (i.e. 
check that this dbg.declare has @@ -2356,10 +2353,6 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { Changed = true; } }; - for (auto &P : DbgDeclares) - DeleteSubsumedDeclare(at::getAssignmentMarkers(P.first), P.second); - for (auto &P : DVRDeclares) - DeleteSubsumedDeclare(at::getDVRAssignmentMarkers(P.first), P.second); return Changed; } diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp index 2270923bd3719..f16963dce56e1 100644 --- a/llvm/lib/IR/DebugInfoMetadata.cpp +++ b/llvm/lib/IR/DebugInfoMetadata.cpp @@ -49,20 +49,11 @@ uint32_t DIType::getAlignInBits() const { const DIExpression::FragmentInfo DebugVariable::DefaultFragment = { std::numeric_limits::max(), std::numeric_limits::min()}; -DebugVariable::DebugVariable(const DbgVariableIntrinsic *DII) - : Variable(DII->getVariable()), - Fragment(DII->getExpression()->getFragmentInfo()), - InlinedAt(DII->getDebugLoc().getInlinedAt()) {} - DebugVariable::DebugVariable(const DbgVariableRecord *DVR) : Variable(DVR->getVariable()), Fragment(DVR->getExpression()->getFragmentInfo()), InlinedAt(DVR->getDebugLoc().getInlinedAt()) {} -DebugVariableAggregate::DebugVariableAggregate(const DbgVariableIntrinsic *DVI) - : DebugVariable(DVI->getVariable(), std::nullopt, - DVI->getDebugLoc()->getInlinedAt()) {} - DILocation::DILocation(LLVMContext &C, StorageType Storage, unsigned Line, unsigned Column, uint64_t AtomGroup, uint8_t AtomRank, ArrayRef MDs, bool ImplicitCode) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 9d7c025ccff86..f7fbf0815df03 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -825,9 +825,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned); bool tryToSinkInstruction(Instruction *I, BasicBlock *DestBlock); - void 
tryToSinkInstructionDbgValues( - Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock, - BasicBlock *DestBlock, SmallVectorImpl &DbgUsers); void tryToSinkInstructionDbgVariableRecords( Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock, BasicBlock *DestBlock, SmallVectorImpl &DPUsers); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 684b9a1f90161..a8bfd8c072d2f 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -3575,6 +3575,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { std::unique_ptr DIB; if (isa(MI)) { findDbgUsers(DVIs, &MI, &DVRs); + assert(DVIs.empty()); DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false)); } @@ -5253,8 +5254,7 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I, SmallVector DbgUsers; SmallVector DbgVariableRecords; findDbgUsers(DbgUsers, I, &DbgVariableRecords); - if (!DbgUsers.empty()) - tryToSinkInstructionDbgValues(I, InsertPos, SrcBlock, DestBlock, DbgUsers); + assert(DbgUsers.empty()); if (!DbgVariableRecords.empty()) tryToSinkInstructionDbgVariableRecords(I, InsertPos, SrcBlock, DestBlock, DbgVariableRecords); @@ -5271,71 +5271,12 @@ bool InstCombinerImpl::tryToSinkInstruction(Instruction *I, return true; } -void InstCombinerImpl::tryToSinkInstructionDbgValues( - Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock, - BasicBlock *DestBlock, SmallVectorImpl &DbgUsers) { - // For all debug values in the destination block, the sunk instruction - // will still be available, so they do not need to be dropped. 
- SmallVector DbgUsersToSalvage; - for (auto &DbgUser : DbgUsers) - if (DbgUser->getParent() != DestBlock) - DbgUsersToSalvage.push_back(DbgUser); - - // Process the sinking DbgUsersToSalvage in reverse order, as we only want - // to clone the last appearing debug intrinsic for each given variable. - SmallVector DbgUsersToSink; - for (DbgVariableIntrinsic *DVI : DbgUsersToSalvage) - if (DVI->getParent() == SrcBlock) - DbgUsersToSink.push_back(DVI); - llvm::sort(DbgUsersToSink, - [](auto *A, auto *B) { return B->comesBefore(A); }); - - SmallVector DIIClones; - SmallSet SunkVariables; - for (auto *User : DbgUsersToSink) { - // A dbg.declare instruction should not be cloned, since there can only be - // one per variable fragment. It should be left in the original place - // because the sunk instruction is not an alloca (otherwise we could not be - // here). - if (isa(User)) - continue; - - DebugVariable DbgUserVariable = - DebugVariable(User->getVariable(), User->getExpression(), - User->getDebugLoc()->getInlinedAt()); - - if (!SunkVariables.insert(DbgUserVariable).second) - continue; - - // Leave dbg.assign intrinsics in their original positions and there should - // be no need to insert a clone. - if (isa(User)) - continue; - - DIIClones.emplace_back(cast(User->clone())); - if (isa(User) && isa(I)) - DIIClones.back()->replaceVariableLocationOp(I, I->getOperand(0)); - LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n'); - } - - // Perform salvaging without the clones, then sink the clones. - if (!DIIClones.empty()) { - salvageDebugInfoForDbgValues(*I, DbgUsersToSalvage, {}); - // The clones are in reverse order of original appearance, reverse again to - // maintain the original order. 
- for (auto &DIIClone : llvm::reverse(DIIClones)) { - DIIClone->insertBefore(InsertPos); - LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n'); - } - } -} - void InstCombinerImpl::tryToSinkInstructionDbgVariableRecords( Instruction *I, BasicBlock::iterator InsertPos, BasicBlock *SrcBlock, BasicBlock *DestBlock, SmallVectorImpl &DbgVariableRecords) { - // Implementation of tryToSinkInstructionDbgValues, but for the - // DbgVariableRecord of variable assignments rather than dbg.values. + // For all debug values in the destination block, the sunk instruction + // will still be available, so they do not need to be dropped. // Fetch all DbgVariableRecords not already in the destination. SmallVector DbgVariableRecordsToSalvage; diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 2786d81773ed9..df3160233c510 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1489,6 +1489,7 @@ static bool checkAndReplaceCondition( SmallVector DbgUsers; SmallVector DVRUsers; findDbgUsers(DbgUsers, Cmp, &DVRUsers); + assert(DbgUsers.empty()); for (auto *DVR : DVRUsers) { auto *DTN = DT.getNode(DVR->getParent()); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index eacaf42e4e8ba..1d1af42153325 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1222,9 +1222,7 @@ static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) { SmallVector DbgUsers; SmallVector DbgVariableRecords; findDbgUsers(DbgUsers, &I, &DbgVariableRecords); - for (DbgVariableIntrinsic *DVI : DbgUsers) - if (DVI->getFunction() != &F) - DVI->eraseFromParent(); + assert(DbgUsers.empty()); for (DbgVariableRecord *DVR : DbgVariableRecords) if (DVR->getFunction() != &F) DVR->eraseFromParent(); @@ -1289,14 +1287,12 @@ static void fixupDebugInfoPostExtraction(Function 
&OldFunc, Function &NewFunc, SmallVector DbgUsers; SmallVector DPUsers; findDbgUsers(DbgUsers, Input, &DPUsers); + assert(DbgUsers.empty()); DIExpression *Expr = DIB.createExpression(); // Iterate the debud users of the Input values. If they are in the extracted // function then update their location with the new value. If they are in // the parent function then create a similar debug record. - for (auto *DVI : DbgUsers) - UpdateOrInsertDebugRecord(DVI, Input, NewVal, Expr, - isa(DVI)); for (auto *DVR : DPUsers) UpdateOrInsertDebugRecord(DVR, Input, NewVal, Expr, DVR->isDbgDeclare()); } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 6929d14bc56ea..ed3dca2f7c307 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1978,14 +1978,13 @@ static at::StorageToVarsMap collectEscapedLocals(const DataLayout &DL, continue; // Find all local variables associated with the backing storage. - auto CollectAssignsForStorage = [&](auto *DbgAssign) { + auto CollectAssignsForStorage = [&](DbgVariableRecord *DbgAssign) { // Skip variables from inlined functions - they are not local variables. 
if (DbgAssign->getDebugLoc().getInlinedAt()) return; LLVM_DEBUG(errs() << " > DEF : " << *DbgAssign << "\n"); EscapedLocals[Base].insert(at::VarRecord(DbgAssign)); }; - for_each(at::getAssignmentMarkers(Base), CollectAssignsForStorage); for_each(at::getDVRAssignmentMarkers(Base), CollectAssignsForStorage); } return EscapedLocals; diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index b14bbeac97675..ee3e56c3c6db9 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -613,11 +613,10 @@ bool llvm::replaceDbgUsesWithUndef(Instruction *I) { SmallVector DbgUsers; SmallVector DPUsers; findDbgUsers(DbgUsers, I, &DPUsers); - for (auto *DII : DbgUsers) - DII->setKillLocation(); + assert(DbgUsers.empty()); for (auto *DVR : DPUsers) DVR->setKillLocation(); - return !DbgUsers.empty() || !DPUsers.empty(); + return !DPUsers.empty(); } /// areAllUsesEqual - Check whether the uses of a value are all the same. @@ -2022,6 +2021,7 @@ void llvm::salvageDebugInfo(Instruction &I) { SmallVector DbgUsers; SmallVector DPUsers; findDbgUsers(DbgUsers, &I, &DPUsers); + assert(DbgUsers.empty()); salvageDebugInfoForDbgValues(I, DbgUsers, DPUsers); } @@ -2070,66 +2070,9 @@ void llvm::salvageDebugInfoForDbgValues( const unsigned MaxExpressionSize = 128; bool Salvaged = false; - for (auto *DII : DbgUsers) { - if (auto *DAI = dyn_cast(DII)) { - if (DAI->getAddress() == &I) { - salvageDbgAssignAddress(DAI); - Salvaged = true; - } - if (DAI->getValue() != &I) - continue; - } - - // Do not add DW_OP_stack_value for DbgDeclare, because they are implicitly - // pointing out the value as a DWARF memory location description. 
- bool StackValue = isa(DII); - auto DIILocation = DII->location_ops(); - assert( - is_contained(DIILocation, &I) && - "DbgVariableIntrinsic must use salvaged instruction as its location"); - SmallVector AdditionalValues; - // `I` may appear more than once in DII's location ops, and each use of `I` - // must be updated in the DIExpression and potentially have additional - // values added; thus we call salvageDebugInfoImpl for each `I` instance in - // DIILocation. - Value *Op0 = nullptr; - DIExpression *SalvagedExpr = DII->getExpression(); - auto LocItr = find(DIILocation, &I); - while (SalvagedExpr && LocItr != DIILocation.end()) { - SmallVector Ops; - unsigned LocNo = std::distance(DIILocation.begin(), LocItr); - uint64_t CurrentLocOps = SalvagedExpr->getNumLocationOperands(); - Op0 = salvageDebugInfoImpl(I, CurrentLocOps, Ops, AdditionalValues); - if (!Op0) - break; - SalvagedExpr = - DIExpression::appendOpsToArg(SalvagedExpr, Ops, LocNo, StackValue); - LocItr = std::find(++LocItr, DIILocation.end(), &I); - } - // salvageDebugInfoImpl should fail on examining the first element of - // DbgUsers, or none of them. - if (!Op0) - break; + // We should never see debug intrinsics nowadays. + assert(DbgUsers.empty()); - SalvagedExpr = SalvagedExpr->foldConstantMath(); - DII->replaceVariableLocationOp(&I, Op0); - bool IsValidSalvageExpr = SalvagedExpr->getNumElements() <= MaxExpressionSize; - if (AdditionalValues.empty() && IsValidSalvageExpr) { - DII->setExpression(SalvagedExpr); - } else if (isa(DII) && IsValidSalvageExpr && - DII->getNumVariableLocationOps() + AdditionalValues.size() <= - MaxDebugArgs) { - DII->addVariableLocationOps(AdditionalValues, SalvagedExpr); - } else { - // Do not salvage using DIArgList for dbg.declare, as it is not currently - // supported in those instructions. Also do not salvage if the resulting - // DIArgList would contain an unreasonably large number of values. 
- DII->setKillLocation(); - } - LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n'); - Salvaged = true; - } - // Duplicate of above block for DbgVariableRecords. for (auto *DVR : DPUsers) { if (DVR->isDbgAssign()) { if (DVR->getAddress() == &I) { @@ -2198,9 +2141,6 @@ void llvm::salvageDebugInfoForDbgValues( if (Salvaged) return; - for (auto *DII : DbgUsers) - DII->setKillLocation(); - for (auto *DVR : DPUsers) DVR->setKillLocation(); } @@ -3429,8 +3369,7 @@ void llvm::dropDebugUsers(Instruction &I) { SmallVector DbgUsers; SmallVector DPUsers; findDbgUsers(DbgUsers, &I, &DPUsers); - for (auto *DII : DbgUsers) - DII->eraseFromParent(); + assert(DbgUsers.empty()); for (auto *DVR : DPUsers) DVR->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index ccd7ee360e014..73b5f48796b7a 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -190,7 +190,6 @@ class AssignmentTrackingInfo { }; struct AllocaInfo { - using DbgUserVec = SmallVector; using DPUserVec = SmallVector; SmallVector DefiningBlocks; @@ -201,7 +200,6 @@ struct AllocaInfo { bool OnlyUsedInOneBlock; /// Debug users of the alloca - does not include dbg.assign intrinsics. - DbgUserVec DbgUsers; DPUserVec DPUsers; /// Helper to update assignment tracking debug info. 
AssignmentTrackingInfo AssignmentTracking; @@ -212,7 +210,6 @@ struct AllocaInfo { OnlyStore = nullptr; OnlyBlock = nullptr; OnlyUsedInOneBlock = true; - DbgUsers.clear(); DPUsers.clear(); AssignmentTracking.clear(); } @@ -246,13 +243,10 @@ struct AllocaInfo { OnlyUsedInOneBlock = false; } } - DbgUserVec AllDbgUsers; + SmallVector AllDbgUsers; SmallVector AllDPUsers; findDbgUsers(AllDbgUsers, AI, &AllDPUsers); - std::copy_if(AllDbgUsers.begin(), AllDbgUsers.end(), - std::back_inserter(DbgUsers), [](DbgVariableIntrinsic *DII) { - return !isa(DII); - }); + assert(AllDbgUsers.empty()); std::copy_if(AllDPUsers.begin(), AllDPUsers.end(), std::back_inserter(DPUsers), [](DbgVariableRecord *DVR) { return !DVR->isDbgAssign(); }); @@ -380,10 +374,9 @@ struct PromoteMem2Reg { /// to. DenseMap PhiToAllocaMap; - /// For each alloca, we keep track of the dbg.declare intrinsic that + /// For each alloca, we keep track of the dbg.declare record that /// describes it, if any, so that we can convert it to a dbg.value - /// intrinsic if the alloca gets promoted. - SmallVector AllocaDbgUsers; + /// record if the alloca gets promoted. SmallVector AllocaDPUsers; /// For each alloca, keep an instance of a helper class that gives us an easy @@ -741,14 +734,11 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, AI->eraseFromParent(); // The alloca's debuginfo can be removed as well. 
- auto DbgUpdateForAlloca = [&](auto &Container) { - for (auto *DbgItem : Container) - if (DbgItem->isAddressOfVariable() || - DbgItem->getExpression()->startsWithDeref()) - DbgItem->eraseFromParent(); - }; - DbgUpdateForAlloca(Info.DbgUsers); - DbgUpdateForAlloca(Info.DPUsers); + for (DbgVariableRecord *DbgItem : Info.DPUsers) { + if (DbgItem->isAddressOfVariable() || + DbgItem->getExpression()->startsWithDeref()) + DbgItem->eraseFromParent(); + } ++NumLocalPromoted; return true; @@ -757,7 +747,6 @@ promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, void PromoteMem2Reg::run() { Function &F = *DT.getRoot()->getParent(); - AllocaDbgUsers.resize(Allocas.size()); AllocaATInfo.resize(Allocas.size()); AllocaDPUsers.resize(Allocas.size()); @@ -816,9 +805,7 @@ void PromoteMem2Reg::run() { if (BBNumPreds.empty()) BBNumPreds.resize(F.getMaxBlockNumber()); - // Remember the dbg.declare intrinsic describing this alloca, if any. - if (!Info.DbgUsers.empty()) - AllocaDbgUsers[AllocaNum] = Info.DbgUsers; + // Remember the dbg.declare record describing this alloca, if any. if (!Info.AssignmentTracking.empty()) AllocaATInfo[AllocaNum] = Info.AssignmentTracking; if (!Info.DPUsers.empty()) @@ -894,16 +881,12 @@ void PromoteMem2Reg::run() { } // Remove alloca's dbg.declare intrinsics from the function. - auto RemoveDbgDeclares = [&](auto &Container) { - for (auto &DbgUsers : Container) { - for (auto *DbgItem : DbgUsers) - if (DbgItem->isAddressOfVariable() || - DbgItem->getExpression()->startsWithDeref()) - DbgItem->eraseFromParent(); - } - }; - RemoveDbgDeclares(AllocaDbgUsers); - RemoveDbgDeclares(AllocaDPUsers); + for (auto &DbgUsers : AllocaDPUsers) { + for (DbgVariableRecord *DbgItem : DbgUsers) + if (DbgItem->isAddressOfVariable() || + DbgItem->getExpression()->startsWithDeref()) + DbgItem->eraseFromParent(); + } // Loop over all of the PHI nodes and see if there are any that we can get // rid of because they merge all of the same incoming values. 
This can diff --git a/llvm/unittests/Transforms/Utils/DebugifyTest.cpp b/llvm/unittests/Transforms/Utils/DebugifyTest.cpp index 0b00734fc4d75..1daf381ee2862 100644 --- a/llvm/unittests/Transforms/Utils/DebugifyTest.cpp +++ b/llvm/unittests/Transforms/Utils/DebugifyTest.cpp @@ -54,20 +54,13 @@ struct DebugInfoDrop : public FunctionPass { struct DebugValueDrop : public FunctionPass { static char ID; bool runOnFunction(Function &F) override { - SmallVector Dbgs; for (BasicBlock &BB : F) { - // Remove dbg var intrinsics. for (Instruction &I : BB) { - if (auto *DVI = dyn_cast(&I)) - Dbgs.push_back(DVI); - // If there are any non-intrinsic records (DbgRecords), drop those too. + // If there are any debug records, drop them. I.dropDbgRecords(); } } - for (auto &I : Dbgs) - I->eraseFromParent(); - return true; } diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index b922216ef8893..dd2a6249c7cf9 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -633,62 +633,6 @@ TEST(Local, ChangeToUnreachable) { EXPECT_EQ(DLA, DLB); } -TEST(Local, FindDbgUsers) { - LLVMContext Ctx; - std::unique_ptr M = parseIR(Ctx, - R"( - define dso_local void @fun(ptr %a) #0 !dbg !11 { - entry: - #dbg_assign(ptr %a, !16, !DIExpression(), !15, ptr %a, !DIExpression(), !19) - ret void - } - - !llvm.dbg.cu = !{!0} - !llvm.module.flags = !{!2, !3, !9} - !llvm.ident = !{!10} - - !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 17.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) - !1 = !DIFile(filename: "test.cpp", directory: "/") - !2 = !{i32 7, !"Dwarf Version", i32 5} - !3 = !{i32 2, !"Debug Info Version", i32 3} - !4 = !{i32 1, !"wchar_size", i32 4} - !9 = !{i32 7, !"debug-info-assignment-tracking", i1 true} - !10 = !{!"clang version 17.0.0"} - !11 = distinct 
!DISubprogram(name: "fun", linkageName: "fun", scope: !1, file: !1, line: 1, type: !12, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !14) - !12 = !DISubroutineType(types: !13) - !13 = !{null} - !14 = !{} - !15 = distinct !DIAssignID() - !16 = !DILocalVariable(name: "x", scope: !11, file: !1, line: 2, type: !17) - !17 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !18, size: 64) - !18 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) - !19 = !DILocation(line: 0, scope: !11) - )"); - - bool BrokenDebugInfo = true; - verifyModule(*M, &errs(), &BrokenDebugInfo); - ASSERT_FALSE(BrokenDebugInfo); - - // Convert to debug intrinsics as we want to test findDbgUsers and - // findDbgValue's debug-intrinsic-finding code here. - // TODO: Remove this test when debug intrinsics are removed. - M->convertFromNewDbgValues(); - - Function &Fun = *cast(M->getNamedValue("fun")); - Value *Arg = Fun.getArg(0); - SmallVector Users; - // Arg (%a) is used twice by a single dbg.assign. Check findDbgUsers returns - // only 1 pointer to it rather than 2. - findDbgUsers(Users, Arg); - EXPECT_EQ(Users.size(), 1u); - - SmallVector Vals; - // Arg (%a) is used twice by a single dbg.assign. Check findDbgValues returns - // only 1 pointer to it rather than 2. - findDbgValues(Vals, Arg); - EXPECT_EQ(Vals.size(), 1u); -} - TEST(Local, FindDbgRecords) { // DbgRecord copy of the FindDbgUsers test above. LLVMContext Ctx; From 74c396afb26dec74c0b799e218c63f1a26e90d21 Mon Sep 17 00:00:00 2001 From: clubby789 Date: Fri, 18 Jul 2025 08:30:23 +0100 Subject: [PATCH 300/813] [DSE] Remove `uninitialized` from `allockind` when creating dummy zeroed variant function (#149336) cc https://github.com/llvm/llvm-project/pull/138299 rustc sets `allockind("uninitialized")` - if we copy the attributes as-is when creating a dummy function, Verify complains about `allockind("uninitialized,zeroed")` conflicting, so we need to clear the flag. 
Co-authored-by: Jamie Hill-Daniel --- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 1 + llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 85dd9a1bf7161..0f63ed0166cf4 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -2079,6 +2079,7 @@ struct DSEState { AllocFnKind AllocKind = Attrs.getFnAttr(Attribute::AllocKind).getAllocKind() | AllocFnKind::Zeroed; + AllocKind &= ~AllocFnKind::Uninitialized; Attrs = Attrs.addFnAttribute(Ctx, Attribute::getWithAllocKind(Ctx, AllocKind)) .removeFnAttribute(Ctx, "alloc-variant-zeroed"); diff --git a/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll b/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll index e390d4bdca632..303afc207c023 100644 --- a/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll +++ b/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll @@ -12,6 +12,6 @@ define ptr @undeclared_customalloc(i64 %size, i64 %align) { ret ptr %call } -declare ptr @customalloc2(i64, i64) allockind("alloc") "alloc-family"="customalloc2" "alloc-variant-zeroed"="customalloc2_zeroed" +declare ptr @customalloc2(i64, i64) allockind("alloc,uninitialized") "alloc-family"="customalloc2" "alloc-variant-zeroed"="customalloc2_zeroed" ; CHECK-DAG: declare ptr @customalloc2_zeroed(i64, i64) #[[CA2ATTR:[0-9]+]] ; CHECK-DAG: attributes #[[CA2ATTR]] = { allockind("alloc,zeroed") "alloc-family"="customalloc2" } From 3f991f5067bd45064af4afb0594ab5d614e357df Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 18 Jul 2025 09:30:42 +0200 Subject: [PATCH 301/813] [clang][bytecode][NFC] Remove unused includes (#149460) --- clang/lib/AST/ByteCode/InterpBlock.h | 5 ----- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 1 - 2 files changed, 6 deletions(-) diff 
--git a/clang/lib/AST/ByteCode/InterpBlock.h b/clang/lib/AST/ByteCode/InterpBlock.h index 7798b6f886a85..51622238e275c 100644 --- a/clang/lib/AST/ByteCode/InterpBlock.h +++ b/clang/lib/AST/ByteCode/InterpBlock.h @@ -14,11 +14,6 @@ #define LLVM_CLANG_AST_INTERP_BLOCK_H #include "Descriptor.h" -#include "clang/AST/ComparisonCategories.h" -#include "clang/AST/Decl.h" -#include "clang/AST/DeclCXX.h" -#include "clang/AST/Expr.h" -#include "llvm/ADT/PointerUnion.h" #include "llvm/Support/raw_ostream.h" namespace clang { diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index de0b97fd93c76..9ce1e380bff2c 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "../ExprConstShared.h" #include "Boolean.h" -#include "Compiler.h" #include "EvalEmitter.h" #include "Interp.h" #include "InterpBuiltinBitCast.h" From 1e7446fe45f2de473fe180a065733f68ced8e653 Mon Sep 17 00:00:00 2001 From: Trevor Gross Date: Fri, 18 Jul 2025 02:31:28 -0500 Subject: [PATCH 302/813] [X86] Correct an assertion message (NFC) (#149386) I introduced this in a78a0f8d2043 ("[X86] Align f128 and i128 to 16 bytes"). Correct the message here. 
--- llvm/lib/Target/X86/X86CallingConv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp index 82e8ce4e0bd7c..5d5a705893242 100644 --- a/llvm/lib/Target/X86/X86CallingConv.cpp +++ b/llvm/lib/Target/X86/X86CallingConv.cpp @@ -389,7 +389,7 @@ static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT, if (!ArgFlags.isInConsecutiveRegsLast()) return true; - assert(PendingMembers.size() == 4 && "Should have two parts"); + assert(PendingMembers.size() == 4 && "Should have four parts"); int64_t Offset = State.AllocateStack(16, Align(16)); PendingMembers[0].convertToMem(Offset); From cda28e203d8f396af65cd4e19c62cfaa58480280 Mon Sep 17 00:00:00 2001 From: "Oleksandr T." Date: Fri, 18 Jul 2025 10:34:15 +0300 Subject: [PATCH 303/813] [analyzer] Support parenthesized list initialization (CXXParenListInitExpr) (#148988) This patch addresses the lack of support for parenthesized initialization in the Clang Static Analyzer's `ExprEngine`. Previously, initializations such as `V v(1, 2);` were not modeled properly, which could lead to false negatives in analyses like `DivideZero`. 
```cpp struct A { int x; A(int v) : x(v) {} }; int t() { A a(42); return 1 / (a.x - 42); // expected-warning {{Division by zero}} } ``` Fixes #148875 --- clang/docs/ReleaseNotes.rst | 2 + .../Core/PathSensitive/ExprEngine.h | 7 ++- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 46 +++++++++++++- clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp | 48 --------------- clang/test/Analysis/div-zero-cxx20.cpp | 61 +++++++++++++++++++ clang/test/Analysis/div-zero.cpp | 60 ++++++++++++++++++ 6 files changed, 170 insertions(+), 54 deletions(-) create mode 100644 clang/test/Analysis/div-zero-cxx20.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6f55d14fd0aa5..ea16029268dba 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1207,6 +1207,8 @@ Static Analyzer --------------- - Fixed a crash when C++20 parenthesized initializer lists are used. This issue was causing a crash in clang-tidy. (#GH136041) +- The Clang Static Analyzer now handles parenthesized initialization. + (#GH148875) New features ^^^^^^^^^^^^ diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h index 6370586e218ef..fbb34340a5c67 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h @@ -499,9 +499,6 @@ class ExprEngine { void VisitGuardedExpr(const Expr *Ex, const Expr *L, const Expr *R, ExplodedNode *Pred, ExplodedNodeSet &Dst); - void VisitInitListExpr(const InitListExpr *E, ExplodedNode *Pred, - ExplodedNodeSet &Dst); - /// VisitAttributedStmt - Transfer function logic for AttributedStmt. 
void VisitAttributedStmt(const AttributedStmt *A, ExplodedNode *Pred, ExplodedNodeSet &Dst); @@ -591,6 +588,10 @@ class ExprEngine { ExplodedNode *Pred, ExplodedNodeSet &Dst); + void ConstructInitList(const Expr *Source, ArrayRef Args, + bool IsTransparent, ExplodedNode *Pred, + ExplodedNodeSet &Dst); + /// evalEagerlyAssumeBifurcation - Given the nodes in 'Src', eagerly assume /// concrete boolean values for 'Ex', storing the resulting nodes in 'Dst'. void evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index c77ef26da568d..d87484470f8b5 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1941,7 +1941,6 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::ConceptSpecializationExprClass: case Stmt::CXXRewrittenBinaryOperatorClass: case Stmt::RequiresExprClass: - case Expr::CXXParenListInitExprClass: case Stmt::EmbedExprClass: // Fall through. 
@@ -2315,11 +2314,22 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, break; } - case Stmt::InitListExprClass: + case Stmt::InitListExprClass: { + const InitListExpr *E = cast(S); Bldr.takeNodes(Pred); - VisitInitListExpr(cast(S), Pred, Dst); + ConstructInitList(E, E->inits(), E->isTransparent(), Pred, Dst); Bldr.addNodes(Dst); break; + } + + case Expr::CXXParenListInitExprClass: { + const CXXParenListInitExpr *E = cast(S); + Bldr.takeNodes(Pred); + ConstructInitList(E, E->getInitExprs(), /*IsTransparent*/ false, Pred, + Dst); + Bldr.addNodes(Dst); + break; + } case Stmt::MemberExprClass: Bldr.takeNodes(Pred); @@ -4114,3 +4124,33 @@ void *ProgramStateTrait::GDMIndex() { } void ExprEngine::anchor() { } + +void ExprEngine::ConstructInitList(const Expr *E, ArrayRef Args, + bool IsTransparent, ExplodedNode *Pred, + ExplodedNodeSet &Dst) { + assert((isa(E))); + + const LocationContext *LC = Pred->getLocationContext(); + + StmtNodeBuilder B(Pred, Dst, *currBldrCtx); + ProgramStateRef S = Pred->getState(); + QualType T = E->getType().getCanonicalType(); + + bool IsCompound = T->isArrayType() || T->isRecordType() || + T->isAnyComplexType() || T->isVectorType(); + + if (Args.size() > 1 || (E->isPRValue() && IsCompound && !IsTransparent)) { + llvm::ImmutableList ArgList = getBasicVals().getEmptySValList(); + for (Expr *E : llvm::reverse(Args)) + ArgList = getBasicVals().prependSVal(S->getSVal(E, LC), ArgList); + + B.generateNode(E, Pred, + S->BindExpr(E, LC, svalBuilder.makeCompoundVal(T, ArgList))); + } else { + B.generateNode(E, Pred, + S->BindExpr(E, LC, + Args.size() == 0 + ? 
getSValBuilder().makeZeroVal(T) + : S->getSVal(Args.front(), LC))); + } +} diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp index fa8e669b6bb2f..f1a25a750dd0d 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp @@ -771,54 +771,6 @@ void ExprEngine::VisitLogicalExpr(const BinaryOperator* B, ExplodedNode *Pred, Bldr.generateNode(B, Pred, state->BindExpr(B, Pred->getLocationContext(), X)); } -void ExprEngine::VisitInitListExpr(const InitListExpr *IE, - ExplodedNode *Pred, - ExplodedNodeSet &Dst) { - StmtNodeBuilder B(Pred, Dst, *currBldrCtx); - - ProgramStateRef state = Pred->getState(); - const LocationContext *LCtx = Pred->getLocationContext(); - QualType T = getContext().getCanonicalType(IE->getType()); - unsigned NumInitElements = IE->getNumInits(); - - if (!IE->isGLValue() && !IE->isTransparent() && - (T->isArrayType() || T->isRecordType() || T->isVectorType() || - T->isAnyComplexType())) { - llvm::ImmutableList vals = getBasicVals().getEmptySValList(); - - // Handle base case where the initializer has no elements. - // e.g: static int* myArray[] = {}; - if (NumInitElements == 0) { - SVal V = svalBuilder.makeCompoundVal(T, vals); - B.generateNode(IE, Pred, state->BindExpr(IE, LCtx, V)); - return; - } - - for (const Stmt *S : llvm::reverse(*IE)) { - SVal V = state->getSVal(cast(S), LCtx); - vals = getBasicVals().prependSVal(V, vals); - } - - B.generateNode(IE, Pred, - state->BindExpr(IE, LCtx, - svalBuilder.makeCompoundVal(T, vals))); - return; - } - - // Handle scalars: int{5} and int{} and GLvalues. - // Note, if the InitListExpr is a GLvalue, it means that there is an address - // representing it, so it must have a single init element. 
- assert(NumInitElements <= 1); - - SVal V; - if (NumInitElements == 0) - V = getSValBuilder().makeZeroVal(T); - else - V = state->getSVal(IE->getInit(0), LCtx); - - B.generateNode(IE, Pred, state->BindExpr(IE, LCtx, V)); -} - void ExprEngine::VisitGuardedExpr(const Expr *Ex, const Expr *L, const Expr *R, diff --git a/clang/test/Analysis/div-zero-cxx20.cpp b/clang/test/Analysis/div-zero-cxx20.cpp new file mode 100644 index 0000000000000..00ea96e796777 --- /dev/null +++ b/clang/test/Analysis/div-zero-cxx20.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core.DivideZero -std=c++20 -verify %s + +namespace GH148875 { +struct A { + int x; + A(int v) : x(v) {} +}; + +struct B { + int x; + B() : x(0) {} +}; + +struct C { + int x, y; + C(int a, int b) : x(a), y(b) {} +}; + +struct D { + int x; +}; + +struct E { + D d; + E(int a) : d(a) {} +}; + +struct F { + int x; +}; + +int t1() { + A a{42}; + return 1 / (a.x - 42); // expected-warning {{Division by zero}} +} + +int t2() { + B b{}; + return 1 / b.x; // expected-warning {{Division by zero}} +} + +int t3() { + C c1{1, -1}; + return 1 / (c1.x + c1.y); // expected-warning {{Division by zero}} +} + +int t4() { + C c2{0, 0}; + return 1 / (c2.x + c2.y); // expected-warning {{Division by zero}} +} + +int t5() { + E e{32}; + return 1 / (e.d.x - 32); // expected-warning {{Division by zero}} +} + +int t6() { + F f(32); + return 1 / (f.x - 32); // expected-warning {{Division by zero}} +} +} // namespace GH148875 diff --git a/clang/test/Analysis/div-zero.cpp b/clang/test/Analysis/div-zero.cpp index 063450d8883b0..51ea25e828a18 100644 --- a/clang/test/Analysis/div-zero.cpp +++ b/clang/test/Analysis/div-zero.cpp @@ -11,3 +11,63 @@ int fooPR10616 (int qX ) { return (a % (qX-1)); // expected-warning {{Division by zero}} } + +namespace GH148875 { +struct A { + int x; + A(int v) : x(v) {} +}; + +struct B { + int x; + B() : x(0) {} +}; + +struct C { + int x, y; + C(int a, int b) : x(a), y(b) {} +}; + +struct D { + int x; 
+}; + +struct E { + D d; + E(int a) : d{a} {} +}; + +struct F { + int x; +}; + +int t1() { + A a{42}; + return 1 / (a.x - 42); // expected-warning {{Division by zero}} +} + +int t2() { + B b{}; + return 1 / b.x; // expected-warning {{Division by zero}} +} + +int t3() { + C c1{1, -1}; + return 1 / (c1.x + c1.y); // expected-warning {{Division by zero}} +} + +int t4() { + C c2{0, 0}; + return 1 / (c2.x + c2.y); // expected-warning {{Division by zero}} +} + +int t5() { + E e{32}; + return 1 / (e.d.x - 32); // expected-warning {{Division by zero}} +} + +int t6() { + F f{32}; + return 1 / (f.x - 32); // expected-warning {{Division by zero}} +} +} From daa6de37bac9e547d37a3c5f2c9a51559679a7ed Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Fri, 18 Jul 2025 10:00:54 +0200 Subject: [PATCH 304/813] [AMDGPU][SDAG] Add target-specific ISD::PTRADD combines (#143673) This patch adds several (AMDGPU-)target-specific DAG combines for ISD::PTRADD nodes that reproduce existing similar transforms for ISD::ADD nodes. There is no functional change intended for the existing target-specific PTRADD combine. For SWDEV-516125. 
--- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 146 +++++++++++++---- .../AMDGPU/ptradd-sdag-optimizations.ll | 151 ++++++------------ 3 files changed, 167 insertions(+), 134 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 56c8bb441ddf8..5453828177c72 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6751,7 +6751,9 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT, return SDValue(); int64_t Offset = C2->getSExtValue(); switch (Opcode) { - case ISD::ADD: break; + case ISD::ADD: + case ISD::PTRADD: + break; case ISD::SUB: Offset = -uint64_t(Offset); break; default: return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 400795c29b0e4..0c76ff2ec5ea7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/SDPatternMatch.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" @@ -46,6 +47,7 @@ #include using namespace llvm; +using namespace llvm::SDPatternMatch; #define DEBUG_TYPE "si-lower" @@ -14561,7 +14563,7 @@ static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, // instead of a tree. SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, DAGCombinerInfo &DCI) const { - assert(N->getOpcode() == ISD::ADD); + assert(N->isAnyAdd()); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); @@ -14594,7 +14596,7 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. 
We prefer MUL + ADD + ADDC over MAD + MUL. - if (User->getOpcode() != ISD::ADD) + if (!User->isAnyAdd()) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer @@ -14706,8 +14708,11 @@ SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N, SDValue Hi = getHiHalf64(LHS, DAG); SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32); + unsigned Opcode = N->getOpcode(); + if (Opcode == ISD::PTRADD) + Opcode = ISD::ADD; SDValue AddHi = - DAG.getNode(N->getOpcode(), SL, MVT::i32, Hi, ConstHi32, N->getFlags()); + DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags()); SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS); return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi); @@ -15181,42 +15186,123 @@ SDValue SITargetLowering::performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); + EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N1.getOpcode() == ISD::ADD) { - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, - // y is not, and (add y, z) is used only once. - // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, - // z is not, and (add y, z) is used only once. - // The goal is to move constant offsets to the outermost ptradd, to create - // more opportunities to fold offsets into memory instructions. - // Together with the generic combines in DAGCombiner.cpp, this also - // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). - // - // This transform is here instead of in the general DAGCombiner as it can - // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for - // AArch64's CPA. 
- SDValue X = N0; - SDValue Y = N1.getOperand(0); - SDValue Z = N1.getOperand(1); - if (N1.hasOneUse()) { - bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); - bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); - if (ZIsConstant != YIsConstant) { - // If both additions in the original were NUW, the new ones are as well. - SDNodeFlags Flags = - (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; - if (YIsConstant) - std::swap(Y, Z); + // The following folds transform PTRADDs into regular arithmetic in cases + // where the PTRADD wouldn't be folded as an immediate offset into memory + // instructions anyway. They are target-specific in that other targets might + // prefer to not lose information about the pointer arithmetic. + + // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)). + // Adapted from DAGCombiner::visitADDLikeCommutative. + SDValue V, K; + if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) { + SDNodeFlags ShlFlags = N1->getFlags(); + // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0, + // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be + // preserved. + SDNodeFlags NewShlFlags = + ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap() + ? SDNodeFlags::NoSignedWrap + : SDNodeFlags(); + SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getNode(ISD::SUB, DL, VT, N0, Inner); + } + + // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in + // performAddCombine. 
+ if (N1.getOpcode() == ISD::MUL) { + if (Subtarget->hasMad64_32()) { + if (SDValue Folded = tryFoldToMad64_32(N, DCI)) + return Folded; + } + } - SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags); + // If the 32 low bits of the constant are all zero, there is nothing to fold + // into an immediate offset, so it's better to eliminate the unnecessary + // addition for the lower 32 bits than to preserve the PTRADD. + // Analogous to a fold in performAddCombine. + if (VT == MVT::i64) { + if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI)) + return Folded; + } + + if (N0.getOpcode() == ISD::PTRADD && N1.getOpcode() == ISD::Constant) { + // Fold (ptradd (ptradd GA, v), c) -> (ptradd (ptradd GA, c) v) with + // global address GA and constant c, such that c can be folded into GA. + SDValue GAValue = N0.getOperand(0); + if (const GlobalAddressSDNode *GA = + dyn_cast(GAValue)) { + if (DCI.isBeforeLegalizeOps() && isOffsetFoldingLegal(GA)) { + // If both additions in the original were NUW, reassociation preserves + // that. + SDNodeFlags Flags = + (N->getFlags() & N0->getFlags()) & SDNodeFlags::NoUnsignedWrap; + SDValue Inner = DAG.getMemBasePlusOffset(GAValue, N1, DL, Flags); DCI.AddToWorklist(Inner.getNode()); - return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags); + return DAG.getMemBasePlusOffset(Inner, N0.getOperand(1), DL, Flags); } } } + if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse()) + return SDValue(); + + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant, + // y is not, and (add y, z) is used only once. + // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant, + // z is not, and (add y, z) is used only once. + // The goal is to move constant offsets to the outermost ptradd, to create + // more opportunities to fold offsets into memory instructions. + // Together with the generic combines in DAGCombiner.cpp, this also + // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)). 
+ // + // This transform is here instead of in the general DAGCombiner as it can + // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for + // AArch64's CPA. + SDValue X = N0; + SDValue Y = N1.getOperand(0); + SDValue Z = N1.getOperand(1); + bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y); + bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z); + + // If both additions in the original were NUW, reassociation preserves that. + SDNodeFlags ReassocFlags = + (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap; + + if (ZIsConstant != YIsConstant) { + if (YIsConstant) + std::swap(Y, Z); + SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(Inner.getNode()); + return DAG.getMemBasePlusOffset(Inner, Z, DL, ReassocFlags); + } + + // If one of Y and Z is constant, they have been handled above. If both were + // constant, the addition would have been folded in SelectionDAG::getNode + // already. This ensures that the generic DAG combines won't undo the + // following reassociation. + assert(!YIsConstant && !ZIsConstant); + + if (!X->isDivergent() && Y->isDivergent() != Z->isDivergent()) { + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and + // y are uniform and z isn't. + // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and + // z are uniform and y isn't. + // The goal is to push uniform operands up in the computation, so that they + // can be handled with scalar operations. We can't use reassociateScalarOps + // for this since it requires two identical commutative operations to + // reassociate. 
+ if (Y->isDivergent()) + std::swap(Y, Z); + SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags); + DCI.AddToWorklist(UniformInner.getNode()); + return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags); + } + return SDValue(); } diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index 1ec94162951a6..c00bccdbce6b7 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -145,49 +145,29 @@ entry: ; Test skipping the lower-32-bit addition if it is unnecessary. define ptr @huge_offset_low_32_unused(ptr %p) { -; GFX942_PTRADD-LABEL: huge_offset_low_32_unused: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: s_mov_b32 s0, 0 -; GFX942_PTRADD-NEXT: s_mov_b32 s1, 1 -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: huge_offset_low_32_unused: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_add_u32_e32 v1, 1, v1 -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: huge_offset_low_32_unused: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_add_u32_e32 v1, 1, v1 +; GFX942-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr inbounds i8, ptr %p, i64 u0x100000000 ret ptr %gep } ; Reassociate address computation if it leads to more scalar operations. 
define amdgpu_kernel void @reassoc_scalar_r(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { -; GFX942_PTRADD-LABEL: reassoc_scalar_r: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[6:7] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] -; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: reassoc_scalar_r: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6 -; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7 -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] -; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: reassoc_scalar_r: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_add_u32 s2, s2, s6 +; GFX942-NEXT: s_addc_u32 s3, s3, s7 +; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942-NEXT: s_endpgm entry: %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() %voffset = zext i32 %voffset32 to i64 @@ -198,30 +178,18 @@ entry: } define amdgpu_kernel void @reassoc_scalar_l(ptr addrspace(1) %out, ptr addrspace(1) %p, i64 %soffset) { -; GFX942_PTRADD-LABEL: 
reassoc_scalar_l: -; GFX942_PTRADD: ; %bb.0: ; %entry -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_PTRADD-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[6:7], 0, v[0:1] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[2:3] -; GFX942_PTRADD-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: reassoc_scalar_l: -; GFX942_LEGACY: ; %bb.0: ; %entry -; GFX942_LEGACY-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, 0 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_add_u32 s2, s2, s6 -; GFX942_LEGACY-NEXT: s_addc_u32 s3, s3, s7 -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] -; GFX942_LEGACY-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: reassoc_scalar_l: +; GFX942: ; %bb.0: ; %entry +; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX942-NEXT: v_mov_b32_e32 v1, 0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_add_u32 s2, s2, s6 +; GFX942-NEXT: s_addc_u32 s3, s3, s7 +; GFX942-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX942-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] +; GFX942-NEXT: s_endpgm entry: %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() %voffset = zext i32 %voffset32 to i64 @@ -233,24 +201,14 @@ entry: ; Tests the target-specific (ptradd x, shl(0 - y, k)) -> sub(x, shl(y, k)) fold define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %shift) { -; GFX942_PTRADD-LABEL: shl_neg_offset: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_sub_co_u32_e32 v2, vcc, 0, v2 -; GFX942_PTRADD-NEXT: s_nop 1 -; GFX942_PTRADD-NEXT: v_subb_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: shl_neg_offset: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] -; GFX942_LEGACY-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 -; GFX942_LEGACY-NEXT: s_nop 1 -; GFX942_LEGACY-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: shl_neg_offset: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX942-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 +; GFX942-NEXT: s_nop 1 +; GFX942-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc +; GFX942-NEXT: s_setpc_b64 s[30:31] %offset = sub i64 0, %noffset %x = shl i64 %offset, %shift %gep = getelementptr inbounds i8, ptr addrspace(1) %p, i64 %x @@ -268,10 +226,9 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) { ; GFX942_PTRADD: ; %bb.0: ; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1] -; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+4 -; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+12 +; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 +; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 ; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 10 ; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] ; ; GFX942_LEGACY-LABEL: complextype_global_gep: @@ -291,27 +248,15 @@ define ptr addrspace(1) @complextype_global_gep(i64 %offset) { ; Tests the tryFoldToMad64_32 PTRADD combine. 
define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { -; GFX942_PTRADD-LABEL: fold_mad64: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v1, 12, v0 -; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v0, 12, v0 -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] -; GFX942_PTRADD-NEXT: global_store_dword v[0:1], v2, off -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: fold_mad64: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 1.0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] -; GFX942_LEGACY-NEXT: global_store_dword v[0:1], v2, off -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: fold_mad64: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX942-NEXT: v_mov_b32_e32 v2, 1.0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1] +; GFX942-NEXT: global_store_dword v[0:1], v2, off +; GFX942-NEXT: s_endpgm %voffset32 = call i32 @llvm.amdgcn.workitem.id.x() %voffset = zext i32 %voffset32 to i64 %p1 = getelementptr inbounds %S, ptr addrspace(1) %p, i64 %voffset, i32 0 From efedd49a22832f8b0981a084c503cdcdf4ed8e65 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Fri, 18 Jul 2025 10:04:51 +0200 Subject: [PATCH 305/813] [AMDGPU][SDAG] Test ISD::PTRADD handling in VOP3 patterns (#143880) Pre-committing tests to show improvements in a follow-up PR. 
--- .../AMDGPU/ptradd-sdag-optimizations.ll | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index c00bccdbce6b7..d48bfe0bb7f21 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -263,3 +263,48 @@ define amdgpu_kernel void @fold_mad64(ptr addrspace(1) %p) { store float 1.0, ptr addrspace(1) %p1 ret void } + +; Use non-zero shift amounts in v_lshl_add_u64. +define ptr @select_v_lshl_add_u64(ptr %base, i64 %voffset) { +; GFX942_PTRADD-LABEL: select_v_lshl_add_u64: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT: v_lshlrev_b64 v[2:3], 3, v[2:3] +; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: select_v_lshl_add_u64: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 3, v[0:1] +; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr inbounds i64, ptr %base, i64 %voffset + ret ptr %gep +} + +; Fold mul and add into v_mad, even if amdgpu-codegenprepare-mul24 turned the +; mul into a mul24. 
+define ptr @fold_mul24_into_mad(ptr %base, i64 %a, i64 %b) { +; GFX942_PTRADD-LABEL: fold_mul24_into_mad: +; GFX942_PTRADD: ; %bb.0: +; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_PTRADD-NEXT: v_and_b32_e32 v2, 0xfffff, v2 +; GFX942_PTRADD-NEXT: v_and_b32_e32 v4, 0xfffff, v4 +; GFX942_PTRADD-NEXT: v_mul_hi_u32_u24_e32 v3, v2, v4 +; GFX942_PTRADD-NEXT: v_mul_u32_u24_e32 v2, v2, v4 +; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] +; +; GFX942_LEGACY-LABEL: fold_mul24_into_mad: +; GFX942_LEGACY: ; %bb.0: +; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942_LEGACY-NEXT: v_and_b32_e32 v2, 0xfffff, v2 +; GFX942_LEGACY-NEXT: v_and_b32_e32 v3, 0xfffff, v4 +; GFX942_LEGACY-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v3, v[0:1] +; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] + %a_masked = and i64 %a, u0xfffff + %b_masked = and i64 %b, u0xfffff + %mul = mul i64 %a_masked, %b_masked + %gep = getelementptr inbounds i8, ptr %base, i64 %mul + ret ptr %gep +} From 64a0478e08829ec6bcae2b05e154aa58c2c46ac0 Mon Sep 17 00:00:00 2001 From: tangaac Date: Fri, 18 Jul 2025 16:12:11 +0800 Subject: [PATCH 306/813] [LoongArch] Strengthen stack size estimation for LSX/LASX extension (#146455) This patch adds an emergency spill slot when ran out of registers. PR #139201 introduces `vstelm` instructions with only 8-bit imm offset, it causes no spill slot to store the spill registers. 
--- .../LoongArch/LoongArchFrameLowering.cpp | 7 +- .../CodeGen/LoongArch/calling-conv-common.ll | 48 +-- .../CodeGen/LoongArch/calling-conv-half.ll | 16 +- .../LoongArch/can-not-realign-stack.ll | 44 +-- .../CodeGen/LoongArch/emergency-spill-slot.ll | 4 +- llvm/test/CodeGen/LoongArch/frame.ll | 107 ++++++- .../CodeGen/LoongArch/intrinsic-memcpy.ll | 8 +- llvm/test/CodeGen/LoongArch/lasx/fpowi.ll | 88 +++--- .../lasx/ir-instruction/extractelement.ll | 120 ++++---- .../lasx/ir-instruction/insertelement.ll | 132 ++++---- llvm/test/CodeGen/LoongArch/llvm.sincos.ll | 150 ++++----- llvm/test/CodeGen/LoongArch/lsx/pr146455.ll | 287 ++++++++++++++++++ ...realignment-with-variable-sized-objects.ll | 24 +- .../CodeGen/LoongArch/stack-realignment.ll | 80 ++--- .../LoongArch/unaligned-memcpy-inline.ll | 14 +- llvm/test/CodeGen/LoongArch/vararg.ll | 70 ++--- 16 files changed, 783 insertions(+), 416 deletions(-) create mode 100644 llvm/test/CodeGen/LoongArch/lsx/pr146455.ll diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index ac5e7f3891c72..1493bf4cba695 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -158,7 +158,12 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized( // estimateStackSize has been observed to under-estimate the final stack // size, so give ourselves wiggle-room by checking for stack size // representable an 11-bit signed field rather than 12-bits. - if (!isInt<11>(MFI.estimateStackSize(MF))) + // For [x]vstelm.{b/h/w/d} memory instructions with 8 imm offset, 7-bit + // signed field is fine. + unsigned EstimateStackSize = MFI.estimateStackSize(MF); + if (!isInt<11>(EstimateStackSize) || + (MF.getSubtarget().hasExtLSX() && + !isInt<7>(EstimateStackSize))) ScavSlotsNum = std::max(ScavSlotsNum, 1u); // For CFR spill. 
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll index d07e2914c753a..f7653af1fa9ba 100644 --- a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll +++ b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll @@ -122,23 +122,23 @@ define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind { define i64 @caller_large_scalars() nounwind { ; CHECK-LABEL: caller_large_scalars: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -80 -; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill -; CHECK-NEXT: st.d $zero, $sp, 24 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $zero, $sp, 40 ; CHECK-NEXT: vrepli.b $vr0, 0 -; CHECK-NEXT: vst $vr0, $sp, 8 +; CHECK-NEXT: vst $vr0, $sp, 24 ; CHECK-NEXT: ori $a0, $zero, 2 -; CHECK-NEXT: st.d $a0, $sp, 0 -; CHECK-NEXT: st.d $zero, $sp, 56 -; CHECK-NEXT: vst $vr0, $sp, 40 +; CHECK-NEXT: st.d $a0, $sp, 16 +; CHECK-NEXT: st.d $zero, $sp, 72 +; CHECK-NEXT: vst $vr0, $sp, 56 ; CHECK-NEXT: ori $a2, $zero, 1 -; CHECK-NEXT: addi.d $a0, $sp, 32 -; CHECK-NEXT: addi.d $a1, $sp, 0 -; CHECK-NEXT: st.d $a2, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 48 +; CHECK-NEXT: addi.d $a1, $sp, 16 +; CHECK-NEXT: st.d $a2, $sp, 48 ; CHECK-NEXT: pcaddu18i $ra, %call36(callee_large_scalars) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 80 +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %1 = call i64 @callee_large_scalars(i256 1, i256 2) ret i64 %1 @@ -177,20 +177,20 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d, define i64 @caller_large_scalars_exhausted_regs() nounwind { ; CHECK-LABEL: caller_large_scalars_exhausted_regs: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -96 -; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $a0, $sp, 16 +; 
CHECK-NEXT: addi.d $sp, $sp, -112 +; CHECK-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: st.d $a0, $sp, 8 ; CHECK-NEXT: ori $a0, $zero, 9 ; CHECK-NEXT: st.d $a0, $sp, 0 -; CHECK-NEXT: st.d $zero, $sp, 40 +; CHECK-NEXT: st.d $zero, $sp, 56 ; CHECK-NEXT: vrepli.b $vr0, 0 -; CHECK-NEXT: vst $vr0, $sp, 24 +; CHECK-NEXT: vst $vr0, $sp, 40 ; CHECK-NEXT: ori $a0, $zero, 10 -; CHECK-NEXT: st.d $a0, $sp, 16 -; CHECK-NEXT: st.d $zero, $sp, 72 +; CHECK-NEXT: st.d $a0, $sp, 32 +; CHECK-NEXT: st.d $zero, $sp, 88 ; CHECK-NEXT: ori $a0, $zero, 8 -; CHECK-NEXT: st.d $a0, $sp, 48 +; CHECK-NEXT: st.d $a0, $sp, 64 ; CHECK-NEXT: ori $a0, $zero, 1 ; CHECK-NEXT: ori $a1, $zero, 2 ; CHECK-NEXT: ori $a2, $zero, 3 @@ -198,12 +198,12 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind { ; CHECK-NEXT: ori $a4, $zero, 5 ; CHECK-NEXT: ori $a5, $zero, 6 ; CHECK-NEXT: ori $a6, $zero, 7 -; CHECK-NEXT: addi.d $a7, $sp, 48 -; CHECK-NEXT: vst $vr0, $sp, 56 +; CHECK-NEXT: addi.d $a7, $sp, 64 +; CHECK-NEXT: vst $vr0, $sp, 72 ; CHECK-NEXT: pcaddu18i $ra, %call36(callee_large_scalars_exhausted_regs) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 112 ; CHECK-NEXT: ret %1 = call i64 @callee_large_scalars_exhausted_regs( i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i256 8, i64 9, diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll index c88b67f13d1e7..da8c3e93f6842 100644 --- a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll +++ b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll @@ -1252,8 +1252,8 @@ define i32 @caller_half_on_stack() nounwind { ; ; LA64F-LP64S-LABEL: caller_half_on_stack: ; LA64F-LP64S: # %bb.0: -; LA64F-LP64S-NEXT: addi.d $sp, $sp, -80 -; LA64F-LP64S-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; 
LA64F-LP64S-NEXT: addi.d $sp, $sp, -96 +; LA64F-LP64S-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill ; LA64F-LP64S-NEXT: lu12i.w $a0, -12 ; LA64F-LP64S-NEXT: ori $a1, $a0, 3200 ; LA64F-LP64S-NEXT: lu32i.d $a1, 0 @@ -1292,8 +1292,8 @@ define i32 @caller_half_on_stack() nounwind { ; LA64F-LP64S-NEXT: st.w $t0, $sp, 0 ; LA64F-LP64S-NEXT: pcaddu18i $ra, %call36(callee_half_on_stack) ; LA64F-LP64S-NEXT: jirl $ra, $ra, 0 -; LA64F-LP64S-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; LA64F-LP64S-NEXT: addi.d $sp, $sp, 80 +; LA64F-LP64S-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; LA64F-LP64S-NEXT: addi.d $sp, $sp, 96 ; LA64F-LP64S-NEXT: ret ; ; LA64F-LP64D-LABEL: caller_half_on_stack: @@ -1336,8 +1336,8 @@ define i32 @caller_half_on_stack() nounwind { ; ; LA64D-LP64S-LABEL: caller_half_on_stack: ; LA64D-LP64S: # %bb.0: -; LA64D-LP64S-NEXT: addi.d $sp, $sp, -80 -; LA64D-LP64S-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64D-LP64S-NEXT: addi.d $sp, $sp, -96 +; LA64D-LP64S-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill ; LA64D-LP64S-NEXT: lu12i.w $a0, -12 ; LA64D-LP64S-NEXT: ori $a1, $a0, 3200 ; LA64D-LP64S-NEXT: lu32i.d $a1, 0 @@ -1376,8 +1376,8 @@ define i32 @caller_half_on_stack() nounwind { ; LA64D-LP64S-NEXT: st.w $t0, $sp, 0 ; LA64D-LP64S-NEXT: pcaddu18i $ra, %call36(callee_half_on_stack) ; LA64D-LP64S-NEXT: jirl $ra, $ra, 0 -; LA64D-LP64S-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; LA64D-LP64S-NEXT: addi.d $sp, $sp, 80 +; LA64D-LP64S-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; LA64D-LP64S-NEXT: addi.d $sp, $sp, 96 ; LA64D-LP64S-NEXT: ret ; ; LA64D-LP64D-LABEL: caller_half_on_stack: diff --git a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll index 52d8dd05aaa4c..1a9de3b0ef3d1 100644 --- a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll +++ b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll @@ -14,41 +14,41 @@ define dso_local noundef signext i32 @main() nounwind { ; 
CHECK-LABEL: main: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -272 -; CHECK-NEXT: st.d $ra, $sp, 264 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $sp, $sp, -288 +; CHECK-NEXT: st.d $ra, $sp, 280 # 8-byte Folded Spill ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) ; CHECK-NEXT: xvld $xr0, $a0, %pc_lo12(.LCPI0_0) -; CHECK-NEXT: xvst $xr0, $sp, 96 # 32-byte Folded Spill +; CHECK-NEXT: xvst $xr0, $sp, 112 # 32-byte Folded Spill ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) ; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI0_1) -; CHECK-NEXT: xvst $xr1, $sp, 64 # 32-byte Folded Spill +; CHECK-NEXT: xvst $xr1, $sp, 80 # 32-byte Folded Spill ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_2) ; CHECK-NEXT: xvld $xr2, $a0, %pc_lo12(.LCPI0_2) -; CHECK-NEXT: xvst $xr2, $sp, 32 # 32-byte Folded Spill +; CHECK-NEXT: xvst $xr2, $sp, 48 # 32-byte Folded Spill ; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_3) ; CHECK-NEXT: xvld $xr3, $a0, %pc_lo12(.LCPI0_3) -; CHECK-NEXT: xvst $xr3, $sp, 0 # 32-byte Folded Spill -; CHECK-NEXT: xvst $xr0, $sp, 136 -; CHECK-NEXT: xvst $xr1, $sp, 168 -; CHECK-NEXT: xvst $xr2, $sp, 200 -; CHECK-NEXT: xvst $xr3, $sp, 232 -; CHECK-NEXT: addi.d $a0, $sp, 136 +; CHECK-NEXT: xvst $xr3, $sp, 16 # 32-byte Folded Spill +; CHECK-NEXT: xvst $xr0, $sp, 152 +; CHECK-NEXT: xvst $xr1, $sp, 184 +; CHECK-NEXT: xvst $xr2, $sp, 216 +; CHECK-NEXT: xvst $xr3, $sp, 248 +; CHECK-NEXT: addi.d $a0, $sp, 152 ; CHECK-NEXT: pcaddu18i $ra, %call36(foo) ; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: xvld $xr0, $sp, 96 # 32-byte Folded Reload -; CHECK-NEXT: xvst $xr0, $sp, 136 -; CHECK-NEXT: xvld $xr0, $sp, 64 # 32-byte Folded Reload -; CHECK-NEXT: xvst $xr0, $sp, 168 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload -; CHECK-NEXT: xvst $xr0, $sp, 200 -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload -; CHECK-NEXT: xvst $xr0, $sp, 232 -; CHECK-NEXT: addi.d $a0, $sp, 136 +; CHECK-NEXT: xvld $xr0, $sp, 112 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 
152 +; CHECK-NEXT: xvld $xr0, $sp, 80 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 184 +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 216 +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 248 +; CHECK-NEXT: addi.d $a0, $sp, 152 ; CHECK-NEXT: pcaddu18i $ra, %call36(bar) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: move $a0, $zero -; CHECK-NEXT: ld.d $ra, $sp, 264 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 272 +; CHECK-NEXT: ld.d $ra, $sp, 280 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 288 ; CHECK-NEXT: ret entry: %s = alloca %struct.S, align 2 diff --git a/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll b/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll index ccc5c703e71ed..15ac95dfc6c55 100644 --- a/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll +++ b/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll @@ -28,12 +28,12 @@ define void @func() { ; CHECK-NEXT: ld.w $a3, $a1, 0 ; CHECK-NEXT: ld.w $a2, $a1, 0 ; CHECK-NEXT: ld.w $a0, $a1, 0 -; CHECK-NEXT: st.d $fp, $sp, 0 +; CHECK-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill ; CHECK-NEXT: lu12i.w $fp, 1 ; CHECK-NEXT: ori $fp, $fp, 12 ; CHECK-NEXT: add.d $fp, $sp, $fp ; CHECK-NEXT: st.w $t8, $fp, 0 -; CHECK-NEXT: ld.d $fp, $sp, 0 +; CHECK-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload ; CHECK-NEXT: st.w $t8, $a1, 0 ; CHECK-NEXT: st.w $t7, $a1, 0 ; CHECK-NEXT: st.w $t6, $a1, 0 diff --git a/llvm/test/CodeGen/LoongArch/frame.ll b/llvm/test/CodeGen/LoongArch/frame.ll index 048703029d8c6..b29d8634854f3 100644 --- a/llvm/test/CodeGen/LoongArch/frame.ll +++ b/llvm/test/CodeGen/LoongArch/frame.ll @@ -1,5 +1,6 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc --mtriple=loongarch64 -mattr=+d < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 -mattr=+d,-lsx < %s | 
FileCheck %s --check-prefixes=CHECK,NOLSX +; RUN: llc --mtriple=loongarch64 -mattr=+d,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LSX %struct.key_t = type { i32, [16 x i8] } @@ -7,20 +8,35 @@ declare void @llvm.memset.p0.i64(ptr, i8, i64, i1) declare void @test1(ptr) define i32 @test() nounwind { -; CHECK-LABEL: test: -; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -32 -; CHECK-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill -; CHECK-NEXT: st.w $zero, $sp, 16 -; CHECK-NEXT: vrepli.b $vr0, 0 -; CHECK-NEXT: vst $vr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 4 -; CHECK-NEXT: pcaddu18i $ra, %call36(test1) -; CHECK-NEXT: jirl $ra, $ra, 0 -; CHECK-NEXT: move $a0, $zero -; CHECK-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 32 -; CHECK-NEXT: ret +; NOLSX-LABEL: test: +; NOLSX: # %bb.0: +; NOLSX-NEXT: addi.d $sp, $sp, -32 +; NOLSX-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; NOLSX-NEXT: st.w $zero, $sp, 16 +; NOLSX-NEXT: st.d $zero, $sp, 8 +; NOLSX-NEXT: st.d $zero, $sp, 0 +; NOLSX-NEXT: addi.d $a0, $sp, 4 +; NOLSX-NEXT: pcaddu18i $ra, %call36(test1) +; NOLSX-NEXT: jirl $ra, $ra, 0 +; NOLSX-NEXT: move $a0, $zero +; NOLSX-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; NOLSX-NEXT: addi.d $sp, $sp, 32 +; NOLSX-NEXT: ret +; +; LSX-LABEL: test: +; LSX: # %bb.0: +; LSX-NEXT: addi.d $sp, $sp, -32 +; LSX-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LSX-NEXT: st.w $zero, $sp, 16 +; LSX-NEXT: vrepli.b $vr0, 0 +; LSX-NEXT: vst $vr0, $sp, 0 +; LSX-NEXT: addi.d $a0, $sp, 4 +; LSX-NEXT: pcaddu18i $ra, %call36(test1) +; LSX-NEXT: jirl $ra, $ra, 0 +; LSX-NEXT: move $a0, $zero +; LSX-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LSX-NEXT: addi.d $sp, $sp, 32 +; LSX-NEXT: ret %key = alloca %struct.key_t, align 4 call void @llvm.memset.p0.i64(ptr %key, i8 0, i64 20, i1 false) %1 = getelementptr inbounds %struct.key_t, ptr %key, i64 0, i32 1, i64 0 @@ -98,3 +114,62 @@ define void @test_large_frame_size_1234576() "frame-pointer"="all" { %1 = 
alloca i8, i32 1234567 ret void } + +;; Note: will create an emergency spill slot, if (!isInt<7>(StackSize)). +;; Should involve only one SP-adjusting addi per adjustment. +;; LSX 112 + 16(emergency slot) = 128 +define void @test_frame_size_112() { +; NOLSX-LABEL: test_frame_size_112: +; NOLSX: # %bb.0: +; NOLSX-NEXT: addi.d $sp, $sp, -112 +; NOLSX-NEXT: .cfi_def_cfa_offset 112 +; NOLSX-NEXT: addi.d $sp, $sp, 112 +; NOLSX-NEXT: ret +; +; LSX-LABEL: test_frame_size_112: +; LSX: # %bb.0: +; LSX-NEXT: addi.d $sp, $sp, -128 +; LSX-NEXT: .cfi_def_cfa_offset 128 +; LSX-NEXT: addi.d $sp, $sp, 128 +; LSX-NEXT: ret + %1 = alloca i8, i32 112 + ret void +} + +;; LSX 128 + 16(emergency slot) = 144 +define void @test_frame_size_128() { +; NOLSX-LABEL: test_frame_size_128: +; NOLSX: # %bb.0: +; NOLSX-NEXT: addi.d $sp, $sp, -128 +; NOLSX-NEXT: .cfi_def_cfa_offset 128 +; NOLSX-NEXT: addi.d $sp, $sp, 128 +; NOLSX-NEXT: ret +; +; LSX-LABEL: test_frame_size_128: +; LSX: # %bb.0: +; LSX-NEXT: addi.d $sp, $sp, -144 +; LSX-NEXT: .cfi_def_cfa_offset 144 +; LSX-NEXT: addi.d $sp, $sp, 144 +; LSX-NEXT: ret + %1 = alloca i8, i32 128 + ret void +} + +;; LSX 144 + 16(emergency slot) = 160 +define void @test_frame_size_144() { +; NOLSX-LABEL: test_frame_size_144: +; NOLSX: # %bb.0: +; NOLSX-NEXT: addi.d $sp, $sp, -144 +; NOLSX-NEXT: .cfi_def_cfa_offset 144 +; NOLSX-NEXT: addi.d $sp, $sp, 144 +; NOLSX-NEXT: ret +; +; LSX-LABEL: test_frame_size_144: +; LSX: # %bb.0: +; LSX-NEXT: addi.d $sp, $sp, -160 +; LSX-NEXT: .cfi_def_cfa_offset 160 +; LSX-NEXT: addi.d $sp, $sp, 160 +; LSX-NEXT: ret + %1 = alloca i8, i32 144 + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll index 402ddb9ad941b..5a55b253c77bb 100644 --- a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll +++ b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll @@ -6,11 +6,11 @@ define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 dereferenceable(48) 
%b, i64 %i) { ; CHECK-LABEL: box: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -96 -; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: addi.d $sp, $sp, -112 +; CHECK-NEXT: .cfi_def_cfa_offset 112 ; CHECK-NEXT: slli.d $a2, $a1, 5 ; CHECK-NEXT: alsl.d $a1, $a1, $a2, 4 -; CHECK-NEXT: addi.d $a2, $sp, 0 +; CHECK-NEXT: addi.d $a2, $sp, 16 ; CHECK-NEXT: add.d $a3, $a2, $a1 ; CHECK-NEXT: vldx $vr0, $a1, $a2 ; CHECK-NEXT: vld $vr1, $a3, 32 @@ -18,7 +18,7 @@ define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 der ; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: vst $vr1, $a0, 32 ; CHECK-NEXT: vst $vr2, $a0, 16 -; CHECK-NEXT: addi.d $sp, $sp, 96 +; CHECK-NEXT: addi.d $sp, $sp, 112 ; CHECK-NEXT: ret %1 = alloca [2 x %Box], align 16 %2 = getelementptr inbounds [2 x %Box], ptr %1, i64 0, i64 %i diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll index 789b51d9b5e5b..9528280d181a3 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll @@ -6,10 +6,10 @@ declare <8 x float> @llvm.powi.v8f32.i32(<8 x float>, i32) define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { ; CHECK-LABEL: powi_v8f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -80 -; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill -; CHECK-NEXT: xvst $xr0, $sp, 0 # 32-byte Folded Spill +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill ; CHECK-NEXT: addi.w $fp, $a0, 0 ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 0 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 @@ -18,79 +18,79 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind { ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 
32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 1 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 1 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 2 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 2 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 3 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 3 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded 
Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 4 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 4 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 5 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 5 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 6 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 6 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.w $a0, $xr0, 7 ; CHECK-NEXT: movgr2fr.w $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powisf2) ; CHECK-NEXT: jirl $ra, $ra, 0 
; CHECK-NEXT: movfr2gr.s $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.w $xr0, $a0, 7 -; CHECK-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 80 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret entry: %res = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> %va, i32 %b) @@ -102,10 +102,10 @@ declare <4 x double> @llvm.powi.v4f64.i32(<4 x double>, i32) define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind { ; CHECK-LABEL: powi_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi.d $sp, $sp, -80 -; CHECK-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill -; CHECK-NEXT: xvst $xr0, $sp, 0 # 32-byte Folded Spill +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill ; CHECK-NEXT: addi.w $fp, $a0, 0 ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0 ; CHECK-NEXT: movgr2fr.d $fa0, $a0 @@ -114,39 +114,39 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind { ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.d $a0, $fa0 ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1 ; CHECK-NEXT: movgr2fr.d $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.d $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded 
Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 1 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2 ; CHECK-NEXT: movgr2fr.d $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.d $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 2 -; CHECK-NEXT: xvst $xr0, $sp, 32 # 32-byte Folded Spill -; CHECK-NEXT: xvld $xr0, $sp, 0 # 32-byte Folded Reload +; CHECK-NEXT: xvst $xr0, $sp, 48 # 32-byte Folded Spill +; CHECK-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload ; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3 ; CHECK-NEXT: movgr2fr.d $fa0, $a0 ; CHECK-NEXT: move $a0, $fp ; CHECK-NEXT: pcaddu18i $ra, %call36(__powidf2) ; CHECK-NEXT: jirl $ra, $ra, 0 ; CHECK-NEXT: movfr2gr.d $a0, $fa0 -; CHECK-NEXT: xvld $xr0, $sp, 32 # 32-byte Folded Reload +; CHECK-NEXT: xvld $xr0, $sp, 48 # 32-byte Folded Reload ; CHECK-NEXT: xvinsgr2vr.d $xr0, $a0, 3 -; CHECK-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 80 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret entry: %res = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> %va, i32 %b) diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll index 04214f5dfa9d2..2e1618748688a 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll +++ 
b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll @@ -76,21 +76,21 @@ define void @extract_4xdouble(ptr %src, ptr %dst) nounwind { define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_32xi8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 0 ; CHECK-NEXT: ld.b $a0, $a0, 0 ; CHECK-NEXT: st.b $a0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <32 x i8>, ptr %src %e = extractelement <32 x i8> %v, i32 %idx @@ -101,21 +101,21 @@ define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind { define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_16xi16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; 
CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 1 ; CHECK-NEXT: ld.h $a0, $a0, 0 ; CHECK-NEXT: st.h $a0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <16 x i16>, ptr %src %e = extractelement <16 x i16> %v, i32 %idx @@ -126,21 +126,21 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind { define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_8xi32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2 ; CHECK-NEXT: ld.w $a0, $a0, 0 ; CHECK-NEXT: st.w $a0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload 
+; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <8 x i32>, ptr %src %e = extractelement <8 x i32> %v, i32 %idx @@ -151,21 +151,21 @@ define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind { define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_4xi64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3 ; CHECK-NEXT: ld.d $a0, $a0, 0 ; CHECK-NEXT: st.d $a0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <4 x i64>, ptr %src %e = extractelement <4 x i64> %v, i32 %idx @@ -176,21 +176,21 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind { define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_8xfloat_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: 
st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2 ; CHECK-NEXT: fld.s $fa0, $a0, 0 ; CHECK-NEXT: fst.s $fa0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <8 x float>, ptr %src %e = extractelement <8 x float> %v, i32 %idx @@ -201,21 +201,21 @@ define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind { define void @extract_4xdouble_idx(ptr %src, ptr %dst, i32 %idx) nounwind { ; CHECK-LABEL: extract_4xdouble_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3 ; CHECK-NEXT: fld.d $fa0, $a0, 0 ; CHECK-NEXT: fst.d $fa0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d 
$fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <4 x double>, ptr %src %e = extractelement <4 x double> %v, i32 %idx diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll index 3a4f6efd2c893..b24f95e676d10 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll @@ -114,22 +114,22 @@ define void @insert_4xdouble(ptr %src, ptr %dst, double %in) nounwind { define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_32xi8_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a3, 4, 0 ; CHECK-NEXT: st.b $a2, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <32 x i8>, ptr %src %v_new = insertelement <32 x i8> %v, i8 %in, i32 
%idx @@ -140,22 +140,22 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind { define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_16xi16_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a3, 4, 1 ; CHECK-NEXT: st.h $a2, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <16 x i16>, ptr %src %v_new = insertelement <16 x i16> %v, i16 %in, i32 %idx @@ -166,22 +166,22 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind { define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_8xi32_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 
8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a3, 4, 2 ; CHECK-NEXT: st.w $a2, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <8 x i32>, ptr %src %v_new = insertelement <8 x i32> %v, i32 %in, i32 %idx @@ -192,22 +192,22 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind { define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_4xi64_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr0, $a0, 0 -; CHECK-NEXT: xvst $xr0, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr0, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a3, 4, 3 ; CHECK-NEXT: st.d $a2, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d 
$ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <4 x i64>, ptr %src %v_new = insertelement <4 x i64> %v, i64 %in, i32 %idx @@ -218,22 +218,22 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind { define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwind { ; CHECK-LABEL: insert_8xfloat_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: xvst $xr1, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr1, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 2 ; CHECK-NEXT: fst.s $fa0, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <8 x float>, ptr %src %v_new = insertelement <8 x float> %v, float %in, i32 %idx @@ -244,22 +244,22 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, 
i32 %idx) nounwind { ; CHECK-LABEL: insert_4xdouble_idx: ; CHECK: # %bb.0: -; CHECK-NEXT: addi.d $sp, $sp, -64 -; CHECK-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; CHECK-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; CHECK-NEXT: addi.d $fp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $sp, -96 +; CHECK-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $fp, $sp, 96 ; CHECK-NEXT: bstrins.d $sp, $zero, 4, 0 ; CHECK-NEXT: xvld $xr1, $a0, 0 -; CHECK-NEXT: xvst $xr1, $sp, 0 -; CHECK-NEXT: addi.d $a0, $sp, 0 +; CHECK-NEXT: xvst $xr1, $sp, 32 +; CHECK-NEXT: addi.d $a0, $sp, 32 ; CHECK-NEXT: bstrins.d $a0, $a2, 4, 3 ; CHECK-NEXT: fst.d $fa0, $a0, 0 -; CHECK-NEXT: xvld $xr0, $sp, 0 +; CHECK-NEXT: xvld $xr0, $sp, 32 ; CHECK-NEXT: xvst $xr0, $a1, 0 -; CHECK-NEXT: addi.d $sp, $fp, -64 -; CHECK-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; CHECK-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; CHECK-NEXT: addi.d $sp, $sp, 64 +; CHECK-NEXT: addi.d $sp, $fp, -96 +; CHECK-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 96 ; CHECK-NEXT: ret %v = load volatile <4 x double>, ptr %src %v_new = insertelement <4 x double> %v, double %in, i32 %idx diff --git a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll index ffedd7f9e9438..648c19d509715 100644 --- a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll +++ b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll @@ -347,42 +347,42 @@ define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) #0 { ; ; LA64-LABEL: test_sincos_v2f32: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -64 -; LA64-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill ; LA64-NEXT: 
vreplvei.w $vr0, $vr0, 0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sinf) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: vreplvei.w $vr0, $vr0, 1 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sinf) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload ; LA64-NEXT: vpackev.w $vr0, $vr0, $vr1 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cosf) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cosf) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: # kill: def $f0 killed $f0 def $vr0 -; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: vld $vr1, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: vpackev.w $vr1, $vr0, $vr1 -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload 
-; LA64-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 64 +; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 ; LA64-NEXT: ret %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result @@ -439,48 +439,48 @@ define { <3 x float>, <3 x float> } @test_sincos_v3f32(<3 x float> %a) #0 { ; ; LA64-LABEL: test_sincos_v3f32: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -96 -; LA64-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -112 +; LA64-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill ; LA64-NEXT: vreplvei.w $vr0, $vr0, 2 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sinf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 72 -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: fst.s $fa0, $sp, 88 +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: vreplvei.w $vr0, $vr0, 1 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sinf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 68 -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: fst.s $fa0, $sp, 84 +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: vreplvei.w $vr0, $vr0, 0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sinf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: 
fst.s $fa0, $sp, 64 -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: fst.s $fa0, $sp, 80 +; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cosf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 56 -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: fst.s $fa0, $sp, 72 +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cosf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 52 -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: fst.s $fa0, $sp, 68 +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0 killed $f0 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cosf) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: fst.s $fa0, $sp, 48 -; LA64-NEXT: vld $vr0, $sp, 64 -; LA64-NEXT: vld $vr1, $sp, 48 -; LA64-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 96 +; LA64-NEXT: fst.s $fa0, $sp, 64 +; LA64-NEXT: vld $vr0, $sp, 80 +; LA64-NEXT: vld $vr1, $sp, 64 +; LA64-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 112 ; LA64-NEXT: ret %result = call { <3 x float>, <3 x float> } @llvm.sincos.v3f32(<3 x float> %a) ret { <3 x float>, <3 x float> } %result @@ -568,44 +568,44 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) #0 { ; ; LA64-LABEL: test_sincos_v2f64: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -64 -; LA64-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill -; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill ; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill +; LA64-NEXT: vreplvei.d $vr0, $vr0, 0 +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill ; LA64-NEXT: # 
kill: def $f0_64 killed $f0_64 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sin) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: movfr2gr.d $a0, $fa0 ; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: vreplvei.d $vr0, $vr0, 1 -; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill +; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill ; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(sin) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: movfr2gr.d $a0, $fa0 -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload +; LA64-NEXT: vld $vr0, $sp, 48 # 16-byte Folded Reload ; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 1 -; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 48 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cos) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: movfr2gr.d $a0, $fa0 ; LA64-NEXT: vinsgr2vr.d $vr0, $a0, 0 -; LA64-NEXT: vst $vr0, $sp, 16 # 16-byte Folded Spill -; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload +; LA64-NEXT: vst $vr0, $sp, 32 # 16-byte Folded Spill +; LA64-NEXT: vld $vr0, $sp, 16 # 16-byte Folded Reload ; LA64-NEXT: # kill: def $f0_64 killed $f0_64 killed $vr0 ; LA64-NEXT: pcaddu18i $ra, %call36(cos) ; LA64-NEXT: jirl $ra, $ra, 0 ; LA64-NEXT: movfr2gr.d $a0, $fa0 -; LA64-NEXT: vld $vr1, $sp, 16 # 16-byte Folded Reload +; LA64-NEXT: vld $vr1, $sp, 32 # 16-byte Folded Reload ; LA64-NEXT: vinsgr2vr.d $vr1, $a0, 1 -; LA64-NEXT: vld $vr0, $sp, 32 # 16-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 64 +; LA64-NEXT: vld $vr0, $sp, 
48 # 16-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 80 ; LA64-NEXT: ret %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result @@ -801,17 +801,17 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 { ; ; LA64-LABEL: test_sincos_v2f128: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -80 -; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill -; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill -; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill -; LA64-NEXT: st.d $s2, $sp, 40 # 8-byte Folded Spill -; LA64-NEXT: st.d $s3, $sp, 32 # 8-byte Folded Spill -; LA64-NEXT: st.d $s4, $sp, 24 # 8-byte Folded Spill -; LA64-NEXT: st.d $s5, $sp, 16 # 8-byte Folded Spill -; LA64-NEXT: st.d $s6, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: st.d $s7, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -96 +; LA64-NEXT: st.d $ra, $sp, 88 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 80 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 72 # 8-byte Folded Spill +; LA64-NEXT: st.d $s1, $sp, 64 # 8-byte Folded Spill +; LA64-NEXT: st.d $s2, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $s3, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: st.d $s4, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: st.d $s5, $sp, 32 # 8-byte Folded Spill +; LA64-NEXT: st.d $s6, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $s7, $sp, 16 # 8-byte Folded Spill ; LA64-NEXT: ld.d $fp, $a1, 16 ; LA64-NEXT: ld.d $s0, $a1, 24 ; LA64-NEXT: ld.d $s1, $a1, 0 @@ -847,17 +847,17 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 { ; LA64-NEXT: st.d $s6, $s3, 16 ; LA64-NEXT: st.d $s5, $s3, 8 ; LA64-NEXT: st.d $s4, $s3, 0 -; LA64-NEXT: ld.d $s7, $sp, 0 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s6, $sp, 8 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s5, $sp, 16 # 8-byte Folded 
Reload -; LA64-NEXT: ld.d $s4, $sp, 24 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s3, $sp, 32 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s2, $sp, 40 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload -; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload -; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 80 +; LA64-NEXT: ld.d $s7, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s6, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s5, $sp, 32 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s4, $sp, 40 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s3, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s2, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s1, $sp, 64 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 72 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 80 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 88 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 96 ; LA64-NEXT: ret %result = call { <2 x fp128>, <2 x fp128> } @llvm.sincos.v2f128(<2 x fp128> %a) ret { <2 x fp128>, <2 x fp128> } %result diff --git a/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll b/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll new file mode 100644 index 0000000000000..96159e5884d3f --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll @@ -0,0 +1,287 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch64 -mattr=+d,+lsx --verify-machineinstrs < %s | FileCheck %s +define void @eliminate_frame_index(<16 x i8> %a) nounwind { +; CHECK-LABEL: eliminate_frame_index: +; CHECK: # %bb.0: +; CHECK-NEXT: addi.d $sp, $sp, -240 +; CHECK-NEXT: st.d $ra, $sp, 232 # 8-byte Folded Spill +; CHECK-NEXT: st.d $fp, $sp, 224 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s0, $sp, 216 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s1, $sp, 208 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s2, $sp, 200 # 
8-byte Folded Spill +; CHECK-NEXT: st.d $s3, $sp, 192 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s4, $sp, 184 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s5, $sp, 176 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s6, $sp, 168 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s7, $sp, 160 # 8-byte Folded Spill +; CHECK-NEXT: st.d $s8, $sp, 152 # 8-byte Folded Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $zero, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $ra, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $tp, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a0, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a1, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a2, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a3, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a4, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a5, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a6, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $a7, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t0, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t1, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t2, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t3, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t4, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t5, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t6, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t7, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $t8, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $fp, $zero, 1 +; 
CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s0, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s1, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s2, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s3, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s4, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s5, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s6, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s7, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: addi.d $s8, $zero, 1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: st.d $a0, $sp, 0 # 8-byte Folded Spill +; CHECK-NEXT: addi.d $a0, $sp, 136 +; CHECK-NEXT: vstelm.b $vr0, $a0, 0, 0 +; CHECK-NEXT: ld.d $a0, $sp, 0 # 8-byte Folded Reload +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $zero +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $ra +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $tp +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a3 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a4 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a5 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a6 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $a7 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t3 +; CHECK-NEXT: #NO_APP 
+; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t4 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t5 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t6 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t7 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $t8 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $fp +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s0 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s1 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s3 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s4 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s5 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s6 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s7 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: #APP +; CHECK-NEXT: # reg use $s8 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: ld.d $s8, $sp, 152 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s7, $sp, 160 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s6, $sp, 168 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s5, $sp, 176 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s4, $sp, 184 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s3, $sp, 192 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s2, $sp, 200 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s1, $sp, 208 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $s0, $sp, 216 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $fp, $sp, 224 # 8-byte Folded Reload +; CHECK-NEXT: ld.d $ra, $sp, 232 # 8-byte Folded Reload +; CHECK-NEXT: addi.d $sp, $sp, 240 +; CHECK-NEXT: ret + %s = alloca [16 x i8] + %ss = alloca [128 x i8] + + %zero = call i64 asm sideeffect "addi.d $$zero, $$zero, 1", "={r0}"() + %ra = call i64 asm sideeffect "addi.d $$ra, $$zero, 1", "={r1}"() + %tp = call i64 asm sideeffect "addi.d $$tp, 
$$zero, 1", "={r2}"() + %a0 = call i64 asm sideeffect "addi.d $$a0, $$zero, 1", "={r4}"() + %a1 = call i64 asm sideeffect "addi.d $$a1, $$zero, 1", "={r5}"() + %a2 = call i64 asm sideeffect "addi.d $$a2, $$zero, 1", "={r6}"() + %a3 = call i64 asm sideeffect "addi.d $$a3, $$zero, 1", "={r7}"() + %a4 = call i64 asm sideeffect "addi.d $$a4, $$zero, 1", "={r8}"() + %a5 = call i64 asm sideeffect "addi.d $$a5, $$zero, 1", "={r9}"() + %a6 = call i64 asm sideeffect "addi.d $$a6, $$zero, 1", "={r10}"() + %a7 = call i64 asm sideeffect "addi.d $$a7, $$zero, 1", "={r11}"() + %t0 = call i64 asm sideeffect "addi.d $$t0, $$zero, 1", "={r12}"() + %t1 = call i64 asm sideeffect "addi.d $$t1, $$zero, 1", "={r13}"() + %t2 = call i64 asm sideeffect "addi.d $$t2, $$zero, 1", "={r14}"() + %t3 = call i64 asm sideeffect "addi.d $$t3, $$zero, 1", "={r15}"() + %t4 = call i64 asm sideeffect "addi.d $$t4, $$zero, 1", "={r16}"() + %t5 = call i64 asm sideeffect "addi.d $$t5, $$zero, 1", "={r17}"() + %t6 = call i64 asm sideeffect "addi.d $$t6, $$zero, 1", "={r18}"() + %t7 = call i64 asm sideeffect "addi.d $$t7, $$zero, 1", "={r19}"() + %t8 = call i64 asm sideeffect "addi.d $$t8, $$zero, 1", "={r20}"() + ;; r21 Reserved (Non-allocatable) + %s9 = call i64 asm sideeffect "addi.d $$s9, $$zero, 1", "={r22}"() + %s0 = call i64 asm sideeffect "addi.d $$s0, $$zero, 1", "={r23}"() + %s1 = call i64 asm sideeffect "addi.d $$s1, $$zero, 1", "={r24}"() + %s2 = call i64 asm sideeffect "addi.d $$s2, $$zero, 1", "={r25}"() + %s3 = call i64 asm sideeffect "addi.d $$s3, $$zero, 1", "={r26}"() + %s4 = call i64 asm sideeffect "addi.d $$s4, $$zero, 1", "={r27}"() + %s5 = call i64 asm sideeffect "addi.d $$s5, $$zero, 1", "={r28}"() + %s6 = call i64 asm sideeffect "addi.d $$s6, $$zero, 1", "={r29}"() + %s7 = call i64 asm sideeffect "addi.d $$s7, $$zero, 1", "={r30}"() + %s8 = call i64 asm sideeffect "addi.d $$s8, $$zero, 1", "={r31}"() + + %e = extractelement <16 x i8> %a, i64 0 + + store volatile i8 %e, ptr %s + + 
call void asm sideeffect "# reg use $0", "{r0}"(i64 %zero) + call void asm sideeffect "# reg use $0", "{r1}"(i64 %ra) + call void asm sideeffect "# reg use $0", "{r2}"(i64 %tp) + call void asm sideeffect "# reg use $0", "{r4}"(i64 %a0) + call void asm sideeffect "# reg use $0", "{r5}"(i64 %a1) + call void asm sideeffect "# reg use $0", "{r6}"(i64 %a2) + call void asm sideeffect "# reg use $0", "{r7}"(i64 %a3) + call void asm sideeffect "# reg use $0", "{r8}"(i64 %a4) + call void asm sideeffect "# reg use $0", "{r9}"(i64 %a5) + call void asm sideeffect "# reg use $0", "{r10}"(i64 %a6) + call void asm sideeffect "# reg use $0", "{r11}"(i64 %a7) + call void asm sideeffect "# reg use $0", "{r12}"(i64 %t0) + call void asm sideeffect "# reg use $0", "{r13}"(i64 %t1) + call void asm sideeffect "# reg use $0", "{r14}"(i64 %t2) + call void asm sideeffect "# reg use $0", "{r15}"(i64 %t3) + call void asm sideeffect "# reg use $0", "{r16}"(i64 %t4) + call void asm sideeffect "# reg use $0", "{r17}"(i64 %t5) + call void asm sideeffect "# reg use $0", "{r18}"(i64 %t6) + call void asm sideeffect "# reg use $0", "{r19}"(i64 %t7) + call void asm sideeffect "# reg use $0", "{r20}"(i64 %t8) + ;; r21 Reserved (Non-allocatable) + call void asm sideeffect "# reg use $0", "{r22}"(i64 %s9) + call void asm sideeffect "# reg use $0", "{r23}"(i64 %s0) + call void asm sideeffect "# reg use $0", "{r24}"(i64 %s1) + call void asm sideeffect "# reg use $0", "{r25}"(i64 %s2) + call void asm sideeffect "# reg use $0", "{r26}"(i64 %s3) + call void asm sideeffect "# reg use $0", "{r27}"(i64 %s4) + call void asm sideeffect "# reg use $0", "{r28}"(i64 %s5) + call void asm sideeffect "# reg use $0", "{r29}"(i64 %s6) + call void asm sideeffect "# reg use $0", "{r30}"(i64 %s7) + call void asm sideeffect "# reg use $0", "{r31}"(i64 %s8) + + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll 
b/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll index 9f15604fcca6b..69995a0721f8a 100644 --- a/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll +++ b/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll @@ -36,15 +36,15 @@ define void @caller(i32 %n) { ; ; LA64-LABEL: caller: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -64 -; LA64-NEXT: .cfi_def_cfa_offset 64 -; LA64-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill -; LA64-NEXT: st.d $s8, $sp, 40 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -128 +; LA64-NEXT: .cfi_def_cfa_offset 128 +; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill +; LA64-NEXT: st.d $s8, $sp, 104 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 ; LA64-NEXT: .cfi_offset 31, -24 -; LA64-NEXT: addi.d $fp, $sp, 64 +; LA64-NEXT: addi.d $fp, $sp, 128 ; LA64-NEXT: .cfi_def_cfa 22, 0 ; LA64-NEXT: bstrins.d $sp, $zero, 5, 0 ; LA64-NEXT: move $s8, $sp @@ -54,14 +54,14 @@ define void @caller(i32 %n) { ; LA64-NEXT: slli.d $a0, $a0, 4 ; LA64-NEXT: sub.d $a0, $sp, $a0 ; LA64-NEXT: move $sp, $a0 -; LA64-NEXT: addi.d $a1, $s8, 0 +; LA64-NEXT: addi.d $a1, $s8, 64 ; LA64-NEXT: pcaddu18i $ra, %call36(callee) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: addi.d $sp, $fp, -64 -; LA64-NEXT: ld.d $s8, $sp, 40 # 8-byte Folded Reload -; LA64-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 64 +; LA64-NEXT: addi.d $sp, $fp, -128 +; LA64-NEXT: ld.d $s8, $sp, 104 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 128 ; LA64-NEXT: ret %1 = alloca i8, i32 %n %2 = alloca i32, align 64 diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment.ll 
b/llvm/test/CodeGen/LoongArch/stack-realignment.ll index 0645339358b64..0188884543adb 100644 --- a/llvm/test/CodeGen/LoongArch/stack-realignment.ll +++ b/llvm/test/CodeGen/LoongArch/stack-realignment.ll @@ -28,22 +28,22 @@ define void @caller32() { ; ; LA64-LABEL: caller32: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -32 -; LA64-NEXT: .cfi_def_cfa_offset 32 -; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -64 +; LA64-NEXT: .cfi_def_cfa_offset 64 +; LA64-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 -; LA64-NEXT: addi.d $fp, $sp, 32 +; LA64-NEXT: addi.d $fp, $sp, 64 ; LA64-NEXT: .cfi_def_cfa 22, 0 ; LA64-NEXT: bstrins.d $sp, $zero, 4, 0 -; LA64-NEXT: addi.d $a0, $sp, 0 +; LA64-NEXT: addi.d $a0, $sp, 32 ; LA64-NEXT: pcaddu18i $ra, %call36(callee) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: addi.d $sp, $fp, -32 -; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: addi.d $sp, $fp, -64 +; LA64-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 64 ; LA64-NEXT: ret %1 = alloca i8, align 32 call void @callee(ptr %1) @@ -102,22 +102,22 @@ define void @caller64() { ; ; LA64-LABEL: caller64: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -64 -; LA64-NEXT: .cfi_def_cfa_offset 64 -; LA64-NEXT: st.d $ra, $sp, 56 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 48 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -128 +; LA64-NEXT: .cfi_def_cfa_offset 128 +; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 -; LA64-NEXT: addi.d $fp, $sp, 64 +; LA64-NEXT: addi.d 
$fp, $sp, 128 ; LA64-NEXT: .cfi_def_cfa 22, 0 ; LA64-NEXT: bstrins.d $sp, $zero, 5, 0 -; LA64-NEXT: addi.d $a0, $sp, 0 +; LA64-NEXT: addi.d $a0, $sp, 64 ; LA64-NEXT: pcaddu18i $ra, %call36(callee) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: addi.d $sp, $fp, -64 -; LA64-NEXT: ld.d $fp, $sp, 48 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 56 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 64 +; LA64-NEXT: addi.d $sp, $fp, -128 +; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 128 ; LA64-NEXT: ret %1 = alloca i8, align 64 call void @callee(ptr %1) @@ -176,22 +176,22 @@ define void @caller128() { ; ; LA64-LABEL: caller128: ; LA64: # %bb.0: -; LA64-NEXT: addi.d $sp, $sp, -128 -; LA64-NEXT: .cfi_def_cfa_offset 128 -; LA64-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -256 +; LA64-NEXT: .cfi_def_cfa_offset 256 +; LA64-NEXT: st.d $ra, $sp, 248 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 240 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 -; LA64-NEXT: addi.d $fp, $sp, 128 +; LA64-NEXT: addi.d $fp, $sp, 256 ; LA64-NEXT: .cfi_def_cfa 22, 0 ; LA64-NEXT: bstrins.d $sp, $zero, 6, 0 -; LA64-NEXT: addi.d $a0, $sp, 0 +; LA64-NEXT: addi.d $a0, $sp, 128 ; LA64-NEXT: pcaddu18i $ra, %call36(callee) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: addi.d $sp, $fp, -128 -; LA64-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 128 +; LA64-NEXT: addi.d $sp, $fp, -256 +; LA64-NEXT: ld.d $fp, $sp, 240 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 248 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 256 ; LA64-NEXT: ret %1 = alloca i8, align 128 call void @callee(ptr %1) @@ -250,22 +250,22 @@ define void @caller256() { ; ; LA64-LABEL: caller256: ; LA64: # %bb.0: -; 
LA64-NEXT: addi.d $sp, $sp, -256 -; LA64-NEXT: .cfi_def_cfa_offset 256 -; LA64-NEXT: st.d $ra, $sp, 248 # 8-byte Folded Spill -; LA64-NEXT: st.d $fp, $sp, 240 # 8-byte Folded Spill +; LA64-NEXT: addi.d $sp, $sp, -512 +; LA64-NEXT: .cfi_def_cfa_offset 512 +; LA64-NEXT: st.d $ra, $sp, 504 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 496 # 8-byte Folded Spill ; LA64-NEXT: .cfi_offset 1, -8 ; LA64-NEXT: .cfi_offset 22, -16 -; LA64-NEXT: addi.d $fp, $sp, 256 +; LA64-NEXT: addi.d $fp, $sp, 512 ; LA64-NEXT: .cfi_def_cfa 22, 0 ; LA64-NEXT: bstrins.d $sp, $zero, 7, 0 -; LA64-NEXT: addi.d $a0, $sp, 0 +; LA64-NEXT: addi.d $a0, $sp, 256 ; LA64-NEXT: pcaddu18i $ra, %call36(callee) ; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: addi.d $sp, $fp, -256 -; LA64-NEXT: ld.d $fp, $sp, 240 # 8-byte Folded Reload -; LA64-NEXT: ld.d $ra, $sp, 248 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 256 +; LA64-NEXT: addi.d $sp, $fp, -512 +; LA64-NEXT: ld.d $fp, $sp, 496 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 504 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 512 ; LA64-NEXT: ret %1 = alloca i8, align 256 call void @callee(ptr %1) diff --git a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll index 925fdf3d60646..0d441e66a0c84 100644 --- a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll +++ b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll @@ -121,19 +121,19 @@ define void @t3() { ; ; LA64-LABEL: t3: ; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -64 -; LA64-NEXT: .cfi_def_cfa_offset 64 +; LA64-NEXT: addi.d $sp, $sp, -80 +; LA64-NEXT: .cfi_def_cfa_offset 80 ; LA64-NEXT: pcalau12i $a0, %pc_hi20(.L.str) ; LA64-NEXT: addi.d $a0, $a0, %pc_lo12(.L.str) ; LA64-NEXT: ld.h $a1, $a0, 20 ; LA64-NEXT: ld.w $a2, $a0, 16 ; LA64-NEXT: ld.d $a3, $a0, 8 ; LA64-NEXT: ld.d $a0, $a0, 0 -; LA64-NEXT: st.h $a1, $sp, 20 -; LA64-NEXT: st.w $a2, $sp, 16 -; LA64-NEXT: st.d $a3, $sp, 8 -; LA64-NEXT: st.d 
$a0, $sp, 0 -; LA64-NEXT: addi.d $sp, $sp, 64 +; LA64-NEXT: st.h $a1, $sp, 36 +; LA64-NEXT: st.w $a2, $sp, 32 +; LA64-NEXT: st.d $a3, $sp, 24 +; LA64-NEXT: st.d $a0, $sp, 16 +; LA64-NEXT: addi.d $sp, $sp, 80 ; LA64-NEXT: ret entry: %msgbuf = alloca [64 x i8], align 1 diff --git a/llvm/test/CodeGen/LoongArch/vararg.ll b/llvm/test/CodeGen/LoongArch/vararg.ll index 939cd2015c5b1..bc4b8a77c7e15 100644 --- a/llvm/test/CodeGen/LoongArch/vararg.ll +++ b/llvm/test/CodeGen/LoongArch/vararg.ll @@ -47,7 +47,7 @@ define i64 @va1(ptr %fmt, ...) { ; LA64-WITHFP-NEXT: st.d $a2, $fp, 16 ; LA64-WITHFP-NEXT: st.d $a1, $fp, 8 ; LA64-WITHFP-NEXT: addi.d $a1, $fp, 16 -; LA64-WITHFP-NEXT: st.d $a1, $fp, -24 +; LA64-WITHFP-NEXT: st.d $a1, $fp, -32 ; LA64-WITHFP-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload ; LA64-WITHFP-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload ; LA64-WITHFP-NEXT: addi.d $sp, $sp, 96 @@ -94,7 +94,7 @@ define i64 @va1_va_arg(ptr %fmt, ...) nounwind { ; LA64-WITHFP-NEXT: st.d $a2, $fp, 16 ; LA64-WITHFP-NEXT: st.d $a1, $fp, 8 ; LA64-WITHFP-NEXT: addi.d $a1, $fp, 16 -; LA64-WITHFP-NEXT: st.d $a1, $fp, -24 +; LA64-WITHFP-NEXT: st.d $a1, $fp, -32 ; LA64-WITHFP-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload ; LA64-WITHFP-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload ; LA64-WITHFP-NEXT: addi.d $sp, $sp, 96 @@ -112,11 +112,11 @@ define i64 @va1_va_arg(ptr %fmt, ...) nounwind { define i64 @va1_va_arg_alloca(ptr %fmt, ...) 
nounwind { ; LA64-FPELIM-LABEL: va1_va_arg_alloca: ; LA64-FPELIM: # %bb.0: -; LA64-FPELIM-NEXT: addi.d $sp, $sp, -96 -; LA64-FPELIM-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill -; LA64-FPELIM-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill -; LA64-FPELIM-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill -; LA64-FPELIM-NEXT: addi.d $fp, $sp, 32 +; LA64-FPELIM-NEXT: addi.d $sp, $sp, -112 +; LA64-FPELIM-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-FPELIM-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-FPELIM-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-FPELIM-NEXT: addi.d $fp, $sp, 48 ; LA64-FPELIM-NEXT: move $s0, $a1 ; LA64-FPELIM-NEXT: st.d $a7, $fp, 56 ; LA64-FPELIM-NEXT: st.d $a6, $fp, 48 @@ -126,7 +126,7 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind { ; LA64-FPELIM-NEXT: st.d $a2, $fp, 16 ; LA64-FPELIM-NEXT: st.d $a1, $fp, 8 ; LA64-FPELIM-NEXT: addi.d $a0, $fp, 16 -; LA64-FPELIM-NEXT: st.d $a0, $fp, -32 +; LA64-FPELIM-NEXT: st.d $a0, $fp, -40 ; LA64-FPELIM-NEXT: addi.d $a0, $a1, 15 ; LA64-FPELIM-NEXT: bstrins.d $a0, $zero, 3, 0 ; LA64-FPELIM-NEXT: sub.d $a0, $sp, $a0 @@ -134,20 +134,20 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) 
nounwind { ; LA64-FPELIM-NEXT: pcaddu18i $ra, %call36(notdead) ; LA64-FPELIM-NEXT: jirl $ra, $ra, 0 ; LA64-FPELIM-NEXT: move $a0, $s0 -; LA64-FPELIM-NEXT: addi.d $sp, $fp, -32 -; LA64-FPELIM-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload -; LA64-FPELIM-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload -; LA64-FPELIM-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload -; LA64-FPELIM-NEXT: addi.d $sp, $sp, 96 +; LA64-FPELIM-NEXT: addi.d $sp, $fp, -48 +; LA64-FPELIM-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-FPELIM-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-FPELIM-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-FPELIM-NEXT: addi.d $sp, $sp, 112 ; LA64-FPELIM-NEXT: ret ; ; LA64-WITHFP-LABEL: va1_va_arg_alloca: ; LA64-WITHFP: # %bb.0: -; LA64-WITHFP-NEXT: addi.d $sp, $sp, -96 -; LA64-WITHFP-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill -; LA64-WITHFP-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill -; LA64-WITHFP-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill -; LA64-WITHFP-NEXT: addi.d $fp, $sp, 32 +; LA64-WITHFP-NEXT: addi.d $sp, $sp, -112 +; LA64-WITHFP-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill +; LA64-WITHFP-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill +; LA64-WITHFP-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill +; LA64-WITHFP-NEXT: addi.d $fp, $sp, 48 ; LA64-WITHFP-NEXT: move $s0, $a1 ; LA64-WITHFP-NEXT: st.d $a7, $fp, 56 ; LA64-WITHFP-NEXT: st.d $a6, $fp, 48 @@ -157,7 +157,7 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind { ; LA64-WITHFP-NEXT: st.d $a2, $fp, 16 ; LA64-WITHFP-NEXT: st.d $a1, $fp, 8 ; LA64-WITHFP-NEXT: addi.d $a0, $fp, 16 -; LA64-WITHFP-NEXT: st.d $a0, $fp, -32 +; LA64-WITHFP-NEXT: st.d $a0, $fp, -40 ; LA64-WITHFP-NEXT: addi.d $a0, $a1, 15 ; LA64-WITHFP-NEXT: bstrins.d $a0, $zero, 3, 0 ; LA64-WITHFP-NEXT: sub.d $a0, $sp, $a0 @@ -165,11 +165,11 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) 
nounwind { ; LA64-WITHFP-NEXT: pcaddu18i $ra, %call36(notdead) ; LA64-WITHFP-NEXT: jirl $ra, $ra, 0 ; LA64-WITHFP-NEXT: move $a0, $s0 -; LA64-WITHFP-NEXT: addi.d $sp, $fp, -32 -; LA64-WITHFP-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload -; LA64-WITHFP-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload -; LA64-WITHFP-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload -; LA64-WITHFP-NEXT: addi.d $sp, $sp, 96 +; LA64-WITHFP-NEXT: addi.d $sp, $fp, -48 +; LA64-WITHFP-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload +; LA64-WITHFP-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload +; LA64-WITHFP-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload +; LA64-WITHFP-NEXT: addi.d $sp, $sp, 112 ; LA64-WITHFP-NEXT: ret %va = alloca ptr, align 8 call void @llvm.va_start(ptr %va) @@ -314,10 +314,10 @@ define void @va_aligned_stack_caller() nounwind { ; ; LA64-WITHFP-LABEL: va_aligned_stack_caller: ; LA64-WITHFP: # %bb.0: -; LA64-WITHFP-NEXT: addi.d $sp, $sp, -112 -; LA64-WITHFP-NEXT: st.d $ra, $sp, 104 # 8-byte Folded Spill -; LA64-WITHFP-NEXT: st.d $fp, $sp, 96 # 8-byte Folded Spill -; LA64-WITHFP-NEXT: addi.d $fp, $sp, 112 +; LA64-WITHFP-NEXT: addi.d $sp, $sp, -128 +; LA64-WITHFP-NEXT: st.d $ra, $sp, 120 # 8-byte Folded Spill +; LA64-WITHFP-NEXT: st.d $fp, $sp, 112 # 8-byte Folded Spill +; LA64-WITHFP-NEXT: addi.d $fp, $sp, 128 ; LA64-WITHFP-NEXT: ori $a0, $zero, 17 ; LA64-WITHFP-NEXT: st.d $a0, $sp, 48 ; LA64-WITHFP-NEXT: ori $a0, $zero, 16 @@ -336,23 +336,23 @@ define void @va_aligned_stack_caller() nounwind { ; LA64-WITHFP-NEXT: lu32i.d $a0, 335544 ; LA64-WITHFP-NEXT: lu52i.d $a0, $a0, -328 ; LA64-WITHFP-NEXT: st.d $a0, $sp, 16 -; LA64-WITHFP-NEXT: st.d $zero, $fp, -24 +; LA64-WITHFP-NEXT: st.d $zero, $fp, -40 ; LA64-WITHFP-NEXT: vrepli.b $vr0, 0 -; LA64-WITHFP-NEXT: vst $vr0, $fp, -40 +; LA64-WITHFP-NEXT: vst $vr0, $fp, -56 ; LA64-WITHFP-NEXT: ori $a5, $zero, 1000 ; LA64-WITHFP-NEXT: ori $a0, $zero, 1 ; LA64-WITHFP-NEXT: ori $a1, $zero, 11 -; LA64-WITHFP-NEXT: addi.d $a2, $fp, -48 +; 
LA64-WITHFP-NEXT: addi.d $a2, $fp, -64 ; LA64-WITHFP-NEXT: ori $a3, $zero, 12 ; LA64-WITHFP-NEXT: ori $a4, $zero, 13 ; LA64-WITHFP-NEXT: ori $a7, $zero, 1 -; LA64-WITHFP-NEXT: st.d $a5, $fp, -48 +; LA64-WITHFP-NEXT: st.d $a5, $fp, -64 ; LA64-WITHFP-NEXT: move $a6, $zero ; LA64-WITHFP-NEXT: pcaddu18i $ra, %call36(va_aligned_stack_callee) ; LA64-WITHFP-NEXT: jirl $ra, $ra, 0 -; LA64-WITHFP-NEXT: ld.d $fp, $sp, 96 # 8-byte Folded Reload -; LA64-WITHFP-NEXT: ld.d $ra, $sp, 104 # 8-byte Folded Reload -; LA64-WITHFP-NEXT: addi.d $sp, $sp, 112 +; LA64-WITHFP-NEXT: ld.d $fp, $sp, 112 # 8-byte Folded Reload +; LA64-WITHFP-NEXT: ld.d $ra, $sp, 120 # 8-byte Folded Reload +; LA64-WITHFP-NEXT: addi.d $sp, $sp, 128 ; LA64-WITHFP-NEXT: ret %1 = call i32 (i32, ...) @va_aligned_stack_callee(i32 1, i32 11, i256 1000, i32 12, i32 13, i128 18446744073709551616, i32 14, From a96121089b9c94e08c6632f91f2dffc73c0ffa28 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 18 Jul 2025 10:08:43 +0200 Subject: [PATCH 307/813] Revert "RuntimeLibcalls: Add methods to recognize libcall names (#149001)" This reverts commit 45477add8dfe9851605697bd908b49f0ec244625. This causes a significant LTO compile-time regression. 
--- llvm/include/llvm/ADT/StringTable.h | 9 ---- llvm/include/llvm/IR/RuntimeLibcalls.h | 12 ----- llvm/lib/IR/RuntimeLibcalls.cpp | 45 ------------------- llvm/lib/Object/IRSymtab.cpp | 45 ++++++++++--------- .../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 13 +----- 5 files changed, 24 insertions(+), 100 deletions(-) diff --git a/llvm/include/llvm/ADT/StringTable.h b/llvm/include/llvm/ADT/StringTable.h index 575b3c929e40c..c089a070d4b57 100644 --- a/llvm/include/llvm/ADT/StringTable.h +++ b/llvm/include/llvm/ADT/StringTable.h @@ -118,13 +118,6 @@ class StringTable { constexpr Iterator(const Iterator &RHS) = default; constexpr Iterator(Iterator &&RHS) = default; - Iterator &operator=(const Iterator &RHS) { - Table = RHS.Table; - O = RHS.O; - S = RHS.S; - return *this; - } - bool operator==(const Iterator &RHS) const { assert(Table == RHS.Table && "Compared iterators for unrelated tables!"); return O == RHS.O; @@ -139,8 +132,6 @@ class StringTable { O = O.value() + (*Table)[O].size() + 1; return *this; } - - Offset offset() const { return O; } }; constexpr Iterator begin() const { return Iterator(*this, 0); } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 89ad4e5bc6ca4..8058c8a4c5510 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -132,10 +132,6 @@ struct RuntimeLibcallsInfo { return ImplToLibcall[Impl]; } - /// Check if this is valid libcall for the current module, otherwise - /// RTLIB::Unsupported. - RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const; - private: static const RTLIB::LibcallImpl DefaultLibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1]; @@ -160,14 +156,6 @@ struct RuntimeLibcallsInfo { /// Map from a concrete LibcallImpl implementation to its RTLIB::Libcall kind. LLVM_ABI static const RTLIB::Libcall ImplToLibcall[RTLIB::NumLibcallImpls]; - /// Check if a function name is a recognized runtime call of any kind. 
This - /// does not consider if this call is available for any current compilation, - /// just that it is a known call somewhere. This returns the set of all - /// LibcallImpls which match the name; multiple implementations with the same - /// name may exist but differ in interpretation based on the target context. - LLVM_ABI static iterator_range::const_iterator> - getRecognizedLibcallImpls(StringRef FuncName); - static bool darwinHasSinCosStret(const Triple &TT) { if (!TT.isOSDarwin()) return false; diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 5936ac7d0287f..b1864897dafa6 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -135,51 +135,6 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, } } -RTLIB::LibcallImpl -RuntimeLibcallsInfo::getSupportedLibcallImpl(StringRef FuncName) const { - const ArrayRef RuntimeLibcallNameOffsets( - RuntimeLibcallNameOffsetTable); - - iterator_range::const_iterator> Range = - getRecognizedLibcallImpls(FuncName); - - for (auto I = Range.begin(); I != Range.end(); ++I) { - RTLIB::LibcallImpl Impl = - static_cast(I - RuntimeLibcallNameOffsets.begin()); - - // FIXME: This should not depend on looking up ImplToLibcall, only the list - // of libcalls for the module. 
- RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]]; - if (Recognized != RTLIB::Unsupported) - return Recognized; - } - - return RTLIB::Unsupported; -} - -iterator_range::const_iterator> -RuntimeLibcallsInfo::getRecognizedLibcallImpls(StringRef FuncName) { - StringTable::Iterator It = lower_bound(RuntimeLibcallImplNameTable, FuncName); - if (It == RuntimeLibcallImplNameTable.end() || *It != FuncName) - return iterator_range(ArrayRef()); - - uint16_t IndexVal = It.offset().value(); - const ArrayRef TableRef(RuntimeLibcallNameOffsetTable); - - ArrayRef::const_iterator E = TableRef.end(); - ArrayRef::const_iterator EntriesBegin = - std::lower_bound(TableRef.begin(), E, IndexVal); - ArrayRef::const_iterator EntriesEnd = EntriesBegin; - - while (EntriesEnd != E && *EntriesEnd == IndexVal) - ++EntriesEnd; - - assert(EntriesBegin != E && - "libcall found in name table but not offset table"); - - return make_range(EntriesBegin, EntriesEnd); -} - bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { switch (TT.getOS()) { case Triple::MacOSX: diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp index 79eeb08cddeef..2579fa37935f0 100644 --- a/llvm/lib/Object/IRSymtab.cpp +++ b/llvm/lib/Object/IRSymtab.cpp @@ -54,11 +54,6 @@ static const char *PreservedSymbols[] = { "__stack_chk_guard", }; -static bool isPreservedGlobalVarName(StringRef Name) { - return StringRef(PreservedSymbols[0]) == Name || - StringRef(PreservedSymbols[1]) == Name; -} - namespace { const char *getExpectedProducerName() { @@ -86,16 +81,12 @@ struct Builder { // The StringTableBuilder does not create a copy of any strings added to it, // so this provides somewhere to store any strings that we create. 
Builder(SmallVector &Symtab, StringTableBuilder &StrtabBuilder, - BumpPtrAllocator &Alloc, const Triple &TT) - : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc), TT(TT), - Libcalls(TT) {} + BumpPtrAllocator &Alloc) + : Symtab(Symtab), StrtabBuilder(StrtabBuilder), Saver(Alloc) {} DenseMap ComdatMap; Mangler Mang; - const Triple &TT; - - // FIXME: This shouldn't be here. - RTLIB::RuntimeLibcallsInfo Libcalls; + Triple TT; std::vector Comdats; std::vector Mods; @@ -107,10 +98,6 @@ struct Builder { std::vector DependentLibraries; - bool isPreservedLibFuncName(StringRef Name) { - return Libcalls.getSupportedLibcallImpl(Name) != RTLIB::Unsupported; - } - void setStr(storage::Str &S, StringRef Value) { S.Offset = StrtabBuilder.add(Value); S.Size = Value.size(); @@ -226,6 +213,18 @@ Expected Builder::getComdatIndex(const Comdat *C, const Module *M) { return P.first->second; } +static DenseSet buildPreservedSymbolsSet(const Triple &TT) { + DenseSet PreservedSymbolSet(std::begin(PreservedSymbols), + std::end(PreservedSymbols)); + // FIXME: Do we need to pass in ABI fields from TargetOptions? 
+ RTLIB::RuntimeLibcallsInfo Libcalls(TT); + for (RTLIB::LibcallImpl Impl : Libcalls.getLibcallImpls()) { + if (Impl != RTLIB::Unsupported) + PreservedSymbolSet.insert(Libcalls.getLibcallImplName(Impl)); + } + return PreservedSymbolSet; +} + Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, const SmallPtrSet &Used, ModuleSymbolTable::Symbol Msym) { @@ -279,11 +278,13 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab, return Error::success(); } - StringRef GVName = GV->getName(); - setStr(Sym.IRName, GVName); + setStr(Sym.IRName, GV->getName()); + + static const DenseSet PreservedSymbolsSet = + buildPreservedSymbolsSet(GV->getParent()->getTargetTriple()); + bool IsPreservedSymbol = PreservedSymbolsSet.contains(GV->getName()); - if (Used.count(GV) || isPreservedLibFuncName(GVName) || - isPreservedGlobalVarName(GVName)) + if (Used.count(GV) || IsPreservedSymbol) Sym.Flags |= 1 << storage::Symbol::FB_used; if (GV->isThreadLocal()) Sym.Flags |= 1 << storage::Symbol::FB_tls; @@ -350,6 +351,7 @@ Error Builder::build(ArrayRef IRMods) { setStr(Hdr.Producer, kExpectedProducerName); setStr(Hdr.TargetTriple, IRMods[0]->getTargetTriple().str()); setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName()); + TT = IRMods[0]->getTargetTriple(); for (auto *M : IRMods) if (Error Err = addModule(M)) @@ -375,8 +377,7 @@ Error Builder::build(ArrayRef IRMods) { Error irsymtab::build(ArrayRef Mods, SmallVector &Symtab, StringTableBuilder &StrtabBuilder, BumpPtrAllocator &Alloc) { - const Triple &TT = Mods[0]->getTargetTriple(); - return Builder(Symtab, StrtabBuilder, Alloc, TT).build(Mods); + return Builder(Symtab, StrtabBuilder, Alloc).build(Mods); } // Upgrade a vector of bitcode modules created by an old version of LLVM by diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp index 7f90d6b4fdacc..652bea9dc7f65 100644 --- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp +++ 
b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp @@ -236,19 +236,8 @@ class RuntimeLibcallEmitter { for (RuntimeLibcall &LibCall : RuntimeLibcallDefList) Def2RuntimeLibcall[LibCall.getDef()] = &LibCall; - ArrayRef AllRuntimeLibcallImplsRaw = + ArrayRef AllRuntimeLibcallImpls = Records.getAllDerivedDefinitions("RuntimeLibcallImpl"); - - SmallVector AllRuntimeLibcallImpls( - AllRuntimeLibcallImplsRaw); - - // Sort by libcall impl name, not the enum name. This keeps the order - // suitable for using the name table for libcall recognition binary search. - llvm::sort(AllRuntimeLibcallImpls, [](const Record *A, const Record *B) { - return A->getValueAsString("LibCallFuncName") < - B->getValueAsString("LibCallFuncName"); - }); - RuntimeLibcallImplDefList.reserve(AllRuntimeLibcallImpls.size()); size_t LibCallImplEnumVal = 1; From 8f3e78f9715cb7085d03686c7bd72e20ce248b04 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Jul 2025 17:15:38 +0900 Subject: [PATCH 308/813] AMDGPU: Add pass to replace constant materialize with AV pseudos (#149292) If we have a v_mov_b32 or v_accvgpr_write_b32 with an inline immediate, replace it with a pseudo which writes to the combined AV_* class. This relaxes the operand constraints, which will allow the allocator to inflate the register class to AV_* to potentially avoid spilling. The allocator does not know how to replace an instruction to enable the change of register class. I originally tried to do this by changing all of the places we introduce v_mov_b32 with immediate, but it's along tail of niche cases that require manual updating. Plus we can restrict this to only run on functions where we know we will be allocating AGPRs. 
--- llvm/lib/Target/AMDGPU/AMDGPU.h | 3 + llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp | 108 ++++++++++++++++++ .../Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h | 23 ++++ .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 13 +++ llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 2 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + llvm/lib/Target/AMDGPU/SIInstrInfo.h | 1 - llvm/test/CodeGen/AMDGPU/agpr-remat.ll | 18 +-- .../AMDGPU/amdgpu-prepare-agpr-alloc.mir | 95 +++++++++++++++ .../branch-folding-implicit-def-subreg.ll | 46 ++++---- llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll | 4 +- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 4 + llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll | 4 +- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.ll | 10 +- .../CodeGen/AMDGPU/no-fold-accvgpr-mov.mir | 28 ++--- .../CodeGen/AMDGPU/no-fold-accvgpr-read.mir | 26 ++--- ...al-regcopy-and-spill-missed-at-regalloc.ll | 20 ++-- .../CodeGen/AMDGPU/spill-vector-superclass.ll | 6 +- 19 files changed, 330 insertions(+), 83 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 23f106a9c1d4d..007b481f84960 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -153,6 +153,9 @@ struct AMDGPULowerBufferFatPointersPass const TargetMachine &TM; }; +void initializeAMDGPUPrepareAGPRAllocLegacyPass(PassRegistry &); +extern char &AMDGPUPrepareAGPRAllocLegacyID; + void initializeAMDGPUReserveWWMRegsLegacyPass(PassRegistry &); extern char &AMDGPUReserveWWMRegsLegacyID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 250547acb1ee7..b6c6d927d0e89 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -114,6 +114,7 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass()) +MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass()) MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass()) MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp new file mode 100644 index 0000000000000..3b06e9b00ac69 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.cpp @@ -0,0 +1,108 @@ +//===-- AMDGPUPrepareAGPRAlloc.cpp ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Make simple transformations to relax register constraints for cases which can +// allocate to AGPRs or VGPRs. Replace materialize of inline immediates into +// AGPR or VGPR with a pseudo with an AV_* class register constraint. This +// allows later passes to inflate the register class if necessary. The register +// allocator does not know to replace instructions to relax constraints. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUPrepareAGPRAlloc.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/InitializePasses.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-prepare-agpr-alloc" + +namespace { + +class AMDGPUPrepareAGPRAllocImpl { +private: + const SIInstrInfo &TII; + MachineRegisterInfo &MRI; + +public: + AMDGPUPrepareAGPRAllocImpl(const GCNSubtarget &ST, MachineRegisterInfo &MRI) + : TII(*ST.getInstrInfo()), MRI(MRI) {} + bool run(MachineFunction &MF); +}; + +class AMDGPUPrepareAGPRAllocLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPUPrepareAGPRAllocLegacy() : MachineFunctionPass(ID) { + initializeAMDGPUPrepareAGPRAllocLegacyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "AMDGPU Prepare AGPR Alloc"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; +} // End anonymous namespace. 
+ +INITIALIZE_PASS_BEGIN(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE, + "AMDGPU Prepare AGPR Alloc", false, false) +INITIALIZE_PASS_END(AMDGPUPrepareAGPRAllocLegacy, DEBUG_TYPE, + "AMDGPU Prepare AGPR Alloc", false, false) + +char AMDGPUPrepareAGPRAllocLegacy::ID = 0; + +char &llvm::AMDGPUPrepareAGPRAllocLegacyID = AMDGPUPrepareAGPRAllocLegacy::ID; + +bool AMDGPUPrepareAGPRAllocLegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + const GCNSubtarget &ST = MF.getSubtarget(); + return AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF); +} + +PreservedAnalyses +AMDGPUPrepareAGPRAllocPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + const GCNSubtarget &ST = MF.getSubtarget(); + AMDGPUPrepareAGPRAllocImpl(ST, MF.getRegInfo()).run(MF); + return PreservedAnalyses::all(); +} + +bool AMDGPUPrepareAGPRAllocImpl::run(MachineFunction &MF) { + if (MRI.isReserved(AMDGPU::AGPR0)) + return false; + + const MCInstrDesc &AVImmPseudo = TII.get(AMDGPU::AV_MOV_B32_IMM_PSEUDO); + + bool Changed = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if ((MI.getOpcode() == AMDGPU::V_MOV_B32_e32 && + TII.isInlineConstant(MI, 1)) || + (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && + MI.getOperand(1).isImm())) { + MI.setDesc(AVImmPseudo); + Changed = true; + } + } + } + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h new file mode 100644 index 0000000000000..dc598c98f241b --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUPrepareAGPRAlloc.h @@ -0,0 +1,23 @@ +//===- AMDGPUPrepareAGPRAlloc.h ---------------------------------*- C++- *-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { +class AMDGPUPrepareAGPRAllocPass + : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPREPAREAGPRALLOC_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 31a80e00edd3b..c865082a1dcea 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -25,6 +25,7 @@ #include "AMDGPUMacroFusion.h" #include "AMDGPUPerfHintAnalysis.h" #include "AMDGPUPreloadKernArgProlog.h" +#include "AMDGPUPrepareAGPRAlloc.h" #include "AMDGPURemoveIncompatibleFunctions.h" #include "AMDGPUReserveWWMRegs.h" #include "AMDGPUResourceUsageAnalysis.h" @@ -499,6 +500,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeGlobalISel(*PR); initializeAMDGPUAsmPrinterPass(*PR); initializeAMDGPUDAGToDAGISelLegacyPass(*PR); + initializeAMDGPUPrepareAGPRAllocLegacyPass(*PR); initializeGCNDPPCombineLegacyPass(*PR); initializeSILowerI1CopiesLegacyPass(*PR); initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR); @@ -1196,6 +1198,7 @@ class GCNPassConfig final : public AMDGPUPassConfig { bool addRegBankSelect() override; void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; + void addPreRegAlloc() override; void addFastRegAlloc() override; void addOptimizedRegAlloc() override; @@ -1539,6 +1542,11 @@ void GCNPassConfig::addFastRegAlloc() { TargetPassConfig::addFastRegAlloc(); } +void GCNPassConfig::addPreRegAlloc() { + if (getOptLevel() != 
CodeGenOptLevel::None) + addPass(&AMDGPUPrepareAGPRAllocLegacyID); +} + void GCNPassConfig::addOptimizedRegAlloc() { if (EnableDCEInRA) insertPass(&DetectDeadLanesID, &DeadMachineInstructionElimID); @@ -2235,6 +2243,11 @@ void AMDGPUCodeGenPassBuilder::addOptimizedRegAlloc( Base::addOptimizedRegAlloc(addPass); } +void AMDGPUCodeGenPassBuilder::addPreRegAlloc(AddMachinePass &addPass) const { + if (getOptLevel() != CodeGenOptLevel::None) + addPass(AMDGPUPrepareAGPRAllocPass()); +} + Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized( AddMachinePass &addPass) const { // TODO: Check --regalloc-npm option diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h index 3b2f39c14a9bc..e0f1296ddded8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -181,7 +181,9 @@ class AMDGPUCodeGenPassBuilder void addMachineSSAOptimization(AddMachinePass &) const; void addPostRegAlloc(AddMachinePass &) const; void addPreEmitPass(AddMachinePass &) const; + void addPreEmitRegAlloc(AddMachinePass &) const; Error addRegAssignmentOptimized(AddMachinePass &) const; + void addPreRegAlloc(AddMachinePass &) const; void addOptimizedRegAlloc(AddMachinePass &) const; void addPreSched2(AddMachinePass &) const; diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index e3519f192137c..42edec0d01493 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -74,6 +74,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp AMDGPULowerModuleLDSPass.cpp + AMDGPUPrepareAGPRAlloc.cpp AMDGPUSwLowerLDS.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 2764ed3d3f0b1..5e92921f3ea21 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ 
-1113,7 +1113,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { // that will not require an additional 4-bytes; this function assumes that it // will. bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { - assert(!MO.isReg() && "isInlineConstant called on register operand!"); if (!MO.isImm()) return false; return isInlineConstant(MO.getImm(), OperandType); diff --git a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll index 6742ae6c1d584..f6465de86fa4f 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-remat.ll @@ -6,17 +6,17 @@ define amdgpu_kernel void @remat_constant_voids_spill(ptr addrspace(1) %p) #1 { ; GFX908-LABEL: remat_constant_voids_spill: ; GFX908: ; %bb.0: -; GFX908-NEXT: v_accvgpr_write_b32 a1, 1 -; GFX908-NEXT: v_accvgpr_write_b32 a5, 6 -; GFX908-NEXT: v_accvgpr_write_b32 a6, 7 -; GFX908-NEXT: v_accvgpr_write_b32 a7, 8 -; GFX908-NEXT: v_accvgpr_write_b32 a0, 9 -; GFX908-NEXT: v_accvgpr_write_b32 a2, 2 -; GFX908-NEXT: v_accvgpr_write_b32 a3, 3 -; GFX908-NEXT: v_accvgpr_write_b32 a4, 4 +; GFX908-NEXT: v_accvgpr_write_b32 a0, 1 +; GFX908-NEXT: v_accvgpr_write_b32 a1, 2 +; GFX908-NEXT: v_accvgpr_write_b32 a2, 3 +; GFX908-NEXT: v_accvgpr_write_b32 a3, 4 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_accvgpr_write_b32 a1, 5 +; GFX908-NEXT: v_accvgpr_write_b32 a0, 5 +; GFX908-NEXT: v_accvgpr_write_b32 a1, 6 +; GFX908-NEXT: v_accvgpr_write_b32 a2, 7 +; GFX908-NEXT: v_accvgpr_write_b32 a3, 8 +; GFX908-NEXT: v_accvgpr_write_b32 a4, 9 ; GFX908-NEXT: ;;#ASMSTART ; GFX908-NEXT: ;;#ASMEND ; GFX908-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir b/llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir new file mode 100644 index 0000000000000..69bdb1f5066f0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-prepare-agpr-alloc.mir @@ -0,0 +1,95 @@ +# NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -run-pass=amdgpu-prepare-agpr-alloc -o - %s | FileCheck -check-prefixes=HAS-AGPR,GFX90A %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -run-pass=amdgpu-prepare-agpr-alloc -o - %s | FileCheck -check-prefixes=HAS-AGPR,GFX908 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx906 -passes=amdgpu-prepare-agpr-alloc -o - %s | FileCheck -check-prefix=NO-AGPR %s + +--- | + define void @func() { + ret void + } + + ; Attribute is ignored for gfx90a + define void @no_agprs() "amdgpu-agpr-alloc"="0,0" { + ret void + } + +... +--- +name: func +tracksRegLiveness: true +stack: + - { id: 0, size: 4 } +body: | + ; HAS-AGPR-LABEL: name: func + ; HAS-AGPR: bb.0: + ; HAS-AGPR-NEXT: successors: %bb.1(0x80000000) + ; HAS-AGPR-NEXT: liveins: $vgpr0 + ; HAS-AGPR-NEXT: {{ $}} + ; HAS-AGPR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; HAS-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; HAS-AGPR-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; HAS-AGPR-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65, implicit $exec + ; HAS-AGPR-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; HAS-AGPR-NEXT: [[AV_MOV_1:%[0-9]+]]:agpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec + ; HAS-AGPR-NEXT: [[AV_MOV_2:%[0-9]+]]:agpr_32 = AV_MOV_B32_IMM_PSEUDO 6, implicit $exec + ; HAS-AGPR-NEXT: {{ $}} + ; HAS-AGPR-NEXT: bb.1: + ; HAS-AGPR-NEXT: [[AV_MOV_3:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 3, implicit $exec + ; + ; NO-AGPR-LABEL: name: func + ; NO-AGPR: bb.0: + ; NO-AGPR-NEXT: successors: %bb.1(0x80000000) + ; NO-AGPR-NEXT: liveins: $vgpr0 + ; NO-AGPR-NEXT: {{ $}} + ; NO-AGPR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + ; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, 
implicit $exec + ; NO-AGPR-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; NO-AGPR-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65, implicit $exec + ; NO-AGPR-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec + ; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_2:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 6, implicit $exec + ; NO-AGPR-NEXT: {{ $}} + ; NO-AGPR-NEXT: bb.1: + ; NO-AGPR-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + bb.0: + liveins: $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 $vgpr0, implicit $exec + %1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + %2:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %3:vgpr_32 = V_MOV_B32_e32 65, implicit $exec + %4:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %5:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec + %6:agpr_32 = V_ACCVGPR_WRITE_B32_e64 6, implicit $exec + + bb.1: + %7:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + +... 
+ +--- +name: no_agprs +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GFX90A-LABEL: name: no_agprs + ; GFX90A: liveins: $vgpr0 + ; GFX90A-NEXT: {{ $}} + ; GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; GFX90A-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec + ; + ; GFX908-LABEL: name: no_agprs + ; GFX908: liveins: $vgpr0 + ; GFX908-NEXT: {{ $}} + ; GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; GFX908-NEXT: [[AV_MOV_1:%[0-9]+]]:agpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec + ; + ; NO-AGPR-LABEL: name: no_agprs + ; NO-AGPR: liveins: $vgpr0 + ; NO-AGPR-NEXT: {{ $}} + ; NO-AGPR-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + ; NO-AGPR-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec + %0:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %1:agpr_32 = V_ACCVGPR_WRITE_B32_e64 2, implicit $exec + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll index ae90cfb631e8d..7eb7d72e6cb97 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: S_BITCMP1_B32 renamable $sgpr17, 8, implicit-def $scc ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_CSELECT_B64 -1, 0, implicit killed $scc ; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_XOR_B64 killed renamable $sgpr18_sgpr19, -1, implicit-def dead $scc - ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = DS_READ_B32_gfx9 renamable $vgpr3, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(3) null`, align 8, addrspace 3) ; GFX90A-NEXT: renamable $sgpr18_sgpr19 = S_MOV_B64 0 ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr28_sgpr29, implicit-def dead $scc @@ -56,8 +56,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr30 = V_AND_B32_e32 1023, $vgpr31, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr34_sgpr35, implicit-def dead $scc - ; GFX90A-NEXT: renamable $vgpr15 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr17 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr15 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr17 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: S_CBRANCH_VCCZ %bb.57, implicit $vcc ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.4.bb15: @@ -112,14 +112,14 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.7(0x80000000) ; GFX90A-NEXT: 
liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr64_sgpr65, $sgpr66_sgpr67, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000F, $vgpr12_vgpr13:0x000000000000000F, $vgpr14_vgpr15:0x0000000000000003, $vgpr16_vgpr17:0x0000000000000003, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr19 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr21 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr20 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr23 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr22 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr25 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr24 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr19 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr18 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr21 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr20 = AV_MOV_B32_IMM_PSEUDO 0, 
implicit $exec + ; GFX90A-NEXT: renamable $vgpr23 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr22 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr25 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr24 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: bb.7.Flow19: ; GFX90A-NEXT: successors: %bb.62(0x40000000), %bb.8(0x40000000) @@ -671,7 +671,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr50_sgpr51, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003F, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr2_vgpr3:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr48_sgpr49, $sgpr54_sgpr55, $sgpr58_sgpr59, $sgpr56_sgpr57 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr0 = nuw nsw V_LSHLREV_B32_e32 3, $vgpr30, implicit $exec - ; GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, killed renamable $sgpr50_sgpr51, implicit-def dead $scc ; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.54, implicit $vcc ; GFX90A-NEXT: {{ $}} @@ -759,7 +759,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: liveins: 
$sgpr14, $sgpr15, $sgpr16, $vgpr18, $vgpr30, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47:0x000000000000000F, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr54_sgpr55, $sgpr60_sgpr61, $sgpr64_sgpr65, $sgpr20_sgpr21_sgpr22_sgpr23:0x000000000000003C, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000F0, $vgpr0_vgpr1:0x000000000000000F, $vgpr2_vgpr3:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x0000000000000003, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr53 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $sgpr64_sgpr65, implicit $exec - ; GFX90A-NEXT: renamable $vgpr10 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr10 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr14_vgpr15 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr21, implicit $exec ; GFX90A-NEXT: renamable $vgpr16_vgpr17 = DS_READ_B64_gfx9 killed renamable $vgpr10, 0, 0, implicit $exec :: (load (s64) from %ir.7, addrspace 3) @@ -801,12 +801,12 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr42_vgpr43 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr40_vgpr41 = IMPLICIT_DEF ; GFX90A-NEXT: renamable $vgpr46_vgpr47 = IMPLICIT_DEF - ; GFX90A-NEXT: renamable $vgpr14 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: 
renamable $vgpr52 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr53 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr13 = V_MOV_B32_e32 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr12 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr14 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr52 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr16 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr53 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr13 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr12 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.7 ; GFX90A-NEXT: {{ $}} @@ -814,7 +814,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: successors: %bb.3(0x80000000) ; GFX90A-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr28_sgpr29, $sgpr30_sgpr31, $sgpr34_sgpr35, $sgpr46_sgpr47:0x000000000000000F, $sgpr20_sgpr21_sgpr22_sgpr23:0x00000000000000FF, $sgpr24_sgpr25_sgpr26_sgpr27:0x00000000000000FF, $vgpr2_vgpr3:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-NEXT: {{ $}} - ; GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr22_vgpr23 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $vgpr0 = COPY renamable $sgpr23, implicit $exec ; GFX90A-NEXT: renamable $vgpr20_vgpr21 = DS_READ_B64_gfx9 killed renamable $vgpr0, 0, 0, implicit $exec :: (load (s64) from 
%ir.419, addrspace 3) @@ -913,7 +913,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr2, killed $vgpr25, implicit $exec ; GFX90A-NEXT: renamable $vgpr3 = V_OR_B32_e32 killed $vgpr11, killed $vgpr19, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_OR_B32_e32 killed $vgpr3, killed $vgpr2, implicit $exec - ; GFX90A-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr3 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vcc = V_CMP_EQ_U32_sdwa 0, killed $vgpr53, 0, $vgpr3, 0, 0, 6, implicit $exec ; GFX90A-NEXT: renamable $vgpr2 = V_CNDMASK_B32_e64 0, 0, 0, killed $vgpr2, killed $vcc, implicit $exec ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 killed $vgpr52, killed $vgpr13, implicit $exec @@ -955,7 +955,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: renamable $vgpr10 = COPY renamable $sgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr2, renamable $vcc = V_ADD_CO_U32_e64 killed $sgpr26, $vgpr2, 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr3, dead renamable $vcc = V_ADDC_U32_e64 killed $vgpr10, killed $vgpr3, killed $vcc, 0, implicit $exec - ; GFX90A-NEXT: renamable $vgpr27 = V_MOV_B32_e32 0, implicit $exec + ; GFX90A-NEXT: renamable $vgpr27 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: renamable $vgpr49 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr35 = COPY renamable $vgpr27, implicit $exec ; GFX90A-NEXT: renamable $vgpr39 = COPY renamable $vgpr27, implicit $exec @@ -989,7 +989,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: renamable $vgpr10 = V_OR_B32_e32 $vgpr50, killed $vgpr16, implicit $exec ; GFX90A-NEXT: renamable $vgpr54 = V_OR_B32_e32 killed $vgpr10, killed $vgpr14, implicit $exec - ; GFX90A-NEXT: renamable $vgpr55 = V_MOV_B32_e32 0, 
implicit $exec + ; GFX90A-NEXT: renamable $vgpr55 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX90A-NEXT: DS_WRITE_B64_gfx9 killed renamable $vgpr55, renamable $vgpr54_vgpr55, 0, 0, implicit $exec :: (store (s64) into `ptr addrspace(3) null`, addrspace 3) ; GFX90A-NEXT: renamable $sgpr12_sgpr13 = S_MOV_B64 0 ; GFX90A-NEXT: S_BRANCH %bb.69 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll index 50fa7ac2a19aa..4f81d3599d1de 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline-npm.ll @@ -10,9 +10,9 @@ ; GCN-O0: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(atomic-expand,verify,gc-lowering,lower-constant-intrinsics,unreachableblockelim,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(lower-switch,lower-invoke,unreachableblockelim,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa,require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,localstackalloc))),require,cgscc(function(machine-function(reg-usage-propagation,phi-node-elimination,two-address-instruction,regallocfast,si-fix-vgpr-copies,remove-redundant-debug-values,fixup-statepoint-caller-saved,prolog-epilog,post-ra-pseudos,si-post-ra-bundler,fentry-insert,xray-instrumentation,patchable-function,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,branch-relaxation,reg-usage-collector,remov
e-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O2: require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,ph
i-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O2: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,early-cse<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,early-cse<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-
independent-subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) -; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-subregs,amdgpu-rewrite-par
tial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) +; GCN-O3: 
require,require,require,pre-isel-intrinsic-lowering,function(expand-large-div-rem,expand-fp),amdgpu-remove-incompatible-functions,amdgpu-printf-runtime-binding,amdgpu-lower-ctor-dtor,function(amdgpu-image-intrinsic-opt),expand-variadics,amdgpu-always-inline,always-inline,amdgpu-export-kernel-runtime-handles,amdgpu-sw-lower-lds,amdgpu-lower-module-lds,function(amdgpu-atomic-optimizer,atomic-expand,amdgpu-promote-alloca,separate-const-offset-from-gep<>,slsr,gvn<>,nary-reassociate,early-cse<>,amdgpu-codegenprepare,verify,loop-mssa(loop-reduce),mergeicmps,expand-memcmp,gc-lowering,lower-constant-intrinsics,unreachableblockelim,consthoist,replace-with-veclib,partially-inline-libcalls,ee-instrument,scalarize-masked-mem-intrin,expand-reductions,gvn<>),amdgpu-preload-kernel-arguments,function(amdgpu-lower-kernel-arguments),amdgpu-lower-buffer-fat-pointers,cgscc(function(codegenprepare,load-store-vectorizer,lower-switch,lower-invoke,unreachableblockelim,flatten-cfg,sink,amdgpu-late-codegenprepare,amdgpu-unify-divergent-exit-nodes,fix-irreducible,unify-loop-exits,StructurizeCFGPass,amdgpu-annotate-uniform,si-annotate-control-flow,amdgpu-rewrite-undef-for-phi,lcssa)),amdgpu-perf-hint,cgscc(function(require,callbr-prepare,safe-stack,stack-protector,verify)),cgscc(function(machine-function(amdgpu-isel,si-fix-sgpr-copies,si-i1-copies,finalize-isel,early-tailduplication,opt-phis,stack-coloring,localstackalloc,dead-mi-elimination,early-machinelicm,machine-cse,machine-sink,peephole-opt,dead-mi-elimination,si-fold-operands,gcn-dpp-combine,si-load-store-opt,si-peephole-sdwa,early-machinelicm,machine-cse,si-fold-operands,dead-mi-elimination,si-shrink-instructions))),require,cgscc(function(machine-function(reg-usage-propagation,amdgpu-prepare-agpr-alloc,detect-dead-lanes,dead-mi-elimination,init-undef,process-imp-defs,unreachable-mbb-elimination,require,si-opt-vgpr-liverange,require,phi-node-elimination,si-lower-control-flow,two-address-instruction,register-coalescer,rename-independent-
subregs,amdgpu-rewrite-partial-reg-uses,machine-scheduler,amdgpu-pre-ra-optimizations,si-wqm,si-optimize-exec-masking-pre-ra,si-form-memory-clauses,amdgpu-pre-ra-long-branch-reg,greedy,virt-reg-rewriter,stack-slot-coloring,si-lower-sgpr-spills,si-pre-allocate-wwm-regs,greedy,si-lower-wwm-copies,virt-reg-rewriter,amdgpu-reserve-wwm-regs,greedy,amdgpu-nsa-reassign,virt-reg-rewriter,amdgpu-mark-last-scratch-load,machine-cp,machinelicm,si-fix-vgpr-copies,si-optimize-exec-masking,remove-redundant-debug-values,fixup-statepoint-caller-saved,postra-machine-sink,shrink-wrap,prolog-epilog,branch-folder,tailduplication,machine-latecleanup,machine-cp,post-ra-pseudos,si-shrink-instructions,si-post-ra-bundler,postmisched,block-placement,fentry-insert,xray-instrumentation,patchable-function,gcn-create-vopd,si-memory-legalizer,si-insert-waitcnts,si-late-branch-lowering,si-pre-emit-peephole,post-RA-hazard-rec,amdgpu-wait-sgpr-hazards,amdgpu-insert-delay-alu,branch-relaxation,reg-usage-collector,remove-loads-into-fake-uses,live-debug-values,machine-sanmd,stack-frame-layout,verify),free-machine-function)) define void @empty() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index af3241e95e91d..2a5c65278f7dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -329,6 +329,7 @@ ; GCN-O1-NEXT: Remove dead machine instructions ; GCN-O1-NEXT: SI Shrink Instructions ; GCN-O1-NEXT: Register Usage Information Propagation +; GCN-O1-NEXT: AMDGPU Prepare AGPR Alloc ; GCN-O1-NEXT: Detect Dead Lanes ; GCN-O1-NEXT: Remove dead machine instructions ; GCN-O1-NEXT: Init Undef Pass @@ -640,6 +641,7 @@ ; GCN-O1-OPTS-NEXT: Remove dead machine instructions ; GCN-O1-OPTS-NEXT: SI Shrink Instructions ; GCN-O1-OPTS-NEXT: Register Usage Information Propagation +; GCN-O1-OPTS-NEXT: AMDGPU Prepare AGPR Alloc ; GCN-O1-OPTS-NEXT: Detect Dead Lanes ; GCN-O1-OPTS-NEXT: Remove dead machine instructions ; 
GCN-O1-OPTS-NEXT: Init Undef Pass @@ -956,6 +958,7 @@ ; GCN-O2-NEXT: Remove dead machine instructions ; GCN-O2-NEXT: SI Shrink Instructions ; GCN-O2-NEXT: Register Usage Information Propagation +; GCN-O2-NEXT: AMDGPU Prepare AGPR Alloc ; GCN-O2-NEXT: Detect Dead Lanes ; GCN-O2-NEXT: Remove dead machine instructions ; GCN-O2-NEXT: Init Undef Pass @@ -1286,6 +1289,7 @@ ; GCN-O3-NEXT: Remove dead machine instructions ; GCN-O3-NEXT: SI Shrink Instructions ; GCN-O3-NEXT: Register Usage Information Propagation +; GCN-O3-NEXT: AMDGPU Prepare AGPR Alloc ; GCN-O3-NEXT: Detect Dead Lanes ; GCN-O3-NEXT: Remove dead machine instructions ; GCN-O3-NEXT: Init Undef Pass diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll index 67ae05eb6f0b8..561eaca3b77df 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll @@ -4365,8 +4365,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; NOLIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm: ; NOLIT-SRCC: ; %bb.0: ; %bb ; NOLIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 -; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 0 ; NOLIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 0 @@ -4465,8 +4465,8 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm(ptr addrspace(1) %arg) # ; LIT-SRCC-LABEL: test_mfma_f32_32x32x1f32_imm: ; LIT-SRCC: ; %bb.0: ; %bb ; LIT-SRCC-NEXT: v_mov_b32_e32 v0, 1.0 -; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a0, 1.0 +; LIT-SRCC-NEXT: v_accvgpr_write_b32 a1, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a2, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a3, 0 ; LIT-SRCC-NEXT: v_accvgpr_write_b32 a4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll 
index 3844d6054e130..cf244f0b1f884 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.ll @@ -6,16 +6,15 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX942-LABEL: matmul_kernel: ; GFX942: ; %bb.0: ; %entry ; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: v_mov_b32_e32 v0, 0 -; GFX942-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX942-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX942-NEXT: s_mov_b32 s2, 0 ; GFX942-NEXT: v_accvgpr_write_b32 a1, 0 +; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: s_cmp_lg_u32 s0, 0 ; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GFX942-NEXT: s_mov_b32 s3, 0 ; GFX942-NEXT: s_branch .LBB0_2 ; GFX942-NEXT: .LBB0_1: ; %bb2 ; GFX942-NEXT: ; in Loop: Header=BB0_2 Depth=1 @@ -43,16 +42,15 @@ define amdgpu_kernel void @matmul_kernel(i32 %a0, i32 %a1) { ; GFX908-LABEL: matmul_kernel: ; GFX908: ; %bb.0: ; %entry ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_accvgpr_write_b32 a2, 0 ; GFX908-NEXT: v_accvgpr_write_b32 a1, 0 ; GFX908-NEXT: s_mov_b32 s2, 0 -; GFX908-NEXT: v_accvgpr_write_b32 a2, v0 +; GFX908-NEXT: s_mov_b32 s3, 0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) ; GFX908-NEXT: s_cmp_lg_u32 s0, 0 ; GFX908-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX908-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 -; GFX908-NEXT: s_mov_b32 s3, 0 ; GFX908-NEXT: s_branch .LBB0_2 ; GFX908-NEXT: .LBB0_1: ; %bb2 ; GFX908-NEXT: ; in Loop: Header=BB0_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir index ee5481617cf59..01506d0af1913 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-mov.mir @@ -80,16 +80,16 @@ body: | ; COALESCE-NEXT: 
S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc ; COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0 ; COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc - ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec ; COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec - ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.1: ; COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; COALESCE-NEXT: {{ $}} - ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -103,10 +103,10 @@ body: | ; COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY 
[[V_ACCVGPR_WRITE_B32_e64_1]].sub1 - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] + ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.3: @@ -134,16 +134,16 @@ body: | ; GFX908-COALESCE-NEXT: S_BITCMP1_B32 [[S_LOAD_DWORD_IMM]], 0, implicit-def $scc ; GFX908-COALESCE-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_64 = S_MOV_B32 0 ; GFX908-COALESCE-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit killed $scc - ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[S_CSELECT_B64_]], implicit $exec ; GFX908-COALESCE-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec - ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = 
S_MOV_B32 0 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.1: ; GFX908-COALESCE-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX908-COALESCE-NEXT: {{ $}} - ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 -1 ; GFX908-COALESCE-NEXT: $vcc = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: S_CBRANCH_VCCNZ %bb.3, implicit killed $vcc @@ -157,10 +157,10 @@ body: | ; GFX908-COALESCE-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_ASHR_I32_]], [[S_OR_B32_]], implicit-def dead $scc ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_64 = COPY [[S_MOV_B32_]].sub0 ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]] - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub1 - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = COPY [[COPY1]] + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub1 + ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY2]], [[COPY2]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64_xexec = S_MOV_B64 0 ; GFX908-COALESCE-NEXT: {{ 
$}} ; GFX908-COALESCE-NEXT: bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir index 49c0aaf9fb390..a9207de317ea1 100644 --- a/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir +++ b/llvm/test/CodeGen/AMDGPU/no-fold-accvgpr-read.mir @@ -67,7 +67,7 @@ body: | ; COALESCE-NEXT: bb.1: ; COALESCE-NEXT: successors: %bb.3(0x80000000) ; COALESCE-NEXT: {{ $}} - ; COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: S_BRANCH %bb.3 ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.2: @@ -78,13 +78,13 @@ body: | ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], 0, 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; COALESCE-NEXT: {{ $}} ; COALESCE-NEXT: bb.3: - ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; COALESCE-NEXT: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 
[[COPY2]], implicit $mode, implicit $exec ; COALESCE-NEXT: undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 = nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec - ; COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + ; COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 @@ -105,28 +105,28 @@ body: | ; GFX908-COALESCE-NEXT: bb.1: ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000) ; GFX908-COALESCE-NEXT: {{ $}} - ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec + ; GFX908-COALESCE-NEXT: undef [[AV_MOV_:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: S_BRANCH %bb.3 ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.2: ; GFX908-COALESCE-NEXT: successors: %bb.3(0x80000000) ; GFX908-COALESCE-NEXT: {{ $}} - ; GFX908-COALESCE-NEXT: undef [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub0:areg_128_align2 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub1:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[V_ACCVGPR_WRITE_B32_e64_1]].sub0 + ; GFX908-COALESCE-NEXT: undef [[AV_MOV_1:%[0-9]+]].sub0:areg_128_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub1:areg_128_align2 = COPY [[AV_MOV_1]].sub0 + ; 
GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub2:areg_128_align2 = COPY [[AV_MOV_1]].sub0 + ; GFX908-COALESCE-NEXT: [[AV_MOV_1:%[0-9]+]].sub3:areg_128_align2 = COPY [[AV_MOV_1]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; GFX908-COALESCE-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY [[S_MOV_B32_]].sub0_sub1 - ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_ACCVGPR_WRITE_B32_e64_1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[AV_MOV_1]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_1:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: [[V_MFMA_F32_16X16X16F16_e64_2:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX908-COALESCE-NEXT: [[V_ACCVGPR_WRITE_B32_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX908-COALESCE-NEXT: [[AV_MOV_:%[0-9]+]]:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 [[COPY1]], [[COPY1]], [[V_MFMA_F32_16X16X16F16_e64_2]], 0, 0, 0, implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: {{ $}} ; GFX908-COALESCE-NEXT: bb.3: - ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_ACCVGPR_WRITE_B32_e64_]].sub0 + ; GFX908-COALESCE-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[AV_MOV_]].sub0 ; GFX908-COALESCE-NEXT: [[V_CVT_F16_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e32 [[COPY2]], implicit $mode, implicit $exec ; GFX908-COALESCE-NEXT: undef [[V_PACK_B32_F16_e64_:%[0-9]+]].sub0:vreg_64_align2 
= nofpexcept V_PACK_B32_F16_e64 0, [[V_CVT_F16_F32_e32_]], 0, 0, 0, 0, implicit $mode, implicit $exec - ; GFX908-COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = V_MOV_B32_e32 0, implicit $exec + ; GFX908-COALESCE-NEXT: [[V_PACK_B32_F16_e64_:%[0-9]+]].sub1:vreg_64_align2 = AV_MOV_B32_IMM_PSEUDO 0, implicit $exec ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 ; GFX908-COALESCE-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll index 663fd98b46bf7..ce96766116089 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll @@ -17,9 +17,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 - ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX908-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; REGALLOC-GFX908-NEXT: 
[[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec + ; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX908-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) @@ -42,8 +42,8 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; PEI-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec - ; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec + ; PEI-GFX908-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; PEI-GFX908-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec ; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, 
implicit $vgpr0_vgpr1 @@ -62,9 +62,9 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 - ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; REGALLOC-GFX90A-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; REGALLOC-GFX90A-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec + ; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) ; REGALLOC-GFX90A-NEXT: S_ENDPGM 0 @@ -85,8 +85,8 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 { ; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, 
addrspace 1) ; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec - ; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec - ; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec + ; PEI-GFX90A-NEXT: renamable $vgpr0 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; PEI-GFX90A-NEXT: renamable $vgpr1 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec ; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5) ; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll index bd255e88b9512..648b59f69ea79 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll @@ -9,9 +9,9 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 { ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4) ; GCN-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec - ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 
[[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[AV_MOV_:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 1, implicit $exec + ; GCN-NEXT: [[AV_MOV_1:%[0-9]+]]:vgpr_32 = AV_MOV_B32_IMM_PSEUDO 2, implicit $exec + ; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[AV_MOV_]], [[AV_MOV_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %14.sub0 ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]] ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) poison`, addrspace 1) From 20fc297ce3c2a2151bf618cf515f2b1981d4821c Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Fri, 18 Jul 2025 09:21:20 +0100 Subject: [PATCH 309/813] [LoopVectorizer] Only check register pressure for VFs that have been enabled via maxBandwidth (#149056) Currently if MaxBandwidth is enabled, the register pressure is checked for each VF. This changes that to only perform said check if the VF would not have otherwise been considered by the LoopVectorizer if maxBandwidth was not enabled. Theoretically this allows for higher VFs to be considered than would otherwise be deemed "safe" (from a regpressure perspective), but more concretely this reduces the amount of work done at compile-time when maxBandwidth is enabled. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 38 +++++++++++++------ .../Transforms/Vectorize/VPlanAnalysis.cpp | 9 +++-- llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 6 ++- .../AArch64/maxbandwidth-regpressure.ll | 37 ++++++++++++++++++ 4 files changed, 74 insertions(+), 16 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index ceeabd65cced3..da3532b2f3385 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -947,9 +947,8 @@ class LoopVectorizationCostModel { /// user options, for the given register kind. bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind); - /// \return True if maximizing vector bandwidth is enabled by the target or - /// user options, for the given vector factor. - bool useMaxBandwidth(ElementCount VF); + /// \return True if register pressure should be calculated for the given VF. + bool shouldCalculateRegPressureForVF(ElementCount VF); /// \return The size (in bits) of the smallest and widest types in the code /// that needs to be vectorized. We ignore values that remain scalar such as @@ -1736,6 +1735,9 @@ class LoopVectorizationCostModel { /// Whether this loop should be optimized for size based on function attribute /// or profile information. bool OptForSize; + + /// The highest VF possible for this loop, without using MaxBandwidth. + FixedScalableVFPair MaxPermissibleVFWithoutMaxBW; }; } // end namespace llvm @@ -3832,10 +3834,16 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { return FixedScalableVFPair::getNone(); } -bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) { - return useMaxBandwidth(VF.isScalable() - ? 
TargetTransformInfo::RGK_ScalableVector - : TargetTransformInfo::RGK_FixedWidthVector); +bool LoopVectorizationCostModel::shouldCalculateRegPressureForVF( + ElementCount VF) { + if (!useMaxBandwidth(VF.isScalable() + ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector)) + return false; + // Only calculate register pressure for VFs enabled by MaxBandwidth. + return ElementCount::isKnownGT( + VF, VF.isScalable() ? MaxPermissibleVFWithoutMaxBW.ScalableVF + : MaxPermissibleVFWithoutMaxBW.FixedVF); } bool LoopVectorizationCostModel::useMaxBandwidth( @@ -3911,6 +3919,12 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget( ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector : TargetTransformInfo::RGK_FixedWidthVector; ElementCount MaxVF = MaxVectorElementCount; + + if (MaxVF.isScalable()) + MaxPermissibleVFWithoutMaxBW.ScalableVF = MaxVF; + else + MaxPermissibleVFWithoutMaxBW.FixedVF = MaxVF; + if (useMaxBandwidth(RegKind)) { auto MaxVectorElementCountMaxBW = ElementCount::get( llvm::bit_floor(WidestRegister.getKnownMinValue() / SmallestType), @@ -4264,9 +4278,10 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { if (VF.isScalar()) continue; - /// Don't consider the VF if it exceeds the number of registers for the - /// target. - if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) + /// If the VF was proposed due to MaxBandwidth, don't consider the VF if + /// it exceeds the number of registers for the target. 
+ if (CM.shouldCalculateRegPressureForVF(VF) && + RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) continue; InstructionCost C = CM.expectedCost(VF); @@ -7044,7 +7059,8 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() { InstructionCost Cost = cost(*P, VF); VectorizationFactor CurrentFactor(VF, Cost, ScalarCost); - if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) { + if (CM.shouldCalculateRegPressureForVF(VF) && + RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) { LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width " << VF << " because it uses too many registers\n"); continue; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index b27a7ffeed208..ca8729ae2e00e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -404,9 +404,12 @@ static unsigned getVFScaleFactor(VPRecipeBase *R) { return 1; } -bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI) const { - return any_of(MaxLocalUsers, [&TTI](auto &LU) { - return LU.second > TTI.getNumberOfRegisters(LU.first); +bool VPRegisterUsage::exceedsMaxNumRegs(const TargetTransformInfo &TTI, + unsigned OverrideMaxNumRegs) const { + return any_of(MaxLocalUsers, [&TTI, &OverrideMaxNumRegs](auto &LU) { + return LU.second > (OverrideMaxNumRegs > 0 + ? OverrideMaxNumRegs + : TTI.getNumberOfRegisters(LU.first)); }); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index 7bcf9dba8c311..cd86d27cf9122 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -85,8 +85,10 @@ struct VPRegisterUsage { SmallMapVector MaxLocalUsers; /// Check if any of the tracked live intervals exceeds the number of - /// available registers for the target. 
- bool exceedsMaxNumRegs(const TargetTransformInfo &TTI) const; + /// available registers for the target. If non-zero, OverrideMaxNumRegs + /// is used in place of the target's number of registers. + bool exceedsMaxNumRegs(const TargetTransformInfo &TTI, + unsigned OverrideMaxNumRegs = 0) const; }; /// Estimate the register usage for \p Plan and vectorization factors in \p VFs diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll new file mode 100644 index 0000000000000..ce639f9150078 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll @@ -0,0 +1,37 @@ +; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-REGS-VP +; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOREGS-VP + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define i32 @dotp(ptr %a, ptr %b) #0 { +; CHECK-REGS-VP-NOT: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers +; CHECK-REGS-VP: LV: Selecting VF: vscale x 8. +; +; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 8 because it uses too many registers +; CHECK-NOREGS-VP: LV(REG): Not considering vector loop of width vscale x 16 because it uses too many registers +; CHECK-NOREGS-VP: LV: Selecting VF: vscale x 4. 
+entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i32 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i32 + %mul = mul i32 %ext.b, %ext.a + %sub = sub i32 0, %mul + %add = add i32 %accum, %sub + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + +attributes #0 = { vscale_range(1,16) "target-features"="+sve" } From 88721d63d482cf8f95deb66e74462b2cf583be8f Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 18 Jul 2025 08:24:46 +0000 Subject: [PATCH 310/813] [lldb][test] Fix PDB UdtLayoutTest https://github.com/llvm/llvm-project/pull/149282 changed the max children depth and that caused one part of the output to become `{...}`. The original PR set a higher limit for a different test, so I'm doing the same here. 
--- lldb/test/Shell/SymbolFile/PDB/Inputs/UdtLayoutTest.script | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/Shell/SymbolFile/PDB/Inputs/UdtLayoutTest.script b/lldb/test/Shell/SymbolFile/PDB/Inputs/UdtLayoutTest.script index 91de55f4ade4a..43018eacf709b 100644 --- a/lldb/test/Shell/SymbolFile/PDB/Inputs/UdtLayoutTest.script +++ b/lldb/test/Shell/SymbolFile/PDB/Inputs/UdtLayoutTest.script @@ -1,3 +1,4 @@ +settings set target.max-children-depth 10 breakpoint set --file UdtLayoutTest.cpp --line 60 run target variable From 3ce06b8c2196be6368f0e06862ac1849379cce41 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Fri, 18 Jul 2025 09:34:42 +0100 Subject: [PATCH 311/813] [Clang][Driver] Expose relocation model as multilib flags (#149132) If a multilib collection contains libraries built for different methods of accessing global data (via absolute address, or via a GOT in -fPIC style, or as an offset from a fixed register in Arm -frwpi style), then `multilib.yaml` will need to know which relocation model an application is using in order to select the right library. Even if a multilib collection only supports one relocation model, it's still useful for `multilib.yaml` to be able to tell if the user has selected the right one, so as to give a useful error message if they haven't, instead of silently selecting a library that won't work. In this commit we determine the PIC / ROPI / RWPI status using the existing logic in `ParsePICArgs`, and translate it back into a canonical set of multilib selection flags. 
--- clang/lib/Driver/ToolChain.cpp | 40 ++++++++++++++++--- .../test/Driver/print-multi-selection-flags.c | 19 +++++++++ 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 3f9b808b2722e..481f575518b93 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -258,10 +258,10 @@ static void getAArch64MultilibFlags(const Driver &D, processMultilibCustomFlags(Result, Args); } -static void getARMMultilibFlags(const Driver &D, - const llvm::Triple &Triple, - const llvm::opt::ArgList &Args, - Multilib::flags_list &Result) { +static void getARMMultilibFlags(const Driver &D, const llvm::Triple &Triple, + llvm::Reloc::Model RelocationModel, + const llvm::opt::ArgList &Args, + Multilib::flags_list &Result) { std::vector Features; llvm::ARM::FPUKind FPUKind = tools::arm::getARMTargetFeatures( D, Triple, Args, Features, false /*ForAs*/, true /*ForMultilib*/); @@ -304,6 +304,18 @@ static void getARMMultilibFlags(const Driver &D, llvm_unreachable("Invalid float ABI"); } + if (RelocationModel == llvm::Reloc::ROPI || + RelocationModel == llvm::Reloc::ROPI_RWPI) + Result.push_back("-fropi"); + else + Result.push_back("-fno-ropi"); + + if (RelocationModel == llvm::Reloc::RWPI || + RelocationModel == llvm::Reloc::ROPI_RWPI) + Result.push_back("-frwpi"); + else + Result.push_back("-fno-rwpi"); + const Arg *BranchProtectionArg = Args.getLastArgNoClaim(options::OPT_mbranch_protection_EQ); if (BranchProtectionArg) { @@ -344,6 +356,18 @@ ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { const llvm::Triple Triple(ComputeEffectiveClangTriple(Args)); Result.push_back("--target=" + Triple.str()); + // A difference of relocation model (absolutely addressed data, PIC, Arm + // ROPI/RWPI) is likely to change whether a particular multilib variant is + // compatible with a given link. 
Determine the relocation model of the + // current link, so as to add appropriate multilib flags. + llvm::Reloc::Model RelocationModel; + unsigned PICLevel; + bool IsPIE; + { + RegisterEffectiveTriple TripleRAII(*this, Triple); + std::tie(RelocationModel, PICLevel, IsPIE) = ParsePICArgs(*this, Args); + } + switch (Triple.getArch()) { case llvm::Triple::aarch64: case llvm::Triple::aarch64_32: @@ -354,7 +378,7 @@ ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { case llvm::Triple::armeb: case llvm::Triple::thumb: case llvm::Triple::thumbeb: - getARMMultilibFlags(D, Triple, Args, Result); + getARMMultilibFlags(D, Triple, RelocationModel, Args, Result); break; case llvm::Triple::riscv32: case llvm::Triple::riscv64: @@ -376,6 +400,12 @@ ToolChain::getMultilibFlags(const llvm::opt::ArgList &Args) const { else Result.push_back("-fexceptions"); + if (RelocationModel == llvm::Reloc::PIC_) + Result.push_back(IsPIE ? (PICLevel > 1 ? "-fPIE" : "-fpie") + : (PICLevel > 1 ? "-fPIC" : "-fpic")); + else + Result.push_back("-fno-pic"); + // Sort and remove duplicates. 
std::sort(Result.begin(), Result.end()); Result.erase(llvm::unique(Result), Result.end()); diff --git a/clang/test/Driver/print-multi-selection-flags.c b/clang/test/Driver/print-multi-selection-flags.c index 5f9383fbed8f4..8cf8f04bb6b48 100644 --- a/clang/test/Driver/print-multi-selection-flags.c +++ b/clang/test/Driver/print-multi-selection-flags.c @@ -107,3 +107,22 @@ // CHECK-AARCH64-MULTILIB-CUSTOM-FLAG: --target=aarch64-unknown-none-eabi // CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=foo // CHECK-MULTILIB-CUSTOM-FLAG-DAG: -fmultilib-flag=bar + +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fropi | FileCheck --check-prefixes=CHECK-ROPI,CHECK-NO-RWPI,CHECK-NO-PIC %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -frwpi | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-RWPI,CHECK-NO-PIC %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fropi -frwpi | FileCheck --check-prefixes=CHECK-ROPI,CHECK-RWPI,CHECK-NO-PIC %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fno-ropi -fno-rwpi | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-NO-PIC %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-NO-PIC %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fpic | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-PIC1 %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fPIC | FileCheck 
--check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-PIC2 %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fpie | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-PIE1 %s +// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -fPIE | FileCheck --check-prefixes=CHECK-NO-ROPI,CHECK-NO-RWPI,CHECK-PIE2 %s +// CHECK-PIC2: -fPIC +// CHECK-PIE2: -fPIE +// CHECK-NO-PIC: -fno-pic +// CHECK-NO-ROPI: -fno-ropi +// CHECK-NO-RWPI: -fno-rwpi +// CHECK-PIC1: -fpic +// CHECK-PIE1: -fpie +// CHECK-ROPI: -fropi +// CHECK-RWPI: -frwpi From 28208c8e2713cdbc3ad39314e1cbd5c9efbe48d7 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Fri, 18 Jul 2025 09:44:01 +0100 Subject: [PATCH 312/813] [DebugInfo] Remove debug-intrinsic coroutine codepaths (#149068) There are a few duplicate paths/facilities in the coroutine code to deal with both intrinsics and debug-records; we can now delete the intrinsic version. --- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 52 ++----------------- llvm/lib/Transforms/Coroutines/CoroInternal.h | 5 +- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 23 +++----- 3 files changed, 11 insertions(+), 69 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index fbeb7218ba9a3..a65d0fb54c212 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1103,14 +1103,13 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { FrameTy->getElementType(FrameData.getFieldIndex(E.first)), GEP, SpillAlignment, E.first->getName() + Twine(".reload")); - TinyPtrVector DIs = findDbgDeclares(Def); TinyPtrVector DVRs = findDVRDeclares(Def); // Try best to find dbg.declare. If the spill is a temp, there may not // be a direct dbg.declare. 
Walk up the load chain to find one from an // alias. if (F->getSubprogram()) { auto *CurDef = Def; - while (DIs.empty() && DVRs.empty() && isa(CurDef)) { + while (DVRs.empty() && isa(CurDef)) { auto *LdInst = cast(CurDef); // Only consider ptr to ptr same type load. if (LdInst->getPointerOperandType() != LdInst->getType()) @@ -1118,12 +1117,11 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { CurDef = LdInst->getPointerOperand(); if (!isa(CurDef)) break; - DIs = findDbgDeclares(CurDef); DVRs = findDVRDeclares(CurDef); } } - auto SalvageOne = [&](auto *DDI) { + auto SalvageOne = [&](DbgVariableRecord *DDI) { // This dbg.declare is preserved for all coro-split function // fragments. It will be unreachable in the main function, and // processed by coro::salvageDebugInfo() by the Cloner. @@ -1137,7 +1135,6 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { // will be deleted in all coro-split functions. coro::salvageDebugInfo(ArgToAllocaMap, *DDI, false /*UseEntryValue*/); }; - for_each(DIs, SalvageOne); for_each(DVRs, SalvageOne); } @@ -1225,8 +1222,7 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { SmallVector DIs; SmallVector DbgVariableRecords; findDbgUsers(DIs, Alloca, &DbgVariableRecords); - for (auto *DVI : DIs) - DVI->replaceUsesOfWith(Alloca, G); + assert(DIs.empty() && "Should never see debug-intrinsics"); for (auto *DVR : DbgVariableRecords) DVR->replaceVariableLocationOp(Alloca, G); @@ -1920,48 +1916,6 @@ salvageDebugInfoImpl(SmallDenseMap &ArgToAllocaMap, return {{*Storage, *Expr}}; } -void coro::salvageDebugInfo( - SmallDenseMap &ArgToAllocaMap, - DbgVariableIntrinsic &DVI, bool UseEntryValue) { - - Function *F = DVI.getFunction(); - // Follow the pointer arithmetic all the way to the incoming - // function argument and convert into a DIExpression. 
- bool SkipOutermostLoad = !isa(DVI); - Value *OriginalStorage = DVI.getVariableLocationOp(0); - - auto SalvagedInfo = - ::salvageDebugInfoImpl(ArgToAllocaMap, UseEntryValue, F, OriginalStorage, - DVI.getExpression(), SkipOutermostLoad); - if (!SalvagedInfo) - return; - - Value *Storage = &SalvagedInfo->first; - DIExpression *Expr = &SalvagedInfo->second; - - DVI.replaceVariableLocationOp(OriginalStorage, Storage); - DVI.setExpression(Expr); - // We only hoist dbg.declare today since it doesn't make sense to hoist - // dbg.value since it does not have the same function wide guarantees that - // dbg.declare does. - if (isa(DVI)) { - std::optional InsertPt; - if (auto *I = dyn_cast(Storage)) { - InsertPt = I->getInsertionPointAfterDef(); - // Update DILocation only if variable was not inlined. - DebugLoc ILoc = I->getDebugLoc(); - DebugLoc DVILoc = DVI.getDebugLoc(); - if (ILoc && DVILoc && - DVILoc->getScope()->getSubprogram() == - ILoc->getScope()->getSubprogram()) - DVI.setDebugLoc(I->getDebugLoc()); - } else if (isa(Storage)) - InsertPt = F->getEntryBlock().begin(); - if (InsertPt) - DVI.moveBefore(*(*InsertPt)->getParent(), *InsertPt); - } -} - void coro::salvageDebugInfo( SmallDenseMap &ArgToAllocaMap, DbgVariableRecord &DVR, bool UseEntryValue) { diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index b53c5a48eb10b..52f4ffe292dae 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -34,14 +34,11 @@ void suppressCoroAllocs(CoroIdInst *CoroId); void suppressCoroAllocs(LLVMContext &Context, ArrayRef CoroAllocs); -/// Attempts to rewrite the location operand of debug intrinsics in terms of +/// Attempts to rewrite the location operand of debug records in terms of /// the coroutine frame pointer, folding pointer offsets into the DIExpression /// of the intrinsic. 
/// If the frame pointer is an Argument, store it into an alloca to enhance the /// debugability. -void salvageDebugInfo( - SmallDenseMap &ArgToAllocaMap, - DbgVariableIntrinsic &DVI, bool IsEntryPoint); void salvageDebugInfo( SmallDenseMap &ArgToAllocaMap, DbgVariableRecord &DVR, bool UseEntryValue); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 5a8a41f0dc432..64b33e46404f0 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -618,19 +618,15 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape, } } -/// Returns all DbgVariableIntrinsic in F. -static std::pair, - SmallVector> -collectDbgVariableIntrinsics(Function &F) { - SmallVector Intrinsics; +/// Returns all debug records in F. +static SmallVector +collectDbgVariableRecords(Function &F) { SmallVector DbgVariableRecords; for (auto &I : instructions(F)) { for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) DbgVariableRecords.push_back(&DVR); - if (auto *DVI = dyn_cast(&I)) - Intrinsics.push_back(DVI); } - return {Intrinsics, DbgVariableRecords}; + return DbgVariableRecords; } void coro::BaseCloner::replaceSwiftErrorOps() { @@ -638,13 +634,11 @@ void coro::BaseCloner::replaceSwiftErrorOps() { } void coro::BaseCloner::salvageDebugInfo() { - auto [Worklist, DbgVariableRecords] = collectDbgVariableIntrinsics(*NewF); + auto DbgVariableRecords = collectDbgVariableRecords(*NewF); SmallDenseMap ArgToAllocaMap; // Only 64-bit ABIs have a register we can refer to with the entry value. 
bool UseEntryValue = OrigF.getParent()->getTargetTriple().isArch64Bit(); - for (DbgVariableIntrinsic *DVI : Worklist) - coro::salvageDebugInfo(ArgToAllocaMap, *DVI, UseEntryValue); for (DbgVariableRecord *DVR : DbgVariableRecords) coro::salvageDebugInfo(ArgToAllocaMap, *DVR, UseEntryValue); @@ -655,7 +649,7 @@ void coro::BaseCloner::salvageDebugInfo() { return !isPotentiallyReachable(&NewF->getEntryBlock(), BB, nullptr, &DomTree); }; - auto RemoveOne = [&](auto *DVI) { + auto RemoveOne = [&](DbgVariableRecord *DVI) { if (IsUnreachableBlock(DVI->getParent())) DVI->eraseFromParent(); else if (isa_and_nonnull(DVI->getVariableLocationOp(0))) { @@ -669,7 +663,6 @@ void coro::BaseCloner::salvageDebugInfo() { DVI->eraseFromParent(); } }; - for_each(Worklist, RemoveOne); for_each(DbgVariableRecords, RemoveOne); } @@ -2022,9 +2015,7 @@ static void doSplitCoroutine(Function &F, SmallVectorImpl &Clones, // original function. The Cloner has already salvaged debug info in the new // coroutine funclets. 
SmallDenseMap ArgToAllocaMap; - auto [DbgInsts, DbgVariableRecords] = collectDbgVariableIntrinsics(F); - for (auto *DDI : DbgInsts) - coro::salvageDebugInfo(ArgToAllocaMap, *DDI, false /*UseEntryValue*/); + auto DbgVariableRecords = collectDbgVariableRecords(F); for (DbgVariableRecord *DVR : DbgVariableRecords) coro::salvageDebugInfo(ArgToAllocaMap, *DVR, false /*UseEntryValue*/); From d883d5fecf8aa7db6daa0b163599d42ca00c5808 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 21 Jan 2025 13:22:24 +0700 Subject: [PATCH 313/813] AMDGPU: Add testcase with bad regalloc behavior This demonstrates poor allocation due to not ordering AV classes relative to the A and V classes --- .../bad-agpr-vgpr-regalloc-priority.mir | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir diff --git a/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir new file mode 100644 index 0000000000000..1a457c94778fd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bad-agpr-vgpr-regalloc-priority.mir @@ -0,0 +1,74 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -start-before=greedy,2 -stop-after=virtregrewriter,2 -o - %s | FileCheck %s + +--- +name: bad_ra +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_64, preferred-register: '$sgpr4_sgpr5' } + - { id: 1, class: sgpr_128, preferred-register: '%2' } + - { id: 2, class: areg_128, preferred-register: '%1' } + - { id: 3, class: areg_128, preferred-register: '%4' } + - { id: 4, class: av_128, preferred-register: '%3' } + - { id: 5, class: areg_128, preferred-register: '%6' } + - { id: 6, class: vreg_128, preferred-register: '%5' } + - { id: 7, class: areg_128, preferred-register: '%4' } + - { id: 8, class: vgpr_32 } + - { id: 9, class: vgpr_32 } + - { id: 10, class: vgpr_32 } + 
- { id: 11, class: areg_128 } +liveins: + - { reg: '$sgpr4_sgpr5', virtual-reg: '%0' } +frameInfo: + maxAlignment: 1 + isCalleeSavedInfoValid: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + occupancy: 10 + vgprForAGPRCopy: '$vgpr255' + sgprForEXECCopy: '$sgpr100_sgpr101' +body: | + bb.0: + liveins: $sgpr4_sgpr5 + + ; CHECK-LABEL: name: bad_ra + ; CHECK: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + ; CHECK-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1) + ; CHECK-NEXT: renamable $vgpr4 = V_MOV_B32_e32 1065353216, implicit $exec + ; CHECK-NEXT: renamable $vgpr5 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: renamable $vgpr6 = V_MOV_B32_e32 1073741824, implicit $exec + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr1 = COPY renamable $agpr1 + ; CHECK-NEXT: renamable $vgpr0 = COPY renamable $agpr0 + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr4, $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable $vgpr3 = COPY renamable $agpr1 + ; CHECK-NEXT: renamable $vgpr2 = COPY killed renamable $agpr0 + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 killed $vgpr4, killed $vgpr6, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: renamable 
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3 + ; CHECK-NEXT: GLOBAL_STORE_DWORDX4_SADDR killed renamable $vgpr5, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr6_sgpr7 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4) + renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM renamable $sgpr6_sgpr7, 0, 0 :: ("amdgpu-noclobber" load (s128), addrspace 1) + %8:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec + %9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %10:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec + %2:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 + %3:areg_128 = V_MFMA_F32_4X4X1F32_e64 %8, %10, %2, 0, 0, 0, implicit $mode, implicit $exec + undef %4.sub1:av_128 = COPY %3.sub1 + %4.sub0:av_128 = COPY %3.sub0 + %11:areg_128 = V_MFMA_F32_4X4X1F32_e64 %8, %10, %3, 0, 0, 0, implicit $mode, implicit $exec + %4.sub3:av_128 = COPY %11.sub1 + %4.sub2:av_128 = COPY %11.sub0 + %7:areg_128 = COPY %4 + %5:areg_128 = V_MFMA_F32_4X4X1F32_e64 %8, %10, %7, 0, 0, 0, implicit $mode, implicit $exec + %6:vreg_128 = COPY %5 + GLOBAL_STORE_DWORDX4_SADDR %9, %6, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s128), addrspace 1) + S_ENDPGM 0 + +... From 534b9cdddde2d4f11516a8f689c6ba23a29b8bdc Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Fri, 18 Jul 2025 09:55:00 +0100 Subject: [PATCH 314/813] [LoopVectorizer][NFC] Update comment regarding VF register pressure. 
(#149478) --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index da3532b2f3385..f142e0796b52a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4278,8 +4278,9 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { if (VF.isScalar()) continue; - /// If the VF was proposed due to MaxBandwidth, don't consider the VF if - /// it exceeds the number of registers for the target. + /// If the register pressure needs to be considered for VF, + /// don't consider the VF as valid if it exceeds the number + /// of registers for the target. if (CM.shouldCalculateRegPressureForVF(VF) && RUs[I].exceedsMaxNumRegs(TTI, ForceTargetNumVectorRegs)) continue; From df9a864b046bb716e56f81409f6a01a17f3181d6 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Fri, 18 Jul 2025 09:56:17 +0100 Subject: [PATCH 315/813] [Offload] Implement event sync in amdgpu (#149300) --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 52 ++++++++++++++++++- .../unittests/OffloadAPI/common/Fixtures.hpp | 3 -- .../OffloadAPI/event/olWaitEvent.cpp | 3 -- 3 files changed, 50 insertions(+), 8 deletions(-) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 12c7cc62905c9..b2fd950c9d500 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -1115,6 +1115,18 @@ struct AMDGPUStreamTy { return Plugin::success(); } + /// Complete pending post actions until and including the event in target + /// slot. + Error completeUntil(uint32_t TargetSlot) { + for (uint32_t Slot = 0; Slot <= TargetSlot; ++Slot) { + // Take the post action of the operation if any. 
+ if (auto Err = Slots[Slot].performAction()) + return Err; + } + + return Plugin::success(); + } + /// Make the current stream wait on a specific operation of another stream. /// The idea is to make the current stream waiting on two signals: 1) the last /// signal of the current stream, and 2) the last signal of the other stream. @@ -1502,6 +1514,11 @@ struct AMDGPUStreamTy { return complete(); } + /// Synchronize the stream until the given event. The current thread waits + /// until the provided event is finalized, and it performs the pending post + /// actions for that and prior events. + Error synchronizeOn(AMDGPUEventTy &Event); + /// Query the stream and complete pending post actions if operations finished. /// Return whether all the operations completed. This operation does not block /// the calling thread. @@ -1575,6 +1592,21 @@ struct AMDGPUEventTy { return Stream.waitEvent(*this); } + Error sync() { + std::lock_guard Lock(Mutex); + + if (!RecordedStream) + return Plugin::error(ErrorCode::INVALID_ARGUMENT, + "event does not have any recorded stream"); + + // No need to wait on anything, the recorded stream already finished the + // corresponding operation. + if (RecordedSlot < 0) + return Plugin::success(); + + return RecordedStream->synchronizeOn(*this); + } + protected: /// The stream registered in this event. 
AMDGPUStreamTy *RecordedStream; @@ -1630,6 +1662,22 @@ Error AMDGPUStreamTy::waitEvent(const AMDGPUEventTy &Event) { return waitOnStreamOperation(RecordedStream, Event.RecordedSlot); } +Error AMDGPUStreamTy::synchronizeOn(AMDGPUEventTy &Event) { + std::lock_guard Lock(Mutex); + + // Wait until the requested slot has completed + if (auto Err = Slots[Event.RecordedSlot].Signal->wait( + StreamBusyWaitMicroseconds, &Device)) + return Err; + + // If the event is the last one in the stream, just do a full finalize + if (Event.RecordedSlot == last()) + return complete(); + + // Otherwise, only finalize until the appropriate event + return completeUntil(Event.RecordedSlot); +} + struct AMDGPUStreamManagerTy final : GenericDeviceResourceManagerTy> { using ResourceRef = AMDGPUResourceRef; @@ -2540,8 +2588,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Synchronize the current thread with the event. Error syncEventImpl(void *EventPtr) override { - return Plugin::error(ErrorCode::UNIMPLEMENTED, - "synchronize event not implemented"); + AMDGPUEventTy *Event = reinterpret_cast(EventPtr); + return Event->sync(); } /// Print information about the device. diff --git a/offload/unittests/OffloadAPI/common/Fixtures.hpp b/offload/unittests/OffloadAPI/common/Fixtures.hpp index e5d815ecda965..546921164f691 100644 --- a/offload/unittests/OffloadAPI/common/Fixtures.hpp +++ b/offload/unittests/OffloadAPI/common/Fixtures.hpp @@ -171,9 +171,6 @@ struct OffloadQueueTest : OffloadDeviceTest { struct OffloadEventTest : OffloadQueueTest { void SetUp() override { RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp()); - if (getPlatformBackend() == OL_PLATFORM_BACKEND_AMDGPU) - GTEST_SKIP() << "AMDGPU synchronize event not implemented"; - // Get an event from a memcpy. We can still use it in olGetEventInfo etc // after it has been waited on. 
void *Alloc; diff --git a/offload/unittests/OffloadAPI/event/olWaitEvent.cpp b/offload/unittests/OffloadAPI/event/olWaitEvent.cpp index 05356d4ef8d75..f80dabb4fc93f 100644 --- a/offload/unittests/OffloadAPI/event/olWaitEvent.cpp +++ b/offload/unittests/OffloadAPI/event/olWaitEvent.cpp @@ -14,9 +14,6 @@ using olWaitEventTest = OffloadQueueTest; OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olWaitEventTest); TEST_P(olWaitEventTest, Success) { - if (getPlatformBackend() == OL_PLATFORM_BACKEND_AMDGPU) - GTEST_SKIP() << "AMDGPU synchronize event not implemented"; - uint32_t Src = 42; void *DstPtr; From 3121cc31baa1aed697cc07c72d283891ffa529f6 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 18 Jul 2025 10:04:36 +0100 Subject: [PATCH 316/813] [lldb][test] TestNSDictionarySynthetic.py: adjust ptr depth in test Fixes failure after we introduced a default limit in https://github.com/llvm/llvm-project/pull/149282 We already did this test change on the Apple fork. --- .../nsdictionarysynth/TestNSDictionarySynthetic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py b/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py index e1d7e42bdd1a9..cd60227572be4 100644 --- a/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py +++ b/lldb/test/API/functionalities/data-formatter/nsdictionarysynth/TestNSDictionarySynthetic.py @@ -2,7 +2,6 @@ Test lldb data formatter subsystem. 
""" - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -120,6 +119,8 @@ def cleanup(): '@"2 elements"', ], ) + + self.runCmd("settings set target.max-children-depth 6") self.expect( "frame variable mutabledict --ptr-depth 3", substrs=[ From 0b7a95a6fd81b31634a3723a0bea6d9d91bbc230 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Jul 2025 17:48:06 +0900 Subject: [PATCH 317/813] Partially Reapply "RuntimeLibcalls: Add methods to recognize libcall names (#149001)" This partially reverts commit a96121089b9c94e08c6632f91f2dffc73c0ffa28. Drop the IRSymtab changes for now --- llvm/include/llvm/ADT/StringTable.h | 9 ++++ llvm/include/llvm/IR/RuntimeLibcalls.h | 12 +++++ llvm/lib/IR/RuntimeLibcalls.cpp | 45 +++++++++++++++++++ .../TableGen/Basic/RuntimeLibcallsEmitter.cpp | 13 +++++- 4 files changed, 78 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/ADT/StringTable.h b/llvm/include/llvm/ADT/StringTable.h index c089a070d4b57..575b3c929e40c 100644 --- a/llvm/include/llvm/ADT/StringTable.h +++ b/llvm/include/llvm/ADT/StringTable.h @@ -118,6 +118,13 @@ class StringTable { constexpr Iterator(const Iterator &RHS) = default; constexpr Iterator(Iterator &&RHS) = default; + Iterator &operator=(const Iterator &RHS) { + Table = RHS.Table; + O = RHS.O; + S = RHS.S; + return *this; + } + bool operator==(const Iterator &RHS) const { assert(Table == RHS.Table && "Compared iterators for unrelated tables!"); return O == RHS.O; @@ -132,6 +139,8 @@ class StringTable { O = O.value() + (*Table)[O].size() + 1; return *this; } + + Offset offset() const { return O; } }; constexpr Iterator begin() const { return Iterator(*this, 0); } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 8058c8a4c5510..89ad4e5bc6ca4 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -132,6 +132,10 @@ struct RuntimeLibcallsInfo { return ImplToLibcall[Impl]; 
} + /// Check if this is valid libcall for the current module, otherwise + /// RTLIB::Unsupported. + RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const; + private: static const RTLIB::LibcallImpl DefaultLibcallImpls[RTLIB::UNKNOWN_LIBCALL + 1]; @@ -156,6 +160,14 @@ struct RuntimeLibcallsInfo { /// Map from a concrete LibcallImpl implementation to its RTLIB::Libcall kind. LLVM_ABI static const RTLIB::Libcall ImplToLibcall[RTLIB::NumLibcallImpls]; + /// Check if a function name is a recognized runtime call of any kind. This + /// does not consider if this call is available for any current compilation, + /// just that it is a known call somewhere. This returns the set of all + /// LibcallImpls which match the name; multiple implementations with the same + /// name may exist but differ in interpretation based on the target context. + LLVM_ABI static iterator_range::const_iterator> + getRecognizedLibcallImpls(StringRef FuncName); + static bool darwinHasSinCosStret(const Triple &TT) { if (!TT.isOSDarwin()) return false; diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index b1864897dafa6..5936ac7d0287f 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -135,6 +135,51 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT, } } +RTLIB::LibcallImpl +RuntimeLibcallsInfo::getSupportedLibcallImpl(StringRef FuncName) const { + const ArrayRef RuntimeLibcallNameOffsets( + RuntimeLibcallNameOffsetTable); + + iterator_range::const_iterator> Range = + getRecognizedLibcallImpls(FuncName); + + for (auto I = Range.begin(); I != Range.end(); ++I) { + RTLIB::LibcallImpl Impl = + static_cast(I - RuntimeLibcallNameOffsets.begin()); + + // FIXME: This should not depend on looking up ImplToLibcall, only the list + // of libcalls for the module. 
+ RTLIB::LibcallImpl Recognized = LibcallImpls[ImplToLibcall[Impl]]; + if (Recognized != RTLIB::Unsupported) + return Recognized; + } + + return RTLIB::Unsupported; +} + +iterator_range::const_iterator> +RuntimeLibcallsInfo::getRecognizedLibcallImpls(StringRef FuncName) { + StringTable::Iterator It = lower_bound(RuntimeLibcallImplNameTable, FuncName); + if (It == RuntimeLibcallImplNameTable.end() || *It != FuncName) + return iterator_range(ArrayRef()); + + uint16_t IndexVal = It.offset().value(); + const ArrayRef TableRef(RuntimeLibcallNameOffsetTable); + + ArrayRef::const_iterator E = TableRef.end(); + ArrayRef::const_iterator EntriesBegin = + std::lower_bound(TableRef.begin(), E, IndexVal); + ArrayRef::const_iterator EntriesEnd = EntriesBegin; + + while (EntriesEnd != E && *EntriesEnd == IndexVal) + ++EntriesEnd; + + assert(EntriesBegin != E && + "libcall found in name table but not offset table"); + + return make_range(EntriesBegin, EntriesEnd); +} + bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { switch (TT.getOS()) { case Triple::MacOSX: diff --git a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp index 652bea9dc7f65..7f90d6b4fdacc 100644 --- a/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp +++ b/llvm/utils/TableGen/Basic/RuntimeLibcallsEmitter.cpp @@ -236,8 +236,19 @@ class RuntimeLibcallEmitter { for (RuntimeLibcall &LibCall : RuntimeLibcallDefList) Def2RuntimeLibcall[LibCall.getDef()] = &LibCall; - ArrayRef AllRuntimeLibcallImpls = + ArrayRef AllRuntimeLibcallImplsRaw = Records.getAllDerivedDefinitions("RuntimeLibcallImpl"); + + SmallVector AllRuntimeLibcallImpls( + AllRuntimeLibcallImplsRaw); + + // Sort by libcall impl name, not the enum name. This keeps the order + // suitable for using the name table for libcall recognition binary search. 
+ llvm::sort(AllRuntimeLibcallImpls, [](const Record *A, const Record *B) { + return A->getValueAsString("LibCallFuncName") < + B->getValueAsString("LibCallFuncName"); + }); + RuntimeLibcallImplDefList.reserve(AllRuntimeLibcallImpls.size()); size_t LibCallImplEnumVal = 1; From 3bb4355bb83692d9c859043076db16baa86431e1 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 18 Jul 2025 11:10:57 +0200 Subject: [PATCH 318/813] [clang][bytecode] Report mutable reads when copying unions (#149320) --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 ++ clang/lib/AST/ByteCode/InterpFrame.cpp | 5 +++++ clang/test/AST/ByteCode/unions.cpp | 14 ++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 9ce1e380bff2c..462b9a11e0a5c 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2905,6 +2905,8 @@ static bool copyRecord(InterpState &S, CodePtr OpPC, const Pointer &Src, if (!copyField(F, /*Activate=*/true)) return false; } else { + if (!CheckMutable(S, OpPC, Src.atField(F.Offset))) + return false; Pointer DestField = Dest.atField(F.Offset); zeroAll(DestField); } diff --git a/clang/lib/AST/ByteCode/InterpFrame.cpp b/clang/lib/AST/ByteCode/InterpFrame.cpp index a5a4bd25fe712..d62a4f6275b50 100644 --- a/clang/lib/AST/ByteCode/InterpFrame.cpp +++ b/clang/lib/AST/ByteCode/InterpFrame.cpp @@ -128,6 +128,11 @@ static bool shouldSkipInBacktrace(const Function *F) { if (FD->getDeclName().getCXXOverloadedOperator() == OO_New || FD->getDeclName().getCXXOverloadedOperator() == OO_Array_New) return true; + + if (const auto *MD = dyn_cast(FD); + MD && MD->getParent()->isAnonymousStructOrUnion()) + return true; + return false; } diff --git a/clang/test/AST/ByteCode/unions.cpp b/clang/test/AST/ByteCode/unions.cpp index 0fa44a259a4ff..7cfd0d677a7b3 100644 --- a/clang/test/AST/ByteCode/unions.cpp +++ b/clang/test/AST/ByteCode/unions.cpp @@ -847,6 +847,20 
@@ namespace Activation2 { } static_assert(change_member_indirectly() == 4); } + +namespace CopyCtorMutable { + struct E { + union { // expected-note {{read of mutable member 'b'}} + int a; + mutable int b; // both-note {{here}} + }; + }; + constexpr E e1 = {{1}}; + constexpr E e2 = e1; // both-error {{constant}} \ + // ref-note {{read of mutable member 'b'}} \ + // both-note {{in call}} +} + #endif namespace AddressComparison { From b7660a54157fd45e6276acf35176851196f5df71 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 18 Jul 2025 11:20:48 +0200 Subject: [PATCH 319/813] [clang][bytecode] Fix const-in-mutable fields (#149286) For mutable and const fields, we have two bits in InlineDescriptor, which both get inherited down the hierarchy. When a field is both const and mutable, we CAN read from it if it is a mutable-in-const field, but we _can't_ read from it if it is a const-in-mutable field. We need another bit to distinguish the two cases. --- clang/lib/AST/ByteCode/Descriptor.cpp | 2 + clang/lib/AST/ByteCode/Descriptor.h | 4 ++ clang/lib/AST/ByteCode/Disasm.cpp | 1 + clang/lib/AST/ByteCode/Interp.cpp | 5 ++- clang/lib/AST/ByteCode/Pointer.h | 5 +++ clang/test/AST/ByteCode/mutable.cpp | 56 +++++++++++++++++++++++---- 6 files changed, 64 insertions(+), 9 deletions(-) diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp index c89eca9bef440..5b9f44518fcc2 100644 --- a/clang/lib/AST/ByteCode/Descriptor.cpp +++ b/clang/lib/AST/ByteCode/Descriptor.cpp @@ -162,6 +162,8 @@ static void initField(Block *B, std::byte *Ptr, bool IsConst, bool IsMutable, Desc->IsConst = IsConst || D->IsConst; Desc->IsFieldMutable = IsMutable || D->IsMutable; Desc->IsVolatile = IsVolatile || D->IsVolatile; + // True if this field is const AND the parent is mutable. 
+ Desc->IsConstInMutable = Desc->IsConst && IsMutable; if (auto Fn = D->CtorFn) Fn(B, Ptr + FieldOffset, Desc->IsConst, Desc->IsFieldMutable, diff --git a/clang/lib/AST/ByteCode/Descriptor.h b/clang/lib/AST/ByteCode/Descriptor.h index 4591eabb69bb4..0227e4c0c7e38 100644 --- a/clang/lib/AST/ByteCode/Descriptor.h +++ b/clang/lib/AST/ByteCode/Descriptor.h @@ -101,6 +101,10 @@ struct InlineDescriptor { /// Flag indicating if the field is mutable (if in a record). LLVM_PREFERRED_TYPE(bool) unsigned IsFieldMutable : 1; + /// Flag indicating if this field is a const field nested in + /// a mutable parent field. + LLVM_PREFERRED_TYPE(bool) + unsigned IsConstInMutable : 1; /// Flag indicating if the field is an element of a composite array. LLVM_PREFERRED_TYPE(bool) unsigned IsArrayElement : 1; diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index f64501f4a31e8..74399d177b5a2 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -445,6 +445,7 @@ LLVM_DUMP_METHOD void InlineDescriptor::dump(llvm::raw_ostream &OS) const { OS << "InUnion: " << InUnion << "\n"; OS << "IsFieldMutable: " << IsFieldMutable << "\n"; OS << "IsArrayElement: " << IsArrayElement << "\n"; + OS << "IsConstInMutable: " << IsConstInMutable << '\n'; OS << "Desc: "; if (Desc) Desc->dump(OS); diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index e8b519478c026..df5e3be83d741 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -566,7 +566,10 @@ bool CheckDowncast(InterpState &S, CodePtr OpPC, const Pointer &Ptr, bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { assert(Ptr.isLive() && "Pointer is not live"); - if (!Ptr.isConst() || Ptr.isMutable()) + if (!Ptr.isConst()) + return true; + + if (Ptr.isMutable() && !Ptr.isConstInMutable()) return true; if (!Ptr.isBlockPointer()) diff --git a/clang/lib/AST/ByteCode/Pointer.h 
b/clang/lib/AST/ByteCode/Pointer.h index e6a64e6658f06..da74013cf83a6 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -576,6 +576,11 @@ class Pointer { return true; return isRoot() ? getDeclDesc()->IsConst : getInlineDesc()->IsConst; } + bool isConstInMutable() const { + if (!isBlockPointer()) + return false; + return isRoot() ? false : getInlineDesc()->IsConstInMutable; + } /// Checks if an object or a subfield is volatile. bool isVolatile() const { diff --git a/clang/test/AST/ByteCode/mutable.cpp b/clang/test/AST/ByteCode/mutable.cpp index aebbea920578c..35c5a0389921e 100644 --- a/clang/test/AST/ByteCode/mutable.cpp +++ b/clang/test/AST/ByteCode/mutable.cpp @@ -1,11 +1,7 @@ -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++11 -verify=expected,expected11,both,both11 %s -// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify=expected,expected14,both %s -// RUN: %clang_cc1 -std=c++11 -verify=ref,ref11,both,both11 %s -// RUN: %clang_cc1 -std=c++14 -verify=ref,ref14,both %s - - - - +// RUN: %clang_cc1 -std=c++11 -verify=expected,expected11,both,both11 %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++14 -verify=expected,expected14,both %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -std=c++11 -verify=ref,ref11,both,both11 %s +// RUN: %clang_cc1 -std=c++14 -verify=ref,ref14,both %s namespace Simple { struct S { @@ -26,3 +22,47 @@ namespace Simple { static_assert(s2.a2 == 12, ""); // both11-error {{not an integral constant expression}} \ // both11-note {{initializer of 's2' is not a constant expression}} } +#if __cplusplus >= 201402L +namespace ConstInMutable { + class B { + public: + + const int f; + constexpr B() : f(12) {} + }; + class A { + public: + mutable B b; + constexpr A() = default; + }; + constexpr int constInMutable() { + A a; + + int *m = (int*)&a.b.f; + *m = 12; // both-note {{modification of object of const-qualified type 'const int' is 
not allowed in a constant expression}} + return 1; + } + static_assert(constInMutable() == 1, ""); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} +} + +namespace MutableInConst { + class C { + public: + mutable int c; + constexpr C() : c(50) {} + }; + class D { + public: + C c; + constexpr D() {} + }; + constexpr int mutableInConst() { + const D d{}; + int *m = (int*)&d.c.c; + *m = 12; + return 1; + } + static_assert(mutableInConst() == 1, ""); +} +#endif From b5e3fffd20a72d3451e31ac37ca4930014044cd0 Mon Sep 17 00:00:00 2001 From: Nicholas Guy Date: Fri, 18 Jul 2025 10:21:21 +0100 Subject: [PATCH 320/813] [LoopVectorizer][NFC] Require asserts on maxbandwidth-regpressure.ll (#149484) Fix for buildbot failure: https://lab.llvm.org/buildbot/#/builders/11/builds/19837 --- .../Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll index ce639f9150078..2d1543185098f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/maxbandwidth-regpressure.ll @@ -1,3 +1,4 @@ +; REQUIRES: asserts ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-REGS-VP ; RUN: opt -passes=loop-vectorize -vectorizer-maximize-bandwidth -debug-only=loop-vectorize -disable-output -force-target-num-vector-regs=1 -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-NOREGS-VP From ee8756e8551bc9ae5bf60e1ff16abaa95d61c234 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez <11032120+lucas-rami@users.noreply.github.com> Date: Fri, 18 Jul 2025 11:28:13 +0200 Subject: [PATCH 
321/813] [LLVM] Make `MachineBlockFrequencyInfo`'s constructor arguments const (NFC) (#149279) This avoids having to call `MachineBlockFrequencyInfo::calculate` manually if one of the parameters is const. --- llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h | 4 ++-- llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h index aef91909dd17b..794075201d646 100644 --- a/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h +++ b/llvm/include/llvm/CodeGen/MachineBlockFrequencyInfo.h @@ -39,8 +39,8 @@ class MachineBlockFrequencyInfo { public: LLVM_ABI MachineBlockFrequencyInfo(); // Legacy pass manager only. LLVM_ABI explicit MachineBlockFrequencyInfo( - MachineFunction &F, MachineBranchProbabilityInfo &MBPI, - MachineLoopInfo &MLI); + const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI, + const MachineLoopInfo &MLI); LLVM_ABI MachineBlockFrequencyInfo(MachineBlockFrequencyInfo &&); LLVM_ABI ~MachineBlockFrequencyInfo(); diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 9daacfd399787..e7fa0824fd98a 100644 --- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -202,8 +202,8 @@ MachineBlockFrequencyInfo::MachineBlockFrequencyInfo( MachineBlockFrequencyInfo &&) = default; MachineBlockFrequencyInfo::MachineBlockFrequencyInfo( - MachineFunction &F, MachineBranchProbabilityInfo &MBPI, - MachineLoopInfo &MLI) { + const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI, + const MachineLoopInfo &MLI) { calculate(F, MBPI, MLI); } From 9e0c06d708a40bb3c8bd08acd982836cce718135 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Fri, 18 Jul 2025 10:59:42 +0200 Subject: [PATCH 322/813] [clang][CodeGen] Set `dead_on_return` when passing arguments indirectly Let 
Clang emit `dead_on_return` attribute on pointer arguments that are passed indirectly, namely, large aggregates that the ABI mandates be passed by value; thus, the parameter is destroyed within the callee. Writes to such arguments are not observable by the caller after the callee returns. This should desirably enable further MemCpyOpt/DSE optimizations. Previous discussion: https://discourse.llvm.org/t/rfc-add-dead-on-return-attribute/86871. --- clang/lib/CodeGen/CGCall.cpp | 15 +- clang/test/CodeGen/64bit-swiftcall.c | 2 +- clang/test/CodeGen/AArch64/byval-temp.c | 16 +- .../AArch64/pure-scalable-args-empty-union.c | 2 +- .../test/CodeGen/AArch64/pure-scalable-args.c | 42 ++-- .../AArch64/struct-coerce-using-ptr.cpp | 4 +- ...-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c | 6 +- ...cle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp | 2 +- clang/test/CodeGen/LoongArch/bitint.c | 6 +- clang/test/CodeGen/PowerPC/ppc64-vector.c | 2 +- clang/test/CodeGen/RISCV/riscv-abi.cpp | 8 +- .../RISCV/riscv-vector-callingconv-llvm-ir.c | 10 +- .../riscv-vector-callingconv-llvm-ir.cpp | 10 +- clang/test/CodeGen/RISCV/riscv32-abi.c | 74 +++--- clang/test/CodeGen/RISCV/riscv32-vararg.c | 2 +- clang/test/CodeGen/RISCV/riscv64-abi.c | 18 +- clang/test/CodeGen/RISCV/riscv64-vararg.c | 2 +- .../test/CodeGen/SystemZ/systemz-abi-vector.c | 52 ++-- clang/test/CodeGen/SystemZ/systemz-abi.c | 38 +-- .../test/CodeGen/SystemZ/systemz-inline-asm.c | 2 +- clang/test/CodeGen/X86/cx-complex-range.c | 2 +- .../test/CodeGen/X86/x86_32-arguments-win32.c | 14 +- .../test/CodeGen/X86/x86_64-arguments-win32.c | 2 +- clang/test/CodeGen/aapcs64-align.cpp | 4 +- clang/test/CodeGen/arm-aapcs-vfp.c | 2 +- clang/test/CodeGen/arm-abi-vector.c | 6 +- clang/test/CodeGen/arm-swiftcall.c | 2 +- clang/test/CodeGen/arm64-abi-vector.c | 14 +- clang/test/CodeGen/arm64-arguments.c | 26 +- .../CodeGen/arm64-microsoft-arguments.cpp | 2 +- clang/test/CodeGen/armv7k-abi.c | 2 +- clang/test/CodeGen/atomic-arm64.c | 2 +- 
clang/test/CodeGen/attr-noundef.cpp | 7 +- clang/test/CodeGen/cx-complex-range.c | 36 +-- clang/test/CodeGen/ext-int-cc.c | 36 +-- clang/test/CodeGen/isfpclass.c | 2 +- .../math-libcalls-tbaa-indirect-args.c | 86 +++---- clang/test/CodeGen/mingw-long-double.c | 6 +- clang/test/CodeGen/ms_abi.c | 4 +- clang/test/CodeGen/pass-by-value-noalias.c | 4 +- clang/test/CodeGen/ptrauth-in-c-struct.c | 4 +- clang/test/CodeGen/regcall.c | 10 +- clang/test/CodeGen/regcall2.c | 2 +- clang/test/CodeGen/regcall4.c | 10 +- clang/test/CodeGen/sparcv9-abi.c | 4 +- clang/test/CodeGen/vectorcall.c | 46 ++-- clang/test/CodeGen/win-fp128.c | 2 +- clang/test/CodeGen/win64-i128.c | 4 +- clang/test/CodeGen/windows-swiftcall.c | 2 +- .../CodeGenCXX/aarch64-mangle-sve-vectors.cpp | 4 +- clang/test/CodeGenCXX/arm-cc.cpp | 2 +- .../CodeGenCXX/attr-target-mv-inalloca.cpp | 8 +- clang/test/CodeGenCXX/copy-initialization.cpp | 2 +- clang/test/CodeGenCXX/debug-info.cpp | 2 +- .../empty-nontrivially-copyable.cpp | 4 +- clang/test/CodeGenCXX/fastcall.cpp | 2 +- .../CodeGenCXX/homogeneous-aggregates.cpp | 14 +- clang/test/CodeGenCXX/inalloca-lambda.cpp | 6 +- .../test/CodeGenCXX/inalloca-overaligned.cpp | 8 +- clang/test/CodeGenCXX/inalloca-vector.cpp | 4 +- .../CodeGenCXX/inheriting-constructor.cpp | 8 +- .../member-function-pointer-calls.cpp | 4 +- .../CodeGenCXX/microsoft-abi-arg-order.cpp | 4 +- .../CodeGenCXX/microsoft-abi-byval-thunks.cpp | 12 +- .../microsoft-abi-member-pointers.cpp | 6 +- .../microsoft-abi-sret-and-byval.cpp | 26 +- .../CodeGenCXX/microsoft-abi-unknown-arch.cpp | 2 +- clang/test/CodeGenCXX/ms-property.cpp | 4 +- clang/test/CodeGenCXX/nrvo.cpp | 4 +- .../test/CodeGenCXX/pass-by-value-noalias.cpp | 12 +- .../CodeGenCXX/ptrauth-qualifier-struct.cpp | 2 +- clang/test/CodeGenCXX/regparm.cpp | 2 +- clang/test/CodeGenCXX/trivial_abi.cpp | 8 +- clang/test/CodeGenCXX/uncopyable-args.cpp | 32 +-- clang/test/CodeGenCXX/wasm-args-returns.cpp | 12 +- 
.../test/CodeGenCXX/windows-x86-swiftcall.cpp | 4 +- .../nontrivial-c-struct-exception.m | 2 +- .../test/CodeGenObjC/pass-by-value-noalias.m | 4 +- clang/test/CodeGenObjC/weak-in-c-struct.m | 6 +- .../test/CodeGenObjCXX/objc-struct-cxx-abi.mm | 12 +- clang/test/CodeGenObjCXX/property-objects.mm | 2 +- .../CodeGenObjCXX/ptrauth-struct-cxx-abi.mm | 2 +- clang/test/Headers/stdarg.cpp | 4 +- .../test/OpenMP/for_firstprivate_codegen.cpp | 30 +-- .../OpenMP/parallel_firstprivate_codegen.cpp | 216 ++++++++--------- .../OpenMP/sections_firstprivate_codegen.cpp | 14 +- .../OpenMP/single_firstprivate_codegen.cpp | 14 +- ..._teams_distribute_firstprivate_codegen.cpp | 82 +++---- ...bute_parallel_for_firstprivate_codegen.cpp | 224 +++++++++--------- .../teams_distribute_firstprivate_codegen.cpp | 90 +++---- ...bute_parallel_for_firstprivate_codegen.cpp | 124 +++++----- .../OpenMP/teams_firstprivate_codegen.cpp | 80 +++---- 92 files changed, 887 insertions(+), 873 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index c8c3d6b20c496..19d8ba26d44d8 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -2852,8 +2852,21 @@ void CodeGenModule::ConstructAttributeList(StringRef Name, if (AI.getInReg()) Attrs.addAttribute(llvm::Attribute::InReg); - if (AI.getIndirectByVal()) + // Depending on the ABI, this may be either a byval or a dead_on_return + // argument. + if (AI.getIndirectByVal()) { Attrs.addByValAttr(getTypes().ConvertTypeForMem(ParamType)); + } else { + // Add dead_on_return when the object's lifetime ends in the callee. + // This includes trivially-destructible objects, as well as objects + // whose destruction / clean-up is carried out within the callee (e.g., + // Obj-C ARC-managed structs, MSVC callee-destroyed objects). 
+ if (!ParamType.isDestructedType() || !ParamType->isRecordType() || + ParamType->castAs() + ->getDecl() + ->isParamDestroyedInCallee()) + Attrs.addAttribute(llvm::Attribute::DeadOnReturn); + } auto *Decl = ParamType->getAsRecordDecl(); if (CodeGenOpts.PassByValueIsNoAlias && Decl && diff --git a/clang/test/CodeGen/64bit-swiftcall.c b/clang/test/CodeGen/64bit-swiftcall.c index 7f8aa02d97ce1..448bca7acbca3 100644 --- a/clang/test/CodeGen/64bit-swiftcall.c +++ b/clang/test/CodeGen/64bit-swiftcall.c @@ -239,7 +239,7 @@ TEST(struct_big_1) // CHECK-LABEL: define {{.*}} void @return_struct_big_1(ptr dead_on_unwind noalias writable sret // Should not be byval. -// CHECK-LABEL: define {{.*}} void @take_struct_big_1(ptr{{( %.*)?}}) +// CHECK-LABEL: define {{.*}} void @take_struct_big_1(ptr dead_on_return{{( %.*)?}}) /*****************************************************************************/ /********************************* TYPE MERGING ******************************/ diff --git a/clang/test/CodeGen/AArch64/byval-temp.c b/clang/test/CodeGen/AArch64/byval-temp.c index 0ee0312b2362d..5033b6cf5ac03 100644 --- a/clang/test/CodeGen/AArch64/byval-temp.c +++ b/clang/test/CodeGen/AArch64/byval-temp.c @@ -30,10 +30,10 @@ void example(void) { // Then, memcpy `l` to the temporary stack space. // CHECK-O0-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[byvaltemp]], ptr align 8 %[[l]], i64 64, i1 false) // Finally, call using a pointer to the temporary stack space. -// CHECK-O0-NEXT: call void @pass_large(ptr noundef %[[byvaltemp]]) +// CHECK-O0-NEXT: call void @pass_large(ptr dead_on_return noundef %[[byvaltemp]]) // Now, do the same for the second call, using the second temporary alloca. 
// CHECK-O0-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[byvaltemp1]], ptr align 8 %[[l]], i64 64, i1 false) -// CHECK-O0-NEXT: call void @pass_large(ptr noundef %[[byvaltemp1]]) +// CHECK-O0-NEXT: call void @pass_large(ptr dead_on_return noundef %[[byvaltemp1]]) // CHECK-O0-NEXT: ret void // // At O3, we should have lifetime markers to help the optimizer re-use the temporary allocas. @@ -58,7 +58,7 @@ void example(void) { // Then, memcpy `l` to the temporary stack space. // CHECK-O3-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[byvaltemp]], ptr align 8 %[[l]], i64 64, i1 false) // Finally, call using a pointer to the temporary stack space. -// CHECK-O3-NEXT: call void @pass_large(ptr noundef %[[byvaltemp]]) +// CHECK-O3-NEXT: call void @pass_large(ptr dead_on_return noundef %[[byvaltemp]]) // // The lifetime of the temporary used to pass a pointer to the struct ends here. // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 64, ptr %[[byvaltemp]]) @@ -66,7 +66,7 @@ void example(void) { // Now, do the same for the second call, using the second temporary alloca. // CHECK-O3-NEXT: call void @llvm.lifetime.start.p0(i64 64, ptr %[[byvaltemp1]]) // CHECK-O3-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[byvaltemp1]], ptr align 8 %[[l]], i64 64, i1 false) -// CHECK-O3-NEXT: call void @pass_large(ptr noundef %[[byvaltemp1]]) +// CHECK-O3-NEXT: call void @pass_large(ptr dead_on_return noundef %[[byvaltemp1]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 64, ptr %[[byvaltemp1]]) // // Mark the end of the lifetime of `l`. 
@@ -88,12 +88,12 @@ void example_BitInt(void) { // CHECK-O0-NEXT: [[LOADEDV:%.*]] = trunc i256 [[TMP0]] to i129 // CHECK-O0-NEXT: [[STOREDV:%.*]] = sext i129 [[LOADEDV]] to i256 // CHECK-O0-NEXT: store i256 [[STOREDV]], ptr [[INDIRECT_ARG_TEMP]], align 16 -// CHECK-O0-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP]]) +// CHECK-O0-NEXT: call void @pass_large_BitInt(ptr dead_on_return noundef [[INDIRECT_ARG_TEMP]]) // CHECK-O0-NEXT: [[TMP1:%.*]] = load i256, ptr [[L]], align 16 // CHECK-O0-NEXT: [[LOADEDV1:%.*]] = trunc i256 [[TMP1]] to i129 // CHECK-O0-NEXT: [[STOREDV1:%.*]] = sext i129 [[LOADEDV1]] to i256 // CHECK-O0-NEXT: store i256 [[STOREDV1]], ptr [[INDIRECT_ARG_TEMP1]], align 16 -// CHECK-O0-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP1]]) +// CHECK-O0-NEXT: call void @pass_large_BitInt(ptr dead_on_return noundef [[INDIRECT_ARG_TEMP1]]) // CHECK-O0-NEXT: ret void // // CHECK-O3-LABEL: define dso_local void @example_BitInt( @@ -108,13 +108,13 @@ void example_BitInt(void) { // CHECK-O3-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[INDIRECT_ARG_TEMP]]) // CHECK-O3-NEXT: [[STOREDV:%.*]] = sext i129 [[LOADEDV]] to i256 // CHECK-O3-NEXT: store i256 [[STOREDV]], ptr [[INDIRECT_ARG_TEMP]], align 16, !tbaa [[TBAA6]] -// CHECK-O3-NEXT: call void @pass_large_BitInt(ptr noundef [[INDIRECT_ARG_TEMP]]) +// CHECK-O3-NEXT: call void @pass_large_BitInt(ptr dead_on_return noundef [[INDIRECT_ARG_TEMP]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[INDIRECT_ARG_TEMP]]) // CHECK-O3-NEXT: [[TMP1:%.*]] = load i256, ptr [[L]], align 16, !tbaa [[TBAA6]] // CHECK-O3-NEXT: [[LOADEDV1:%.*]] = trunc i256 [[TMP1]] to i129 // CHECK-O3-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[INDIRECT_ARG_TEMP1]]) // CHECK-O3-NEXT: [[STOREDV1:%.*]] = sext i129 [[LOADEDV1]] to i256 // CHECK-O3-NEXT: store i256 [[STOREDV1]], ptr [[INDIRECT_ARG_TEMP1]], align 16, !tbaa [[TBAA6]] -// CHECK-O3-NEXT: call void @pass_large_BitInt(ptr 
noundef [[INDIRECT_ARG_TEMP1]]) +// CHECK-O3-NEXT: call void @pass_large_BitInt(ptr dead_on_return noundef [[INDIRECT_ARG_TEMP1]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[INDIRECT_ARG_TEMP1]]) // CHECK-O3-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[L]]) diff --git a/clang/test/CodeGen/AArch64/pure-scalable-args-empty-union.c b/clang/test/CodeGen/AArch64/pure-scalable-args-empty-union.c index 546910068c78a..804e14a2ea34b 100644 --- a/clang/test/CodeGen/AArch64/pure-scalable-args-empty-union.c +++ b/clang/test/CodeGen/AArch64/pure-scalable-args-empty-union.c @@ -19,7 +19,7 @@ void f0(S0 *p) { use0(*p); } // CHECK-C: declare void @use0(, , , ) -// CHECK-CXX: declare void @use0(ptr noundef) +// CHECK-CXX: declare void @use0(ptr dead_on_return noundef) #ifdef __cplusplus diff --git a/clang/test/CodeGen/AArch64/pure-scalable-args.c b/clang/test/CodeGen/AArch64/pure-scalable-args.c index fecd370d09be3..48988f7a1722b 100644 --- a/clang/test/CodeGen/AArch64/pure-scalable-args.c +++ b/clang/test/CodeGen/AArch64/pure-scalable-args.c @@ -92,7 +92,7 @@ void test_argpass_simple(PST *p) { // CHECK-AAPCS-NEXT: ret void // CHECK-AAPCS: declare void @argpass_simple_callee(, , , , , ) -// CHECK-DARWIN: declare void @argpass_simple_callee(ptr noundef) +// CHECK-DARWIN: declare void @argpass_simple_callee(ptr dead_on_return noundef) // Boundary case of using the last available Z-reg, PST expanded. 
// 0.0 -> d0-d3 @@ -107,7 +107,7 @@ void test_argpass_last_z(PST *p) { argpass_last_z_callee(.0, .0, .0, .0, *p); } // CHECK-AAPCS: declare void @argpass_last_z_callee(double noundef, double noundef, double noundef, double noundef, , , , , , ) -// CHECK-DARWIN: declare void @argpass_last_z_callee(double noundef, double noundef, double noundef, double noundef, ptr noundef) +// CHECK-DARWIN: declare void @argpass_last_z_callee(double noundef, double noundef, double noundef, double noundef, ptr dead_on_return noundef) // Like the above, but using a tuple type to occupy some registers. @@ -123,7 +123,7 @@ void test_argpass_last_z_tuple(PST *p, svfloat64x4_t x) { argpass_last_z_tuple_callee(x, *p); } // CHECK-AAPCS: declare void @argpass_last_z_tuple_callee(, , , , , , , , , ) -// CHECK-DARWIN: declare void @argpass_last_z_tuple_callee(, , , , ptr noundef) +// CHECK-DARWIN: declare void @argpass_last_z_tuple_callee(, , , , ptr dead_on_return noundef) // Boundary case of using the last available P-reg, PST expanded. 
@@ -139,7 +139,7 @@ void test_argpass_last_p(PST *p) { argpass_last_p_callee(svpfalse(), svpfalse_c(), *p); } // CHECK-AAPCS: declare void @argpass_last_p_callee(, target("aarch64.svcount"), , , , , , ) -// CHECK-DARWIN: declare void @argpass_last_p_callee(, target("aarch64.svcount"), ptr noundef) +// CHECK-DARWIN: declare void @argpass_last_p_callee(, target("aarch64.svcount"), ptr dead_on_return noundef) // Not enough Z-regs, push PST to memory and pass a pointer, Z-regs and @@ -157,7 +157,7 @@ void test_argpass_no_z(PST *p, double dummy, svmfloat8_t u, int8x16_t v, mfloat8 void argpass_no_z_callee(svmfloat8_t, int8x16_t, mfloat8x16_t, double, double, int, PST, int, double, svbool_t); argpass_no_z_callee(u, v, w, .0, .0, 1, *p, 2, 3.0, svptrue_b64()); } -// CHECK: declare void @argpass_no_z_callee(, <16 x i8> noundef, <16 x i8>, double noundef, double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, ) +// CHECK: declare void @argpass_no_z_callee(, <16 x i8> noundef, <16 x i8>, double noundef, double noundef, i32 noundef, ptr dead_on_return noundef, i32 noundef, double noundef, ) // Like the above, using a tuple to occupy some registers. @@ -173,7 +173,7 @@ void test_argpass_no_z_tuple_f64(PST *p, float dummy, svfloat64x4_t x) { double, svbool_t); argpass_no_z_tuple_f64_callee(x, .0, 1, *p, 2, 3.0, svptrue_b64()); } -// CHECK: declare void @argpass_no_z_tuple_f64_callee(, , , , double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, ) +// CHECK: declare void @argpass_no_z_tuple_f64_callee(, , , , double noundef, i32 noundef, ptr dead_on_return noundef, i32 noundef, double noundef, ) // Likewise, using a different tuple. 
@@ -189,7 +189,7 @@ void test_argpass_no_z_tuple_mfp8(PST *p, float dummy, svmfloat8x4_t x) { double, svbool_t); argpass_no_z_tuple_mfp8_callee(x, .0, 1, *p, 2, 3.0, svptrue_b64()); } -// CHECK: declare void @argpass_no_z_tuple_mfp8_callee(, , , , double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, ) +// CHECK: declare void @argpass_no_z_tuple_mfp8_callee(, , , , double noundef, i32 noundef, ptr dead_on_return noundef, i32 noundef, double noundef, ) // Not enough Z-regs (consumed by a HFA), PST passed indirectly @@ -204,8 +204,8 @@ void test_argpass_no_z_hfa(HFA *h, PST *p) { void argpass_no_z_hfa_callee(double, HFA, int, PST, int, svbool_t); argpass_no_z_hfa_callee(.0, *h, 1, *p, 2, svptrue_b64()); } -// CHECK-AAPCS: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float] alignstack(8), i32 noundef, ptr noundef, i32 noundef, ) -// CHECK-DARWIN: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float], i32 noundef, ptr noundef, i32 noundef, ) +// CHECK-AAPCS: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float] alignstack(8), i32 noundef, ptr dead_on_return noundef, i32 noundef, ) +// CHECK-DARWIN: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float], i32 noundef, ptr dead_on_return noundef, i32 noundef, ) // Not enough Z-regs (consumed by a HVA), PST passed indirectly // 0.0 -> d0 @@ -219,8 +219,8 @@ void test_argpass_no_z_hva(HVA *h, PST *p) { void argpass_no_z_hva_callee(double, HVA, int, PST, int, svbool_t); argpass_no_z_hva_callee(.0, *h, 1, *p, 2, svptrue_b64()); } -// CHECK-AAPCS: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>] alignstack(16), i32 noundef, ptr noundef, i32 noundef, ) -// CHECK-DARWIN: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>], i32 noundef, ptr noundef, i32 noundef, ) +// CHECK-AAPCS: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>] alignstack(16), i32 noundef, ptr dead_on_return noundef, i32 noundef, ) +// 
CHECK-DARWIN: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>], i32 noundef, ptr dead_on_return noundef, i32 noundef, ) // Not enough P-regs, PST passed indirectly, Z-regs and P-regs still available. // true -> p0-p2 @@ -233,7 +233,7 @@ void test_argpass_no_p(PST *p) { void argpass_no_p_callee(svbool_t, svbool_t, svbool_t, int, PST, int, double, svbool_t); argpass_no_p_callee(svptrue_b8(), svptrue_b16(), svptrue_b32(), 1, *p, 2, 3.0, svptrue_b64()); } -// CHECK: declare void @argpass_no_p_callee(, , , i32 noundef, ptr noundef, i32 noundef, double noundef, ) +// CHECK: declare void @argpass_no_p_callee(, , , i32 noundef, ptr dead_on_return noundef, i32 noundef, double noundef, ) // Like above, using a tuple to occupy some registers. @@ -250,7 +250,7 @@ void test_argpass_no_p_tuple(PST *p, svbool_t u, svboolx2_t v) { svbool_t); argpass_no_p_tuple_callee(v, u, 1, *p, 2, 3.0, svptrue_b64()); } -// CHECK: declare void @argpass_no_p_tuple_callee(, , , i32 noundef, ptr noundef, i32 noundef, double noundef, ) +// CHECK: declare void @argpass_no_p_tuple_callee(, , , i32 noundef, ptr dead_on_return noundef, i32 noundef, double noundef, ) // HFAs go back-to-back to memory, afterwards Z-regs not available, PST passed indirectly. 
@@ -263,8 +263,8 @@ void test_after_hfa(HFA *h, PST *p) { void after_hfa_callee(double, double, double, double, double, HFA, PST, HFA, svbool_t); after_hfa_callee(.0, .0, .0, .0, .0, *h, *p, *h, svpfalse()); } -// CHECK-AAPCS: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float] alignstack(8), ptr noundef, [4 x float] alignstack(8), ) -// CHECK-DARWIN: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float], ptr noundef, [4 x float], ) +// CHECK-AAPCS: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float] alignstack(8), ptr dead_on_return noundef, [4 x float] alignstack(8), ) +// CHECK-DARWIN: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float], ptr dead_on_return noundef, [4 x float], ) // Small PST, not enough registers, passed indirectly, unlike other small // aggregates. 
@@ -277,7 +277,7 @@ void test_small_pst(SmallPST *p, SmallAgg *s) { void small_pst_callee(SmallAgg, double, double, double, double, double, double, double, double, double, SmallPST, double); small_pst_callee(*s, .0, .0, .0, .0, .0, .0, .0, .0, 1.0, *p, 2.0); } -// CHECK-AAPCS: declare void @small_pst_callee([2 x i64], double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, ptr noundef, double noundef) +// CHECK-AAPCS: declare void @small_pst_callee([2 x i64], double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, ptr dead_on_return noundef, double noundef) // CHECK-DARWIN: declare void @small_pst_callee([2 x i64], double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, i128, double noundef) @@ -326,12 +326,12 @@ void test_pass_variadic(PST *p, PST *q) { pass_variadic_callee(*p, *q); } // CHECK-AAPCS: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp, ptr noundef nonnull align 16 dereferenceable(96) %q, i64 96, i1 false) -// CHECK-AAPCS: call void (, , , , , , ...) @pass_variadic_callee( %1, %cast.scalable1, %cast.scalable2, %cast.scalable3, %cast.scalable4, %12, ptr noundef nonnull %byval-temp) +// CHECK-AAPCS: call void (, , , , , , ...) 
@pass_variadic_callee( %1, %cast.scalable1, %cast.scalable2, %cast.scalable3, %cast.scalable4, %12, ptr dead_on_return noundef nonnull %byval-temp) // CHECK-DARWIN: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp, ptr noundef nonnull align 16 dereferenceable(96) %p, i64 96, i1 false) // CHECK-DARWIN: call void @llvm.lifetime.start.p0(i64 96, ptr nonnull %byval-temp1) // CHECK-DARWIN: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp1, ptr noundef nonnull align 16 dereferenceable(96) %q, i64 96, i1 false) -// CHECK-DARWIN: call void (ptr, ...) @pass_variadic_callee(ptr noundef nonnull %byval-temp, ptr noundef nonnull %byval-temp1) +// CHECK-DARWIN: call void (ptr, ...) @pass_variadic_callee(ptr dead_on_return noundef nonnull %byval-temp, ptr dead_on_return noundef nonnull %byval-temp1) // Test passing a small PST, still passed indirectly, despite being <= 128 bits @@ -340,7 +340,7 @@ void test_small_pst_variadic(SmallPST *p) { small_pst_variadic_callee(0, *p); } // CHECK-AAPCS: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(16) %byval-temp, ptr noundef nonnull align 16 dereferenceable(16) %p, i64 16, i1 false) -// CHECK-AAPCS: call void (i32, ...) @small_pst_variadic_callee(i32 noundef 0, ptr noundef nonnull %byval-temp) +// CHECK-AAPCS: call void (i32, ...) @small_pst_variadic_callee(i32 noundef 0, ptr dead_on_return noundef nonnull %byval-temp) // CHECK-DARWIN: %0 = load i128, ptr %p, align 16 // CHECK-DARWIN: tail call void (i32, ...) 
@small_pst_variadic_callee(i32 noundef 0, i128 %0) @@ -467,7 +467,7 @@ void test_tuple_reg_count(svfloat32_t x, svfloat32x2_t y) { svfloat32_t, svfloat32_t, svfloat32_t, svfloat32x2_t); test_tuple_reg_count_callee(x, x, x, x, x, x, x, y); } -// CHECK-AAPCS: declare void @test_tuple_reg_count_callee(, , , , , , , ptr noundef) +// CHECK-AAPCS: declare void @test_tuple_reg_count_callee(, , , , , , , ptr dead_on_return noundef) // CHECK-DARWIN: declare void @test_tuple_reg_count_callee(, , , , , , , , ) // Regression test for incorrect passing of SVE vector tuples @@ -476,5 +476,5 @@ void test_tuple_reg_count_bool(svboolx4_t x, svboolx4_t y) { void test_tuple_reg_count_bool_callee(svboolx4_t, svboolx4_t); test_tuple_reg_count_bool_callee(x, y); } -// CHECK-AAPCS: declare void @test_tuple_reg_count_bool_callee(, , , , ptr noundef) +// CHECK-AAPCS: declare void @test_tuple_reg_count_bool_callee(, , , , ptr dead_on_return noundef) // CHECK-DARWIN: declare void @test_tuple_reg_count_bool_callee(, , , , , , , ) diff --git a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp index b1232921df363..f0c9ef28201a5 100644 --- a/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp +++ b/clang/test/CodeGen/AArch64/struct-coerce-using-ptr.cpp @@ -82,7 +82,7 @@ struct Sppp { int *x, *y, *z; }; // CHECK-A64-LABEL: define dso_local void @_Z4Tppp4Sppp( -// CHECK-A64-SAME: ptr noundef [[S:%.*]]) #[[ATTR0]] { +// CHECK-A64-SAME: ptr dead_on_return noundef [[S:%.*]]) #[[ATTR0]] { // CHECK-A64-NEXT: [[ENTRY:.*:]] // CHECK-A64-NEXT: [[S_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 // CHECK-A64-NEXT: store ptr [[S]], ptr [[S_INDIRECT_ADDR]], align 8 @@ -490,7 +490,7 @@ struct Spa3 { int* xs[3]; }; // CHECK-A64-LABEL: define dso_local void @_Z4Tpa34Spa3( -// CHECK-A64-SAME: ptr noundef [[S:%.*]]) #[[ATTR0]] { +// CHECK-A64-SAME: ptr dead_on_return noundef [[S:%.*]]) #[[ATTR0]] { // CHECK-A64-NEXT: [[ENTRY:.*:]] // CHECK-A64-NEXT: 
[[S_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 // CHECK-A64-NEXT: store ptr [[S]], ptr [[S_INDIRECT_ADDR]], align 8 diff --git a/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c b/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c index c3d0541229fac..d244a8ba88572 100644 --- a/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c +++ b/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.c @@ -58,7 +58,7 @@ typedef int8_t vec_int8 __attribute__((vector_size(N / 8))); // CHECK128-NEXT: ret <16 x i8> [[CASTFIXEDSVE]] // CHECK-LABEL: define{{.*}} void @f2( -// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<[[#div(VBITS,8)]] x i8>) align 16 captures(none) initializes((0, [[#div(VBITS,8)]])) %agg.result, ptr noundef readonly captures(none) %0) +// CHECK-SAME: ptr dead_on_unwind noalias writable writeonly sret(<[[#div(VBITS,8)]] x i8>) align 16 captures(none) initializes((0, [[#div(VBITS,8)]])) %agg.result, ptr dead_on_return noundef readonly captures(none) %0) // CHECK-NEXT: entry: // CHECK-NEXT: [[X:%.*]] = load <[[#div(VBITS,8)]] x i8>, ptr [[TMP0:%.*]], align 16, [[TBAA6:!tbaa !.*]] // CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv16i8.v[[#div(VBITS,8)]]i8( poison, <[[#div(VBITS,8)]] x i8> [[X]], i64 0) @@ -88,13 +88,13 @@ typedef svint8_t vec2 __attribute__((arm_sve_vector_bits(N))); // CHECK-NEXT: [[X:%.*]] = tail call <[[#div(VBITS,8)]] x i8> @llvm.vector.extract.v[[#div(VBITS,8)]]i8.nxv16i8( [[X_COERCE:%.*]], i64 0) // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 [[SIZE:[0-9]+]], ptr nonnull [[INDIRECT_ARG_TEMP]]) #[[ATTR6:[0-9]+]] // CHECK-NEXT: store <[[#div(VBITS,8)]] x i8> [[X]], ptr [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6]] -// CHECK-NEXT: call void @f3(ptr noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]] +// CHECK-NEXT: call void @f3(ptr dead_on_return noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]] // CHECK-NEXT: call 
void @llvm.lifetime.end.p0(i64 [[SIZE]], ptr nonnull [[INDIRECT_ARG_TEMP]]) #[[ATTR6:[0-9]+]] // CHECK-NEXT: ret void // CHECK128-LABEL: declare void @f3(<16 x i8> noundef) // CHECK-LABEL: declare void @f3( -// CHECK-SAME: ptr noundef) +// CHECK-SAME: ptr dead_on_return noundef) void g(vec2 x) { f3(x); } // OK #endif diff --git a/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp b/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp index e82069aab2486..d42ecb663050f 100644 --- a/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp +++ b/clang/test/CodeGen/AArch64/sve-acle-__ARM_FEATURE_SVE_VECTOR_OPERATORS.cpp @@ -75,7 +75,7 @@ typedef svint16_t vec2 __attribute__((arm_sve_vector_bits(N))); // CHECKWIDE-NEXT: [[X:%.*]] = tail call <[[#div(VBITS, 16)]] x i16> @llvm.vector.extract.v[[#div(VBITS, 16)]]i16.nxv8i16( [[X_COERCE:%.*]], i64 0) // CHECKWIDE-NEXT: call void @llvm.lifetime.start.p0(i64 [[SIZE:[0-9]+]], ptr nonnull [[INDIRECT_ARG_TEMP]]) #[[ATTR6:[0-9]+]] // CHECKWIDE-NEXT: store <[[#div(VBITS, 16)]] x i16> [[X]], ptr [[INDIRECT_ARG_TEMP]], align 16, [[TBAA6:!tbaa !.*]] -// CHECKWIDE-NEXT: call void @_Z1fDv[[#div(VBITS, 16)]]_s(ptr noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]] +// CHECKWIDE-NEXT: call void @_Z1fDv[[#div(VBITS, 16)]]_s(ptr dead_on_return noundef nonnull [[INDIRECT_ARG_TEMP]]) [[ATTR5:#.*]] // CHECKWIDE-NEXT: call void @llvm.lifetime.end.p0(i64 [[SIZE]], ptr nonnull [[INDIRECT_ARG_TEMP]]) #[[ATTR6:[0-9]+]] // CHECKWIDE-NEXT: ret void void g(vec2 x) { f(x); } // OK diff --git a/clang/test/CodeGen/LoongArch/bitint.c b/clang/test/CodeGen/LoongArch/bitint.c index f346f569d0eb0..950f5308e32cc 100644 --- a/clang/test/CodeGen/LoongArch/bitint.c +++ b/clang/test/CodeGen/LoongArch/bitint.c @@ -26,12 +26,12 @@ void pass_BitInt129(_BitInt(129)); // LA32-NEXT: [[LOADEDV1:%.*]] = trunc i128 [[TMP1]] to i65 // LA32-NEXT: [[STOREDV:%.*]] = sext i65 [[LOADEDV1]] to i128 // LA32-NEXT: 
store i128 [[STOREDV]], ptr [[BYVAL_TEMP]], align 16 -// LA32-NEXT: call void @pass_BitInt65(ptr noundef [[BYVAL_TEMP]]) +// LA32-NEXT: call void @pass_BitInt65(ptr dead_on_return noundef [[BYVAL_TEMP]]) // LA32-NEXT: [[TMP2:%.*]] = load i256, ptr [[L129]], align 16 // LA32-NEXT: [[LOADEDV2:%.*]] = trunc i256 [[TMP2]] to i129 // LA32-NEXT: [[STOREDV4:%.*]] = sext i129 [[LOADEDV2]] to i256 // LA32-NEXT: store i256 [[STOREDV4]], ptr [[BYVAL_TEMP3]], align 16 -// LA32-NEXT: call void @pass_BitInt129(ptr noundef [[BYVAL_TEMP3]]) +// LA32-NEXT: call void @pass_BitInt129(ptr dead_on_return noundef [[BYVAL_TEMP3]]) // LA32-NEXT: ret void // // LA64-LABEL: define dso_local void @example_BitInt( @@ -54,7 +54,7 @@ void pass_BitInt129(_BitInt(129)); // LA64-NEXT: [[LOADEDV2:%.*]] = trunc i256 [[TMP2]] to i129 // LA64-NEXT: [[STOREDV:%.*]] = sext i129 [[LOADEDV2]] to i256 // LA64-NEXT: store i256 [[STOREDV]], ptr [[BYVAL_TEMP]], align 16 -// LA64-NEXT: call void @pass_BitInt129(ptr noundef [[BYVAL_TEMP]]) +// LA64-NEXT: call void @pass_BitInt129(ptr dead_on_return noundef [[BYVAL_TEMP]]) // LA64-NEXT: ret void // void example_BitInt(void) { diff --git a/clang/test/CodeGen/PowerPC/ppc64-vector.c b/clang/test/CodeGen/PowerPC/ppc64-vector.c index 5d3dd86a009d5..2e99781f84910 100644 --- a/clang/test/CodeGen/PowerPC/ppc64-vector.c +++ b/clang/test/CodeGen/PowerPC/ppc64-vector.c @@ -39,7 +39,7 @@ v8i16 test_v8i16(v8i16 x) return x; } -// CHECK: define{{.*}} void @test_v16i16(ptr dead_on_unwind noalias writable sret(<16 x i16>) align 32 %agg.result, ptr noundef %0) +// CHECK: define{{.*}} void @test_v16i16(ptr dead_on_unwind noalias writable sret(<16 x i16>) align 32 %agg.result, ptr dead_on_return noundef %0) v16i16 test_v16i16(v16i16 x) { return x; diff --git a/clang/test/CodeGen/RISCV/riscv-abi.cpp b/clang/test/CodeGen/RISCV/riscv-abi.cpp index fe1a2b6d8595c..d2e080829e72f 100644 --- a/clang/test/CodeGen/RISCV/riscv-abi.cpp +++ b/clang/test/CodeGen/RISCV/riscv-abi.cpp @@ -75,7 
+75,7 @@ struct child3_int64_s : parent3_float_s { }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @_Z30float_int64_struct_inheritance14child3_int64_s -// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD3_INT64_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD3_INT64_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // // LP64-LABEL: define dso_local [2 x i64] @_Z30float_int64_struct_inheritance14child3_int64_s @@ -99,7 +99,7 @@ struct child4_double_s : parent4_double_s { }; // ILP32-ILP32F-LABEL: define dso_local void @_Z32double_double_struct_inheritance15child4_double_s -// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD4_DOUBLE_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD4_DOUBLE_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local { double, double } @_Z32double_double_struct_inheritance15child4_double_s @@ -130,11 +130,11 @@ struct child5_virtual_s : virtual parent5_virtual_s { }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @_Z38int32_float_virtual_struct_inheritance16child5_virtual_s -// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD5_VIRTUAL_S:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD5_VIRTUAL_S:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // // LP64-LP64F-LP64D-LABEL: define dso_local void @_Z38int32_float_virtual_struct_inheritance16child5_virtual_s -// LP64-LP64F-LP64D-SAME: (ptr 
dead_on_unwind noalias writable sret([[STRUCT_CHILD5_VIRTUAL_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_CHILD5_VIRTUAL_S:%.*]]) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // struct child5_virtual_s int32_float_virtual_struct_inheritance(struct child5_virtual_s a) { diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c index 82e43fff0c3aa..bc89cb532bdcc 100644 --- a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c +++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.c @@ -138,7 +138,7 @@ struct st_i32x4x9 { typedef int __attribute__((vector_size(256))) int32x64_t; -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_too_large(ptr noundef %0) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_too_large(ptr dead_on_return noundef %0) void __attribute__((riscv_vls_cc)) test_too_large(int32x64_t arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_too_large_256( noundef %arg.coerce) void __attribute__((riscv_vls_cc(256))) test_too_large_256(int32x64_t arg) {} @@ -173,9 +173,9 @@ void __attribute__((riscv_vls_cc)) test_st_i32x8x2(struct st_i32x8x2 arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x8x2_256(target("riscv.vector.tuple", , 2) %arg) void __attribute__((riscv_vls_cc(256))) test_st_i32x8x2_256(struct st_i32x8x2 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x64x2(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x64x2(ptr dead_on_return noundef %arg) void __attribute__((riscv_vls_cc)) test_st_i32x64x2(struct st_i32x64x2 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x64x2_256(ptr noundef %arg) +// CHECK-LLVM: define dso_local 
riscv_vls_cc(256) void @test_st_i32x64x2_256(ptr dead_on_return noundef %arg) void __attribute__((riscv_vls_cc(256))) test_st_i32x64x2_256(struct st_i32x64x2 arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x3(target("riscv.vector.tuple", , 3) %arg) @@ -188,7 +188,7 @@ void __attribute__((riscv_vls_cc)) test_st_i32x4x8(struct st_i32x4x8 arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x8_256(target("riscv.vector.tuple", , 8) %arg) void __attribute__((riscv_vls_cc(256))) test_st_i32x4x8_256(struct st_i32x4x8 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x9(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @test_st_i32x4x9(ptr dead_on_return noundef %arg) void __attribute__((riscv_vls_cc)) test_st_i32x4x9(struct st_i32x4x9 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x9_256(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @test_st_i32x4x9_256(ptr dead_on_return noundef %arg) void __attribute__((riscv_vls_cc(256))) test_st_i32x4x9_256(struct st_i32x4x9 arg) {} diff --git a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp index 5f6539796c20d..128610e578c26 100644 --- a/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp +++ b/clang/test/CodeGen/RISCV/riscv-vector-callingconv-llvm-ir.cpp @@ -118,7 +118,7 @@ struct st_i32x4x9 { typedef int __attribute__((vector_size(256))) int32x64_t; -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z14test_too_largeDv64_i(ptr noundef %0) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z14test_too_largeDv64_i(ptr dead_on_return noundef %0) [[riscv::vls_cc]] void test_too_large(int32x64_t arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z18test_too_large_256Dv64_i( noundef %arg.coerce) [[riscv::vls_cc(256)]] void test_too_large_256(int32x64_t arg) {} 
@@ -153,9 +153,9 @@ typedef int __attribute__((vector_size(256))) int32x64_t; // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x8x2_25610st_i32x8x2(target("riscv.vector.tuple", , 2) %arg) [[riscv::vls_cc(256)]] void test_st_i32x8x2_256(struct st_i32x8x2 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z16test_st_i32x64x211st_i32x64x2(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z16test_st_i32x64x211st_i32x64x2(ptr dead_on_return noundef %arg) [[riscv::vls_cc]] void test_st_i32x64x2(struct st_i32x64x2 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z20test_st_i32x64x2_25611st_i32x64x2(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z20test_st_i32x64x2_25611st_i32x64x2(ptr dead_on_return noundef %arg) [[riscv::vls_cc(256)]] void test_st_i32x64x2_256(struct st_i32x64x2 arg) {} // CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x310st_i32x4x3(target("riscv.vector.tuple", , 3) %arg) @@ -168,7 +168,7 @@ typedef int __attribute__((vector_size(256))) int32x64_t; // CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x8_25610st_i32x4x8(target("riscv.vector.tuple", , 8) %arg) [[riscv::vls_cc(256)]] void test_st_i32x4x8_256(struct st_i32x4x8 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x910st_i32x4x9(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(128) void @_Z15test_st_i32x4x910st_i32x4x9(ptr dead_on_return noundef %arg) [[riscv::vls_cc]] void test_st_i32x4x9(struct st_i32x4x9 arg) {} -// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x9_25610st_i32x4x9(ptr noundef %arg) +// CHECK-LLVM: define dso_local riscv_vls_cc(256) void @_Z19test_st_i32x4x9_25610st_i32x4x9(ptr dead_on_return noundef %arg) [[riscv::vls_cc(256)]] void test_st_i32x4x9_256(struct st_i32x4x9 arg) {} diff --git a/clang/test/CodeGen/RISCV/riscv32-abi.c 
b/clang/test/CodeGen/RISCV/riscv32-abi.c index b53f9a9169146..a9e56d40817ae 100644 --- a/clang/test/CodeGen/RISCV/riscv32-abi.c +++ b/clang/test/CodeGen/RISCV/riscv32-abi.c @@ -246,7 +246,7 @@ struct large { }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_agg_large -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[X:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[X:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_agg_large(struct large x) { @@ -266,7 +266,7 @@ struct large f_agg_large_ret(int32_t i, int8_t j) { typedef unsigned char v16i8 __attribute__((vector_size(16))); // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_vec_large_v16i8 -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[TMP0:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_vec_large_v16i8(v16i8 x) { @@ -285,7 +285,7 @@ v16i8 f_vec_large_v16i8_ret(void) { // if they were passed in registers. // ILP32-ILP32F-ILP32D-LABEL: define dso_local i32 @f_scalar_stack_1 -// ILP32-ILP32F-ILP32D-SAME: (i32 [[A_COERCE:%.*]], [2 x i32] [[B_COERCE:%.*]], i64 [[C_COERCE:%.*]], ptr noundef [[D:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]], i8 noundef signext [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (i32 [[A_COERCE:%.*]], [2 x i32] [[B_COERCE:%.*]], i64 [[C_COERCE:%.*]], ptr dead_on_return noundef [[D:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]], i8 noundef signext [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c, @@ -343,7 +343,7 @@ struct large f_scalar_stack_6(float a, int64_t b, double c, long double d, // they would be if passed via registers. 
// ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_agg_stack -// ILP32-ILP32F-ILP32D-SAME: (double noundef [[A:%.*]], i64 noundef [[B:%.*]], double noundef [[C:%.*]], i64 noundef [[D:%.*]], i32 [[E_COERCE:%.*]], [2 x i32] [[F_COERCE:%.*]], i64 [[G_COERCE:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (double noundef [[A:%.*]], i64 noundef [[B:%.*]], double noundef [[C:%.*]], i64 noundef [[D:%.*]], i32 [[E_COERCE:%.*]], [2 x i32] [[F_COERCE:%.*]], i64 [[G_COERCE:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_agg_stack(double a, int64_t b, double c, int64_t d, struct tiny e, @@ -366,7 +366,7 @@ struct double_int8_s { double d; int64_t i; }; struct int_double_s { int a; double b; }; // ILP32-ILP32F-LABEL: define dso_local void @f_int_double_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_int_double_s_arg @@ -482,7 +482,7 @@ struct zbf_double_zbf_s f_ret_zbf_double_zbf_s(void) { struct double_float_s { double f; float g; }; // ILP32-ILP32F-LABEL: define dso_local void @f_double_double_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_double_s_arg @@ -504,7 +504,7 @@ struct double_double_s f_ret_double_double_s(void) { } // ILP32-ILP32F-LABEL: define dso_local void @f_double_float_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_float_s_arg @@ -526,7 +526,7 @@ struct double_float_s f_ret_double_float_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void 
@f_double_double_s_arg_insufficient_fprs -// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_double_double_s_arg_insufficient_fprs(float a, double b, double c, double d, @@ -543,7 +543,7 @@ struct double_int64bf_s { double d; int64_t i : 32; }; struct double_int8_zbf_s { double d; int8_t i; int : 0; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_double_int8_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_double_int8_s_arg(struct double_int8_s a) {} @@ -557,7 +557,7 @@ struct double_int8_s f_ret_double_int8_s(void) { } // ILP32-ILP32F-LABEL: define dso_local void @f_double_uint8_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_uint8_s_arg @@ -579,7 +579,7 @@ struct double_uint8_s f_ret_double_uint8_s(void) { } // ILP32-ILP32F-LABEL: define dso_local void @f_double_int32_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_int32_s_arg @@ -601,7 +601,7 @@ struct double_int32_s f_ret_double_int32_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_double_int64_s_arg -// 
ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_double_int64_s_arg(struct double_int64_s a) {} @@ -615,7 +615,7 @@ struct double_int64_s f_ret_double_int64_s(void) { } // ILP32-ILP32F-LABEL: define dso_local void @f_double_int64bf_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_int64bf_s_arg @@ -640,7 +640,7 @@ struct double_int64bf_s f_ret_double_int64bf_s(void) { // floating point calling convention. // ILP32-ILP32F-LABEL: define dso_local void @f_double_int8_zbf_s -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_int8_zbf_s @@ -662,14 +662,14 @@ struct double_int8_zbf_s f_ret_double_int8_zbf_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_double_int8_s_arg_insufficient_gprs -// ILP32-ILP32F-ILP32D-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], i32 noundef [[H:%.*]], ptr noundef [[I:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], i32 noundef [[H:%.*]], ptr dead_on_return noundef [[I:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_double_int8_s_arg_insufficient_gprs(int a, int b, int c, int d, int e, int f, int g, int h, struct double_int8_s i) {} // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_struct_double_int8_insufficient_fprs -// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], 
double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], double noundef [[H:%.*]], ptr noundef [[I:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], double noundef [[H:%.*]], ptr dead_on_return noundef [[I:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_struct_double_int8_insufficient_fprs(float a, double b, double c, double d, @@ -679,7 +679,7 @@ void f_struct_double_int8_insufficient_fprs(float a, double b, double c, double // floating-point value should be passed as if it were an fp+fp struct. // ILP32-ILP32F-LABEL: define dso_local void @f_doublecomplex -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublecomplex @@ -703,7 +703,7 @@ double __complex__ f_ret_doublecomplex(void) { struct doublecomplex_s { double __complex__ c; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublecomplex_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublecomplex_s_arg @@ -754,7 +754,7 @@ struct doublearr1_s f_ret_doublearr1_s(void) { struct doublearr2_s { double a[2]; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublearr2_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublearr2_s_arg @@ -778,7 +778,7 @@ struct doublearr2_s f_ret_doublearr2_s(void) { struct doublearr2_tricky1_s { struct { double f[1]; } 
g[2]; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublearr2_tricky1_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublearr2_tricky1_s_arg @@ -802,7 +802,7 @@ struct doublearr2_tricky1_s f_ret_doublearr2_tricky1_s(void) { struct doublearr2_tricky2_s { struct {}; struct { double f[1]; } g[2]; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublearr2_tricky2_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublearr2_tricky2_s_arg @@ -826,7 +826,7 @@ struct doublearr2_tricky2_s f_ret_doublearr2_tricky2_s(void) { struct doublearr2_tricky3_s { union {}; struct { double f[1]; } g[2]; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublearr2_tricky3_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublearr2_tricky3_s_arg @@ -850,7 +850,7 @@ struct doublearr2_tricky3_s f_ret_doublearr2_tricky3_s(void) { struct doublearr2_tricky4_s { union {}; struct { struct {}; double f[1]; } g[2]; }; // ILP32-ILP32F-LABEL: define dso_local void @f_doublearr2_tricky4_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_doublearr2_tricky4_s_arg @@ -877,7 +877,7 @@ struct doublearr2_tricky4_s f_ret_doublearr2_tricky4_s(void) { struct int_double_int_s { int a; double b; int c; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int_double_int_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// 
ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int_double_int_s_arg(struct int_double_int_s a) {} @@ -893,7 +893,7 @@ struct int_double_int_s f_ret_int_double_int_s(void) { struct int64_double_s { int64_t a; double b; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int64_double_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int64_double_s_arg(struct int64_double_s a) {} @@ -909,7 +909,7 @@ struct int64_double_s f_ret_int64_double_s(void) { struct char_char_double_s { char a; char b; double c; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_char_char_double_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_char_char_double_s_arg(struct char_char_double_s a) {} @@ -948,7 +948,7 @@ union double_u f_ret_double_u(void) { // double+double structs by the ABI. 
// ILP32-ILP32F-LABEL: define dso_local void @f_ret_double_int32_s_double_int32_s_just_sufficient_gprs -// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_DOUBLE_INT32_S:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_DOUBLE_INT32_S:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local { double, i32 } @f_ret_double_int32_s_double_int32_s_just_sufficient_gprs @@ -961,7 +961,7 @@ struct double_int32_s f_ret_double_int32_s_double_int32_s_just_sufficient_gprs( } // ILP32-ILP32F-LABEL: define dso_local void @f_ret_double_double_s_double_int32_s_just_sufficient_gprs -// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_DOUBLE_DOUBLE_S:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_DOUBLE_DOUBLE_S:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local { double, double } @f_ret_double_double_s_double_int32_s_just_sufficient_gprs @@ -974,7 +974,7 @@ struct double_double_s f_ret_double_double_s_double_int32_s_just_sufficient_gprs 
} // ILP32-ILP32F-LABEL: define dso_local void @f_ret_doublecomplex_double_int32_s_just_sufficient_gprs -// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], i32 noundef [[A:%.*]], i32 noundef [[B:%.*]], i32 noundef [[C:%.*]], i32 noundef [[D:%.*]], i32 noundef [[E:%.*]], i32 noundef [[F:%.*]], i32 noundef [[G:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local { double, double } @f_ret_doublecomplex_double_int32_s_just_sufficient_gprs @@ -1189,7 +1189,7 @@ struct float_int32_s f_ret_float_int32_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_float_int64_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_float_int64_s_arg(struct float_int64_s a) {} @@ -1465,7 +1465,7 @@ struct floatarr2_tricky4_s f_ret_floatarr2_tricky4_s(void) { struct int_float_int_s { int a; float b; int c; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int_float_int_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int_float_int_s_arg(struct int_float_int_s a) {} @@ -1481,7 +1481,7 @@ struct int_float_int_s f_ret_int_float_int_s(void) { struct int64_float_s { int64_t a; float b; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int64_float_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: 
(ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int64_float_s_arg(struct int64_float_s a) {} @@ -1619,7 +1619,7 @@ struct zbf_float16_zbf_s f_ret_zbf_float16_zbf_s(void) { struct double_float16_s { double f; _Float16 g; }; // ILP32-ILP32F-LABEL: define dso_local void @f_double_float16_s_arg -// ILP32-ILP32F-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F: entry: // // ILP32D-LABEL: define dso_local void @f_double_float16_s_arg @@ -1641,7 +1641,7 @@ struct double_float16_s f_ret_double_float16_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_double_float16_s_arg_insufficient_fprs -// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (float noundef [[A:%.*]], double noundef [[B:%.*]], double noundef [[C:%.*]], double noundef [[D:%.*]], double noundef [[E:%.*]], double noundef [[F:%.*]], double noundef [[G:%.*]], ptr dead_on_return noundef [[H:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_double_float16_s_arg_insufficient_fprs(float a, double b, double c, double d, @@ -1725,7 +1725,7 @@ struct float16_int32_s f_ret_float16_int32_s(void) { } // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_float16_int64_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_float16_int64_s_arg(struct float16_int64_s a) {} @@ -2001,7 +2001,7 @@ struct float16arr2_tricky4_s f_ret_float16arr2_tricky4_s(void) { struct int_float16_int_s { int a; _Float16 b; int c; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int_float16_int_s_arg -// 
ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int_float16_int_s_arg(struct int_float16_int_s a) {} @@ -2017,7 +2017,7 @@ struct int_float16_int_s f_ret_int_float16_int_s(void) { struct int64_float16_s { int64_t a; _Float16 b; }; // ILP32-ILP32F-ILP32D-LABEL: define dso_local void @f_int64_float16_s_arg -// ILP32-ILP32F-ILP32D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// ILP32-ILP32F-ILP32D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // ILP32-ILP32F-ILP32D: entry: // void f_int64_float16_s_arg(struct int64_float16_s a) {} diff --git a/clang/test/CodeGen/RISCV/riscv32-vararg.c b/clang/test/CodeGen/RISCV/riscv32-vararg.c index 2b332410f8637..ed301f9269bb8 100644 --- a/clang/test/CodeGen/RISCV/riscv32-vararg.c +++ b/clang/test/CodeGen/RISCV/riscv32-vararg.c @@ -64,7 +64,7 @@ int f_va_callee(int, ...); // CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SMALL_ALIGNED]], ptr [[DOTCOMPOUNDLITERAL4]], i32 0, i32 0 // CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr [[COERCE_DIVE]], align 8 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[BYVAL_TEMP]], ptr align 4 [[DOTCOMPOUNDLITERAL6]], i32 16, i1 false) -// CHECK-NEXT: [[CALL:%.*]] = call i32 (i32, ...) @f_va_callee(i32 noundef 1, i32 noundef 2, i64 noundef 3, double noundef 4.000000e+00, double noundef 5.000000e+00, i32 [[TMP0]], [2 x i32] [[TMP1]], i64 [[TMP2]], ptr noundef [[BYVAL_TEMP]]) +// CHECK-NEXT: [[CALL:%.*]] = call i32 (i32, ...) 
@f_va_callee(i32 noundef 1, i32 noundef 2, i64 noundef 3, double noundef 4.000000e+00, double noundef 5.000000e+00, i32 [[TMP0]], [2 x i32] [[TMP1]], i64 [[TMP2]], ptr dead_on_return noundef [[BYVAL_TEMP]]) // CHECK-NEXT: ret void // void f_va_caller(void) { diff --git a/clang/test/CodeGen/RISCV/riscv64-abi.c b/clang/test/CodeGen/RISCV/riscv64-abi.c index 021565238904e..dc01750e56970 100644 --- a/clang/test/CodeGen/RISCV/riscv64-abi.c +++ b/clang/test/CodeGen/RISCV/riscv64-abi.c @@ -242,7 +242,7 @@ struct large { }; // LP64-LP64F-LP64D-LABEL: define dso_local void @f_agg_large -// LP64-LP64F-LP64D-SAME: (ptr noundef [[X:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_return noundef [[X:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // void f_agg_large(struct large x) { @@ -262,7 +262,7 @@ struct large f_agg_large_ret(int32_t i, int8_t j) { typedef unsigned char v32i8 __attribute__((vector_size(32))); // LP64-LP64F-LP64D-LABEL: define dso_local void @f_vec_large_v32i8 -// LP64-LP64F-LP64D-SAME: (ptr noundef [[TMP0:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_return noundef [[TMP0:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // void f_vec_large_v32i8(v32i8 x) { @@ -281,7 +281,7 @@ v32i8 f_vec_large_v32i8_ret(void) { // if they were passed in registers. 
// LP64-LP64F-LP64D-LABEL: define dso_local signext i32 @f_scalar_stack_1 -// LP64-LP64F-LP64D-SAME: (i64 [[A_COERCE:%.*]], [2 x i64] [[B_COERCE:%.*]], i128 [[C_COERCE:%.*]], ptr noundef [[D:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]], i8 noundef signext [[H:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (i64 [[A_COERCE:%.*]], [2 x i64] [[B_COERCE:%.*]], i128 [[C_COERCE:%.*]], ptr dead_on_return noundef [[D:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]], i8 noundef signext [[H:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c, @@ -290,7 +290,7 @@ int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c, } // LP64-LP64F-LP64D-LABEL: define dso_local signext i32 @f_scalar_stack_2 -// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], i64 noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], i64 noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr dead_on_return noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // int f_scalar_stack_2(int32_t a, __int128_t b, int64_t c, long double d, v32i8 e, @@ -299,7 +299,7 @@ int f_scalar_stack_2(int32_t a, __int128_t b, int64_t c, long double d, v32i8 e, } // LP64-LP64F-LP64D-LABEL: define dso_local signext i32 @f_scalar_stack_3 -// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], double noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: 
(i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], double noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr dead_on_return noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // int f_scalar_stack_3(int32_t a, __int128_t b, double c, long double d, v32i8 e, @@ -312,7 +312,7 @@ int f_scalar_stack_3(int32_t a, __int128_t b, double c, long double d, v32i8 e, // to pass a pointer. // LP64-LP64F-LP64D-LABEL: define dso_local void @f_scalar_stack_4 -// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_LARGE:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], fp128 noundef [[C:%.*]], ptr noundef [[TMP0:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_LARGE:%.*]]) align 8 [[AGG_RESULT:%.*]], i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], fp128 noundef [[C:%.*]], ptr dead_on_return noundef [[TMP0:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // struct large f_scalar_stack_4(uint32_t a, __int128_t b, long double c, v32i8 d, @@ -321,7 +321,7 @@ struct large f_scalar_stack_4(uint32_t a, __int128_t b, long double c, v32i8 d, } // LP64-LP64F-LP64D-LABEL: define dso_local void @f_scalar_stack_5 -// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_LARGE:%.*]]) align 8 [[AGG_RESULT:%.*]], double noundef [[A:%.*]], i128 noundef [[B:%.*]], fp128 noundef [[C:%.*]], ptr noundef [[TMP0:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_unwind noalias writable sret([[STRUCT_LARGE:%.*]]) align 8 [[AGG_RESULT:%.*]], double noundef [[A:%.*]], i128 noundef [[B:%.*]], fp128 
noundef [[C:%.*]], ptr dead_on_return noundef [[TMP0:%.*]], i8 noundef zeroext [[E:%.*]], i8 noundef signext [[F:%.*]], i8 noundef zeroext [[G:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // struct large f_scalar_stack_5(double a, __int128_t b, long double c, v32i8 d, @@ -330,7 +330,7 @@ struct large f_scalar_stack_5(double a, __int128_t b, long double c, v32i8 d, } // LP64-LP64F-LP64D-LABEL: define dso_local signext i32 @f_scalar_stack_6 -// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], float noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]], half noundef [[I:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (i32 noundef signext [[A:%.*]], i128 noundef [[B:%.*]], float noundef [[C:%.*]], fp128 noundef [[D:%.*]], ptr dead_on_return noundef [[TMP0:%.*]], i8 noundef zeroext [[F:%.*]], i8 noundef signext [[G:%.*]], i8 noundef zeroext [[H:%.*]], half noundef [[I:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // int f_scalar_stack_6(int32_t a, __int128_t b, float c, long double d, v32i8 e, @@ -1440,7 +1440,7 @@ struct doublearr2_tricky4_s f_ret_doublearr2_tricky4_s(void) { struct int_double_int_s { int a; double b; int c; }; // LP64-LP64F-LP64D-LABEL: define dso_local void @f_int_double_int_s_arg -// LP64-LP64F-LP64D-SAME: (ptr noundef [[A:%.*]]) #[[ATTR0]] { +// LP64-LP64F-LP64D-SAME: (ptr dead_on_return noundef [[A:%.*]]) #[[ATTR0]] { // LP64-LP64F-LP64D: entry: // void f_int_double_int_s_arg(struct int_double_int_s a) {} diff --git a/clang/test/CodeGen/RISCV/riscv64-vararg.c b/clang/test/CodeGen/RISCV/riscv64-vararg.c index a278f74ca4a86..17802553c795a 100644 --- a/clang/test/CodeGen/RISCV/riscv64-vararg.c +++ b/clang/test/CodeGen/RISCV/riscv64-vararg.c @@ -74,7 +74,7 @@ int f_va_callee(int, ...); // CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_SMALL_ALIGNED]], ptr [[DOTCOMPOUNDLITERAL4]], i32 0, i32 0 
// CHECK-NEXT: [[TMP2:%.*]] = load i128, ptr [[COERCE_DIVE]], align 16 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[BYVAL_TEMP]], ptr align 8 [[DOTCOMPOUNDLITERAL6]], i64 32, i1 false) -// CHECK-NEXT: [[CALL:%.*]] = call signext i32 (i32, ...) @f_va_callee(i32 noundef signext 1, i32 noundef signext 2, i64 noundef 3, double noundef 4.000000e+00, double noundef 5.000000e+00, i64 [[TMP0]], [2 x i64] [[TMP1]], i128 [[TMP2]], ptr noundef [[BYVAL_TEMP]]) +// CHECK-NEXT: [[CALL:%.*]] = call signext i32 (i32, ...) @f_va_callee(i32 noundef signext 1, i32 noundef signext 2, i64 noundef 3, double noundef 4.000000e+00, double noundef 5.000000e+00, i64 [[TMP0]], [2 x i64] [[TMP1]], i128 [[TMP2]], ptr dead_on_return noundef [[BYVAL_TEMP]]) // CHECK-NEXT: [[CALL11:%.*]] = call signext i32 (i32, ...) @f_va_callee(i32 noundef signext 1, i32 noundef signext 2, i32 noundef signext 3, i32 noundef signext 4, fp128 noundef 0xL00000000000000004001400000000000, i32 noundef signext 6, i32 noundef signext 7, i32 noundef signext 8, i32 noundef signext 9) // CHECK-NEXT: [[A13:%.*]] = getelementptr inbounds nuw [[STRUCT_SMALL_ALIGNED]], ptr [[DOTCOMPOUNDLITERAL12]], i32 0, i32 0 // CHECK-NEXT: store i128 5, ptr [[A13]], align 16 diff --git a/clang/test/CodeGen/SystemZ/systemz-abi-vector.c b/clang/test/CodeGen/SystemZ/systemz-abi-vector.c index e5704709a3a33..fab6050a0d876 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi-vector.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi-vector.c @@ -58,91 +58,91 @@ unsigned int align = __alignof__ (v16i8); // CHECK-VECTOR: @align ={{.*}} global i32 8 v1i8 pass_v1i8(v1i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1i8(ptr dead_on_unwind noalias writable sret(<1 x i8>) align 1 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1i8(ptr dead_on_unwind noalias writable sret(<1 x i8>) align 1 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i8> @pass_v1i8(<1 x i8> %{{.*}}) v2i8 pass_v2i8(v2i8 
arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2i8(ptr dead_on_unwind noalias writable sret(<2 x i8>) align 2 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2i8(ptr dead_on_unwind noalias writable sret(<2 x i8>) align 2 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x i8> @pass_v2i8(<2 x i8> %{{.*}}) v4i8 pass_v4i8(v4i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v4i8(ptr dead_on_unwind noalias writable sret(<4 x i8>) align 4 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v4i8(ptr dead_on_unwind noalias writable sret(<4 x i8>) align 4 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <4 x i8> @pass_v4i8(<4 x i8> %{{.*}}) v8i8 pass_v8i8(v8i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v8i8(ptr dead_on_unwind noalias writable sret(<8 x i8>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v8i8(ptr dead_on_unwind noalias writable sret(<8 x i8>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <8 x i8> @pass_v8i8(<8 x i8> %{{.*}}) v16i8 pass_v16i8(v16i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v16i8(ptr dead_on_unwind noalias writable sret(<16 x i8>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v16i8(ptr dead_on_unwind noalias writable sret(<16 x i8>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <16 x i8> @pass_v16i8(<16 x i8> %{{.*}}) v32i8 pass_v32i8(v32i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v32i8(ptr dead_on_unwind noalias writable sret(<32 x i8>) align 32 %{{.*}}, ptr %0) -// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v32i8(ptr dead_on_unwind noalias writable sret(<32 x i8>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v32i8(ptr dead_on_unwind noalias writable sret(<32 x i8>) align 32 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v32i8(ptr 
dead_on_unwind noalias writable sret(<32 x i8>) align 8 %{{.*}}, ptr dead_on_return %0) v1i16 pass_v1i16(v1i16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1i16(ptr dead_on_unwind noalias writable sret(<1 x i16>) align 2 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1i16(ptr dead_on_unwind noalias writable sret(<1 x i16>) align 2 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i16> @pass_v1i16(<1 x i16> %{{.*}}) v2i16 pass_v2i16(v2i16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2i16(ptr dead_on_unwind noalias writable sret(<2 x i16>) align 4 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2i16(ptr dead_on_unwind noalias writable sret(<2 x i16>) align 4 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x i16> @pass_v2i16(<2 x i16> %{{.*}}) v4i16 pass_v4i16(v4i16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v4i16(ptr dead_on_unwind noalias writable sret(<4 x i16>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v4i16(ptr dead_on_unwind noalias writable sret(<4 x i16>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <4 x i16> @pass_v4i16(<4 x i16> %{{.*}}) v8i16 pass_v8i16(v8i16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v8i16(ptr dead_on_unwind noalias writable sret(<8 x i16>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v8i16(ptr dead_on_unwind noalias writable sret(<8 x i16>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <8 x i16> @pass_v8i16(<8 x i16> %{{.*}}) v1i32 pass_v1i32(v1i32 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1i32(ptr dead_on_unwind noalias writable sret(<1 x i32>) align 4 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1i32(ptr dead_on_unwind noalias writable sret(<1 x i32>) align 4 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i32> @pass_v1i32(<1 x 
i32> %{{.*}}) v2i32 pass_v2i32(v2i32 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2i32(ptr dead_on_unwind noalias writable sret(<2 x i32>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2i32(ptr dead_on_unwind noalias writable sret(<2 x i32>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x i32> @pass_v2i32(<2 x i32> %{{.*}}) v4i32 pass_v4i32(v4i32 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v4i32(ptr dead_on_unwind noalias writable sret(<4 x i32>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v4i32(ptr dead_on_unwind noalias writable sret(<4 x i32>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <4 x i32> @pass_v4i32(<4 x i32> %{{.*}}) v1i64 pass_v1i64(v1i64 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1i64(ptr dead_on_unwind noalias writable sret(<1 x i64>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1i64(ptr dead_on_unwind noalias writable sret(<1 x i64>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i64> @pass_v1i64(<1 x i64> %{{.*}}) v2i64 pass_v2i64(v2i64 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2i64(ptr dead_on_unwind noalias writable sret(<2 x i64>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2i64(ptr dead_on_unwind noalias writable sret(<2 x i64>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x i64> @pass_v2i64(<2 x i64> %{{.*}}) v1i128 pass_v1i128(v1i128 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1i128(ptr dead_on_unwind noalias writable sret(<1 x i128>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1i128(ptr dead_on_unwind noalias writable sret(<1 x i128>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i128> @pass_v1i128(<1 x i128> %{{.*}}) v1f32 pass_v1f32(v1f32 arg) { return arg; } -// 
CHECK-LABEL: define{{.*}} void @pass_v1f32(ptr dead_on_unwind noalias writable sret(<1 x float>) align 4 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1f32(ptr dead_on_unwind noalias writable sret(<1 x float>) align 4 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x float> @pass_v1f32(<1 x float> %{{.*}}) v2f32 pass_v2f32(v2f32 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2f32(ptr dead_on_unwind noalias writable sret(<2 x float>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2f32(ptr dead_on_unwind noalias writable sret(<2 x float>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x float> @pass_v2f32(<2 x float> %{{.*}}) v4f32 pass_v4f32(v4f32 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v4f32(ptr dead_on_unwind noalias writable sret(<4 x float>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v4f32(ptr dead_on_unwind noalias writable sret(<4 x float>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <4 x float> @pass_v4f32(<4 x float> %{{.*}}) v1f64 pass_v1f64(v1f64 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v1f64(ptr dead_on_unwind noalias writable sret(<1 x double>) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1f64(ptr dead_on_unwind noalias writable sret(<1 x double>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x double> @pass_v1f64(<1 x double> %{{.*}}) v2f64 pass_v2f64(v2f64 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_v2f64(ptr dead_on_unwind noalias writable sret(<2 x double>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v2f64(ptr dead_on_unwind noalias writable sret(<2 x double>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x double> @pass_v2f64(<2 x double> %{{.*}}) v1f128 pass_v1f128(v1f128 arg) { return arg; } -// CHECK-LABEL: 
define{{.*}} void @pass_v1f128(ptr dead_on_unwind noalias writable sret(<1 x fp128>) align 16 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_v1f128(ptr dead_on_unwind noalias writable sret(<1 x fp128>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x fp128> @pass_v1f128(<1 x fp128> %{{.*}}) @@ -170,13 +170,13 @@ struct agg_v8i8 pass_agg_v8i8(struct agg_v8i8 arg) { return arg; } struct agg_v16i8 { v16i8 a; }; struct agg_v16i8 pass_agg_v16i8(struct agg_v16i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_v16i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v16i8) align 16 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_v16i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v16i8) align 16 %{{.*}}, ptr dead_on_return %{{.*}}) // CHECK-VECTOR-LABEL: define{{.*}} void @pass_agg_v16i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v16i8) align 8 %{{.*}}, <16 x i8> %{{.*}}) struct agg_v32i8 { v32i8 a; }; struct agg_v32i8 pass_agg_v32i8(struct agg_v32i8 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_v32i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v32i8) align 32 %{{.*}}, ptr %{{.*}}) -// CHECK-VECTOR-LABEL: define{{.*}} void @pass_agg_v32i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v32i8) align 8 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_v32i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v32i8) align 32 %{{.*}}, ptr dead_on_return %{{.*}}) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_agg_v32i8(ptr dead_on_unwind noalias writable sret(%struct.agg_v32i8) align 8 %{{.*}}, ptr dead_on_return %{{.*}}) // Verify that the following are *not* vector-like aggregate types diff --git a/clang/test/CodeGen/SystemZ/systemz-abi.c b/clang/test/CodeGen/SystemZ/systemz-abi.c index f26084ab44eae..83137ae6d5f82 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi.c @@ -50,7 
+50,7 @@ long long pass_longlong(long long arg) { return arg; } // CHECK-LABEL: define{{.*}} i64 @pass_longlong(i64 %{{.*}}) __int128 pass_int128(__int128 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_int128(ptr dead_on_unwind noalias writable sret(i128) align 8 %{{.*}}, ptr dead_on_return %0) _Float16 pass__Float16(_Float16 arg) { return arg; } // CHECK-LABEL: define{{.*}} half @pass__Float16(half %{{.*}}) @@ -62,37 +62,37 @@ double pass_double(double arg) { return arg; } // CHECK-LABEL: define{{.*}} double @pass_double(double %{{.*}}) long double pass_longdouble(long double arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_longdouble(ptr dead_on_unwind noalias writable sret(fp128) align 8 %{{.*}}, ptr %0) +// CHECK-LABEL: define{{.*}} void @pass_longdouble(ptr dead_on_unwind noalias writable sret(fp128) align 8 %{{.*}}, ptr dead_on_return %0) // Complex types _Complex char pass_complex_char(_Complex char arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_char(ptr dead_on_unwind noalias writable sret({ i8, i8 }) align 1 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_char(ptr dead_on_unwind noalias writable sret({ i8, i8 }) align 1 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex short pass_complex_short(_Complex short arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_short(ptr dead_on_unwind noalias writable sret({ i16, i16 }) align 2 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_short(ptr dead_on_unwind noalias writable sret({ i16, i16 }) align 2 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex int pass_complex_int(_Complex int arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_int(ptr dead_on_unwind noalias writable sret({ i32, i32 }) align 4 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void 
@pass_complex_int(ptr dead_on_unwind noalias writable sret({ i32, i32 }) align 4 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex long pass_complex_long(_Complex long arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_long(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_long(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex long long pass_complex_longlong(_Complex long long arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_longlong(ptr dead_on_unwind noalias writable sret({ i64, i64 }) align 8 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex _Float16 pass_complex__Float16(_Complex _Float16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex__Float16(ptr dead_on_unwind noalias writable sret({ half, half }) align 2 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex float pass_complex_float(_Complex float arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_float(ptr dead_on_unwind noalias writable sret({ float, float }) align 4 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_float(ptr dead_on_unwind noalias writable sret({ float, float }) align 4 %{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex double pass_complex_double(_Complex double arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_double(ptr dead_on_unwind noalias writable sret({ double, double }) align 8 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_double(ptr dead_on_unwind noalias writable sret({ double, double }) align 8 
%{{.*}}, ptr dead_on_return %{{.*}}arg) _Complex long double pass_complex_longdouble(_Complex long double arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_complex_longdouble(ptr dead_on_unwind noalias writable sret({ fp128, fp128 }) align 8 %{{.*}}, ptr %{{.*}}arg) +// CHECK-LABEL: define{{.*}} void @pass_complex_longdouble(ptr dead_on_unwind noalias writable sret({ fp128, fp128 }) align 8 %{{.*}}, ptr dead_on_return %{{.*}}arg) // Aggregate types @@ -107,7 +107,7 @@ struct agg_2byte pass_agg_2byte(struct agg_2byte arg) { return arg; } struct agg_3byte { char a[3]; }; struct agg_3byte pass_agg_3byte(struct agg_3byte arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_3byte(ptr dead_on_unwind noalias writable sret(%struct.agg_3byte) align 1 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_3byte(ptr dead_on_unwind noalias writable sret(%struct.agg_3byte) align 1 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg_4byte { char a[4]; }; struct agg_4byte pass_agg_4byte(struct agg_4byte arg) { return arg; } @@ -115,15 +115,15 @@ struct agg_4byte pass_agg_4byte(struct agg_4byte arg) { return arg; } struct agg_5byte { char a[5]; }; struct agg_5byte pass_agg_5byte(struct agg_5byte arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_5byte(ptr dead_on_unwind noalias writable sret(%struct.agg_5byte) align 1 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_5byte(ptr dead_on_unwind noalias writable sret(%struct.agg_5byte) align 1 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg_6byte { char a[6]; }; struct agg_6byte pass_agg_6byte(struct agg_6byte arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_6byte(ptr dead_on_unwind noalias writable sret(%struct.agg_6byte) align 1 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_6byte(ptr dead_on_unwind noalias writable sret(%struct.agg_6byte) align 1 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg_7byte { char a[7]; }; struct agg_7byte 
pass_agg_7byte(struct agg_7byte arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_7byte(ptr dead_on_unwind noalias writable sret(%struct.agg_7byte) align 1 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_7byte(ptr dead_on_unwind noalias writable sret(%struct.agg_7byte) align 1 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg_8byte { char a[8]; }; struct agg_8byte pass_agg_8byte(struct agg_8byte arg) { return arg; } @@ -131,7 +131,7 @@ struct agg_8byte pass_agg_8byte(struct agg_8byte arg) { return arg; } struct agg_16byte { char a[16]; }; struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_16byte(ptr dead_on_unwind noalias writable sret(%struct.agg_16byte) align 1 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_16byte(ptr dead_on_unwind noalias writable sret(%struct.agg_16byte) align 1 %{{.*}}, ptr dead_on_return %{{.*}}) // Float-like aggregate types @@ -153,7 +153,7 @@ struct agg_double pass_agg_double(struct agg_double arg) { return arg; } struct agg_longdouble { long double a; }; struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_longdouble(ptr dead_on_unwind noalias writable sret(%struct.agg_longdouble) align 8 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg__Float16_a4 { _Float16 a __attribute__((aligned (4))); }; struct agg__Float16_a4 pass_agg__Float16_a4(struct agg__Float16_a4 arg) { return arg; } @@ -167,7 +167,7 @@ struct agg__Float16_a8 pass_agg__Float16_a8(struct agg__Float16_a8 arg) { return struct agg__Float16_a16 { _Float16 a __attribute__((aligned (16))); }; struct agg__Float16_a16 pass_agg__Float16_a16(struct agg__Float16_a16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void 
@pass_agg__Float16_a16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a16) align 16 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg__Float16_a16(ptr dead_on_unwind noalias writable sret(%struct.agg__Float16_a16) align 16 %{{.*}}, ptr dead_on_return %{{.*}}) struct agg_float_a8 { float a __attribute__((aligned (8))); }; struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; } @@ -176,7 +176,7 @@ struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; } struct agg_float_a16 { float a __attribute__((aligned (16))); }; struct agg_float_a16 pass_agg_float_a16(struct agg_float_a16 arg) { return arg; } -// CHECK-LABEL: define{{.*}} void @pass_agg_float_a16(ptr dead_on_unwind noalias writable sret(%struct.agg_float_a16) align 16 %{{.*}}, ptr %{{.*}}) +// CHECK-LABEL: define{{.*}} void @pass_agg_float_a16(ptr dead_on_unwind noalias writable sret(%struct.agg_float_a16) align 16 %{{.*}}, ptr dead_on_return %{{.*}}) // Verify that the following are *not* float-like aggregate types diff --git a/clang/test/CodeGen/SystemZ/systemz-inline-asm.c b/clang/test/CodeGen/SystemZ/systemz-inline-asm.c index 434937a66389c..d76fb4bd1fda6 100644 --- a/clang/test/CodeGen/SystemZ/systemz-inline-asm.c +++ b/clang/test/CodeGen/SystemZ/systemz-inline-asm.c @@ -131,7 +131,7 @@ double test_f64(double f, double g) { long double test_f128(long double f, long double g) { asm("axbr %0, %2" : "=f" (f) : "0" (f), "f" (g)); return f; -// CHECK: define{{.*}} void @test_f128(ptr dead_on_unwind noalias writable writeonly sret(fp128) align 8 captures(none) initializes((0, 16)) [[DEST:%.*]], ptr noundef readonly captures(none) %0, ptr noundef readonly captures(none) %1) +// CHECK: define{{.*}} void @test_f128(ptr dead_on_unwind noalias writable writeonly sret(fp128) align 8 captures(none) initializes((0, 16)) [[DEST:%.*]], ptr dead_on_return noundef readonly captures(none) %0, ptr dead_on_return noundef readonly captures(none) %1) // 
CHECK: %f = load fp128, ptr %0 // CHECK: %g = load fp128, ptr %1 // CHECK: [[RESULT:%.*]] = tail call fp128 asm "axbr $0, $2", "=f,0,f"(fp128 %f, fp128 %g) diff --git a/clang/test/CodeGen/X86/cx-complex-range.c b/clang/test/CodeGen/X86/cx-complex-range.c index f87091427df71..b16b10b7b8a21 100644 --- a/clang/test/CodeGen/X86/cx-complex-range.c +++ b/clang/test/CodeGen/X86/cx-complex-range.c @@ -1064,7 +1064,7 @@ _Complex _Float16 mulf16(_Complex _Float16 a, _Complex _Float16 b) { // PRMTD-NEXT: ret <2 x half> [[TMP33]] // // X86WINPRMTD-LABEL: define dso_local i32 @f1( -// X86WINPRMTD-SAME: i32 noundef [[A_COERCE:%.*]], ptr noundef [[B:%.*]], i32 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: i32 noundef [[A_COERCE:%.*]], ptr dead_on_return noundef [[B:%.*]], i32 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[RETVAL:%.*]] = alloca { half, half }, align 2 // X86WINPRMTD-NEXT: [[A:%.*]] = alloca { half, half }, align 2 diff --git a/clang/test/CodeGen/X86/x86_32-arguments-win32.c b/clang/test/CodeGen/X86/x86_32-arguments-win32.c index 5b81c43f4bbb8..53d040af725b0 100644 --- a/clang/test/CodeGen/X86/x86_32-arguments-win32.c +++ b/clang/test/CodeGen/X86/x86_32-arguments-win32.c @@ -72,10 +72,10 @@ void receive_vec_512(__m512 x, __m512 y, __m512 z, __m512 w, __m512 q) { void receive_vec_1024(__m1024 x, __m1024 y, __m1024 z, __m1024 w, __m1024 q) { gv1024 = x + y + z + w + q; } -// CHECK-LABEL: define dso_local void @receive_vec_128(<4 x float> inreg noundef %x, <4 x float> inreg noundef %y, <4 x float> inreg noundef %z, ptr noundef %0, ptr noundef %1) -// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg noundef %x, <8 x float> inreg noundef %y, <8 x float> inreg noundef %z, ptr noundef %0, ptr noundef %1) -// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg noundef %x, <16 x float> inreg noundef %y, <16 x float> inreg noundef %z, ptr noundef %0, ptr noundef %1) -// 
CHECK-LABEL: define dso_local void @receive_vec_1024(ptr noundef %0, ptr noundef %1, ptr noundef %2, ptr noundef %3, ptr noundef %4) +// CHECK-LABEL: define dso_local void @receive_vec_128(<4 x float> inreg noundef %x, <4 x float> inreg noundef %y, <4 x float> inreg noundef %z, ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) +// CHECK-LABEL: define dso_local void @receive_vec_256(<8 x float> inreg noundef %x, <8 x float> inreg noundef %y, <8 x float> inreg noundef %z, ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) +// CHECK-LABEL: define dso_local void @receive_vec_512(<16 x float> inreg noundef %x, <16 x float> inreg noundef %y, <16 x float> inreg noundef %z, ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) +// CHECK-LABEL: define dso_local void @receive_vec_1024(ptr dead_on_return noundef %0, ptr dead_on_return noundef %1, ptr dead_on_return noundef %2, ptr dead_on_return noundef %3, ptr dead_on_return noundef %4) void pass_vec_128(void) { __m128 z = {0}; @@ -83,13 +83,13 @@ void pass_vec_128(void) { } // CHECK-LABEL: define dso_local void @pass_vec_128() -// CHECK: call void @receive_vec_128(<4 x float> inreg noundef %{{[^,)]*}}, <4 x float> inreg noundef %{{[^,)]*}}, <4 x float> inreg noundef %{{[^,)]*}}, ptr noundef %{{[^,)]*}}, ptr noundef %{{[^,)]*}}) +// CHECK: call void @receive_vec_128(<4 x float> inreg noundef %{{[^,)]*}}, <4 x float> inreg noundef %{{[^,)]*}}, <4 x float> inreg noundef %{{[^,)]*}}, ptr dead_on_return noundef %{{[^,)]*}}, ptr dead_on_return noundef %{{[^,)]*}}) void __fastcall fastcall_indirect_vec(__m128 x, __m128 y, __m128 z, __m128 w, int edx, __m128 q) { gv128 = x + y + z + w + q; } -// CHECK-LABEL: define dso_local x86_fastcallcc void @"\01@fastcall_indirect_vec@84"(<4 x float> inreg noundef %x, <4 x float> inreg noundef %y, <4 x float> inreg noundef %z, ptr inreg noundef %0, i32 inreg noundef %edx, ptr noundef %1) +// CHECK-LABEL: define dso_local x86_fastcallcc void 
@"\01@fastcall_indirect_vec@84"(<4 x float> inreg noundef %x, <4 x float> inreg noundef %y, <4 x float> inreg noundef %z, ptr dead_on_return inreg noundef %0, i32 inreg noundef %edx, ptr dead_on_return noundef %1) struct __declspec(align(1)) Align1 { unsigned long long x; }; struct __declspec(align(4)) Align4 { unsigned long long x; }; @@ -156,4 +156,4 @@ void pass_fixed_align_variadic() { // correctly in Clang than it is to be bug for bug compatible, so we pass such // arguments indirectly. // CHECK-LABEL: define dso_local void @pass_fixed_align_variadic() -// CHECK: call void (ptr, ...) @receive_fixed_align_variadic(ptr noundef %{{[^)]*}}, i32 noundef 42) +// CHECK: call void (ptr, ...) @receive_fixed_align_variadic(ptr dead_on_return noundef %{{[^)]*}}, i32 noundef 42) diff --git a/clang/test/CodeGen/X86/x86_64-arguments-win32.c b/clang/test/CodeGen/X86/x86_64-arguments-win32.c index 8768e73a854aa..6010e531acb00 100644 --- a/clang/test/CodeGen/X86/x86_64-arguments-win32.c +++ b/clang/test/CodeGen/X86/x86_64-arguments-win32.c @@ -21,7 +21,7 @@ void f4(unsigned short a) {} // CHECK-LABEL: define dso_local void @f5(i64 noundef %a.coerce) void f5(_Complex float a) {} -// CHECK-LABEL: define dso_local void @f6(ptr noundef %a) +// CHECK-LABEL: define dso_local void @f6(ptr dead_on_return noundef %a) void f6(_Complex double a) {} // CHECK-LABEL: define dso_local i64 @f7() diff --git a/clang/test/CodeGen/aapcs64-align.cpp b/clang/test/CodeGen/aapcs64-align.cpp index 1c26d68e434f4..53fc53c2f7296 100644 --- a/clang/test/CodeGen/aapcs64-align.cpp +++ b/clang/test/CodeGen/aapcs64-align.cpp @@ -122,8 +122,8 @@ unsigned sizeof_RidiculouslyOverSizedBitfield = sizeof(RidiculouslyOverSizedBitf unsigned alignof_RidiculouslyOverSizedBitfield = alignof(RidiculouslyOverSizedBitfield); // CHECK: define{{.*}} void @g9 -// CHECK: call void @f9(i32 noundef 1, ptr noundef nonnull %agg.tmp) -// CHECK: declare void @f9(i32 noundef, ptr noundef) +// CHECK: call void @f9(i32 noundef 1, ptr 
dead_on_return noundef nonnull %agg.tmp) +// CHECK: declare void @f9(i32 noundef, ptr dead_on_return noundef) void f9(int a, RidiculouslyOverSizedBitfield b); void g9() { RidiculouslyOverSizedBitfield s = {42}; diff --git a/clang/test/CodeGen/arm-aapcs-vfp.c b/clang/test/CodeGen/arm-aapcs-vfp.c index 6581929f99f14..e60ed1e52c33a 100644 --- a/clang/test/CodeGen/arm-aapcs-vfp.c +++ b/clang/test/CodeGen/arm-aapcs-vfp.c @@ -65,7 +65,7 @@ struct big_struct { float f4; }; // CHECK: define{{.*}} arm_aapcs_vfpcc void @test_big([5 x i32] %{{.*}}) -// CHECK64: define{{.*}} void @test_big(ptr noundef %{{.*}}) +// CHECK64: define{{.*}} void @test_big(ptr dead_on_return noundef %{{.*}}) // CHECK64: call void @llvm.memcpy // CHECK64: call void @big_callee(ptr extern void big_callee(struct big_struct); diff --git a/clang/test/CodeGen/arm-abi-vector.c b/clang/test/CodeGen/arm-abi-vector.c index c2a8902007980..93b770878c3fa 100644 --- a/clang/test/CodeGen/arm-abi-vector.c +++ b/clang/test/CodeGen/arm-abi-vector.c @@ -177,11 +177,11 @@ double varargs_vec_19c(int fixed, ...) { double test_19c(__char19 *in) { // CHECK: test_19c -// CHECK: call arm_aapcscc double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr noundef {{%.*}}) +// CHECK: call arm_aapcscc double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr dead_on_return noundef {{%.*}}) // APCS-GNU: test_19c -// APCS-GNU: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr noundef {{%.*}}) +// APCS-GNU: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr dead_on_return noundef {{%.*}}) // ANDROID: test_19c -// ANDROID: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr noundef {{%.*}}) +// ANDROID: call double (i32, ...) 
@varargs_vec_19c(i32 noundef 19, ptr dead_on_return noundef {{%.*}}) return varargs_vec_19c(19, *in); } diff --git a/clang/test/CodeGen/arm-swiftcall.c b/clang/test/CodeGen/arm-swiftcall.c index 677b878c6765d..55c22a45fd8de 100644 --- a/clang/test/CodeGen/arm-swiftcall.c +++ b/clang/test/CodeGen/arm-swiftcall.c @@ -261,7 +261,7 @@ TEST(struct_big_1) // CHECK-LABEL: define{{.*}} void @return_struct_big_1({{.*}} dead_on_unwind noalias writable sret({{.*}}) // Should not be byval. -// CHECK-LABEL: define{{.*}} void @take_struct_big_1(ptr{{( %.*)?}}) +// CHECK-LABEL: define{{.*}} void @take_struct_big_1(ptr dead_on_return{{( %.*)?}}) /*****************************************************************************/ /********************************* TYPE MERGING ******************************/ diff --git a/clang/test/CodeGen/arm64-abi-vector.c b/clang/test/CodeGen/arm64-abi-vector.c index 81e42315c883b..cf50cdd2fe86e 100644 --- a/clang/test/CodeGen/arm64-abi-vector.c +++ b/clang/test/CodeGen/arm64-abi-vector.c @@ -128,7 +128,7 @@ double varargs_vec_19c(int fixed, ...) { double test_19c(__char19 *in) { // CHECK: test_19c -// CHECK: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr noundef {{%.*}}) +// CHECK: call double (i32, ...) @varargs_vec_19c(i32 noundef 19, ptr dead_on_return noundef {{%.*}}) return varargs_vec_19c(19, *in); } @@ -211,7 +211,7 @@ double varargs_vec_5i(int fixed, ...) { double test_5i(__int5 *in) { // CHECK: test_5i -// CHECK: call double (i32, ...) @varargs_vec_5i(i32 noundef 5, ptr noundef {{%.*}}) +// CHECK: call double (i32, ...) @varargs_vec_5i(i32 noundef 5, ptr dead_on_return noundef {{%.*}}) return varargs_vec_5i(5, *in); } @@ -231,7 +231,7 @@ double varargs_vec_3d(int fixed, ...) { double test_3d(__double3 *in) { // CHECK: test_3d -// CHECK: call double (i32, ...) @varargs_vec_3d(i32 noundef 3, ptr noundef {{%.*}}) +// CHECK: call double (i32, ...) 
@varargs_vec_3d(i32 noundef 3, ptr dead_on_return noundef {{%.*}}) return varargs_vec_3d(3, *in); } @@ -291,7 +291,7 @@ double test(__char3 *c3, __char5 *c5, __char9 *c9, __char19 *c19, __short3 *s3, __short5 *s5, __int3 *i3, __int5 *i5, __double3 *d3) { double ret = varargs_vec(3, *c3, *c5, *c9, *c19, *s3, *s5, *i3, *i5, *d3); -// CHECK: call double (i32, ...) @varargs_vec(i32 noundef 3, i32 {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> {{%.*}}, ptr noundef {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> {{%.*}}, <4 x i32> {{%.*}}, ptr noundef {{%.*}}, ptr noundef {{%.*}}) +// CHECK: call double (i32, ...) @varargs_vec(i32 noundef 3, i32 {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> {{%.*}}, ptr dead_on_return noundef {{%.*}}, <2 x i32> {{%.*}}, <4 x i32> {{%.*}}, <4 x i32> {{%.*}}, ptr dead_on_return noundef {{%.*}}, ptr dead_on_return noundef {{%.*}}) return ret; } @@ -350,7 +350,7 @@ __attribute__((noinline)) double args_vec_19c(int fixed, __char19 c19) { double fixed_19c(__char19 *in) { // CHECK: fixed_19c -// CHECK: call double @args_vec_19c(i32 noundef 19, ptr noundef {{%.*}}) +// CHECK: call double @args_vec_19c(i32 noundef 19, ptr dead_on_return noundef {{%.*}}) return args_vec_19c(19, *in); } @@ -409,7 +409,7 @@ __attribute__((noinline)) double args_vec_5i(int fixed, __int5 c5) { double fixed_5i(__int5 *in) { // CHECK: fixed_5i -// CHECK: call double @args_vec_5i(i32 noundef 5, ptr noundef {{%.*}}) +// CHECK: call double @args_vec_5i(i32 noundef 5, ptr dead_on_return noundef {{%.*}}) return args_vec_5i(5, *in); } @@ -424,6 +424,6 @@ __attribute__((noinline)) double args_vec_3d(int fixed, __double3 c3) { double fixed_3d(__double3 *in) { // CHECK: fixed_3d -// CHECK: call double @args_vec_3d(i32 noundef 3, ptr noundef {{%.*}}) +// CHECK: call double @args_vec_3d(i32 noundef 3, ptr dead_on_return noundef {{%.*}}) return args_vec_3d(3, *in); } diff --git a/clang/test/CodeGen/arm64-arguments.c b/clang/test/CodeGen/arm64-arguments.c index 4c4f85d923e78..2e3ab388432f6 100644 --- 
a/clang/test/CodeGen/arm64-arguments.c +++ b/clang/test/CodeGen/arm64-arguments.c @@ -163,7 +163,7 @@ void f32(struct s32 s) { } // A composite type larger than 16 bytes should be passed indirectly. struct s33 { char buf[32*32]; }; void f33(struct s33 s) { } -// CHECK: define{{.*}} void @f33(ptr noundef %s) +// CHECK: define{{.*}} void @f33(ptr dead_on_return noundef %s) struct s34 { char c; }; void f34(struct s34 s); @@ -226,9 +226,9 @@ T_float32x2 f1_0(T_float32x2 a0) { return a0; } // CHECK: define{{.*}} <4 x float> @f1_1(<4 x float> noundef %{{.*}}) T_float32x4 f1_1(T_float32x4 a0) { return a0; } // Vector with length bigger than 16-byte is illegal and is passed indirectly. -// CHECK: define{{.*}} void @f1_2(ptr dead_on_unwind noalias writable sret(<8 x float>) align 16 %{{.*}}, ptr noundef %0) +// CHECK: define{{.*}} void @f1_2(ptr dead_on_unwind noalias writable sret(<8 x float>) align 16 %{{.*}}, ptr dead_on_return noundef %0) T_float32x8 f1_2(T_float32x8 a0) { return a0; } -// CHECK: define{{.*}} void @f1_3(ptr dead_on_unwind noalias writable sret(<16 x float>) align 16 %{{.*}}, ptr noundef %0) +// CHECK: define{{.*}} void @f1_3(ptr dead_on_unwind noalias writable sret(<16 x float>) align 16 %{{.*}}, ptr dead_on_return noundef %0) T_float32x16 f1_3(T_float32x16 a0) { return a0; } // Testing alignment with aggregates: HFA, aggregates with size <= 16 bytes and @@ -278,7 +278,7 @@ struct s37 typedef struct s37 s37_with_align; int32x4_t f37(int i, s37_with_align s1, s37_with_align s2) { -// CHECK: define{{.*}} <4 x i32> @f37(i32 noundef %i, ptr noundef %s1, ptr noundef %s2) +// CHECK: define{{.*}} <4 x i32> @f37(i32 noundef %i, ptr dead_on_return noundef %s1, ptr dead_on_return noundef %s2) // CHECK: load <4 x i32>, ptr %s1, align 16 // CHECK: load <4 x i32>, ptr %s2, align 16 int32x4_t v = vaddq_s32(*(int32x4_t *)&s1, @@ -292,7 +292,7 @@ int32x4_t caller37() { // CHECK: %[[b:.*]] = alloca %struct.s37, align 16 // CHECK: call void @llvm.memcpy // CHECK: call 
void @llvm.memcpy -// CHECK: call <4 x i32> @f37(i32 noundef 3, ptr noundef %[[a]], ptr noundef %[[b]]) +// CHECK: call <4 x i32> @f37(i32 noundef 3, ptr dead_on_return noundef %[[a]], ptr dead_on_return noundef %[[b]]) return f37(3, g37, g37); } @@ -530,7 +530,7 @@ typedef struct s42 s42_no_align; // passing structs in registers __attribute__ ((noinline)) int f42(int i, s42_no_align s1, s42_no_align s2) { -// CHECK: define{{.*}} i32 @f42(i32 noundef %i, ptr noundef %s1, ptr noundef %s2) +// CHECK: define{{.*}} i32 @f42(i32 noundef %i, ptr dead_on_return noundef %s1, ptr dead_on_return noundef %s2) // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s1, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s2, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s1, i32 0, i32 1 @@ -545,14 +545,14 @@ int caller42() { // CHECK: %[[b:.*]] = alloca %struct.s42, align 4 // CHECK: call void @llvm.memcpy.p0.p0.i64 // CHECK: call void @llvm.memcpy.p0.p0.i64 -// CHECK: call i32 @f42(i32 noundef 3, ptr noundef %[[a]], ptr noundef %[[b]]) +// CHECK: call i32 @f42(i32 noundef 3, ptr dead_on_return noundef %[[a]], ptr dead_on_return noundef %[[b]]) return f42(3, g42, g42_2); } // passing structs on stack __attribute__ ((noinline)) int f42_stack(int i, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, s42_no_align s1, s42_no_align s2) { -// CHECK: define{{.*}} i32 @f42_stack(i32 noundef %i, i32 noundef %i2, i32 noundef %i3, i32 noundef %i4, i32 noundef %i5, i32 noundef %i6, i32 noundef %i7, i32 noundef %i8, i32 noundef %i9, ptr noundef %s1, ptr noundef %s2) +// CHECK: define{{.*}} i32 @f42_stack(i32 noundef %i, i32 noundef %i2, i32 noundef %i3, i32 noundef %i4, i32 noundef %i5, i32 noundef %i6, i32 noundef %i7, i32 noundef %i8, i32 noundef %i9, ptr dead_on_return noundef %s1, ptr dead_on_return noundef %s2) // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s1, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s42, ptr 
%s2, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s42, ptr %s1, i32 0, i32 1 @@ -565,7 +565,7 @@ int caller42_stack() { // CHECK: %[[b:.*]] = alloca %struct.s42, align 4 // CHECK: call void @llvm.memcpy.p0.p0.i64 // CHECK: call void @llvm.memcpy.p0.p0.i64 -// CHECK: call i32 @f42_stack(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, i32 noundef 6, i32 noundef 7, i32 noundef 8, i32 noundef 9, ptr noundef %[[a]], ptr noundef %[[b]]) +// CHECK: call i32 @f42_stack(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, i32 noundef 6, i32 noundef 7, i32 noundef 8, i32 noundef 9, ptr dead_on_return noundef %[[a]], ptr dead_on_return noundef %[[b]]) return f42_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, g42, g42_2); } @@ -583,7 +583,7 @@ typedef struct s43 s43_with_align; // passing aligned structs in registers __attribute__ ((noinline)) int f43(int i, s43_with_align s1, s43_with_align s2) { -// CHECK: define{{.*}} i32 @f43(i32 noundef %i, ptr noundef %s1, ptr noundef %s2) +// CHECK: define{{.*}} i32 @f43(i32 noundef %i, ptr dead_on_return noundef %s1, ptr dead_on_return noundef %s2) // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s1, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s2, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s1, i32 0, i32 1 @@ -598,14 +598,14 @@ int caller43() { // CHECK: %[[b:.*]] = alloca %struct.s43, align 16 // CHECK: call void @llvm.memcpy.p0.p0.i64 // CHECK: call void @llvm.memcpy.p0.p0.i64 -// CHECK: call i32 @f43(i32 noundef 3, ptr noundef %[[a]], ptr noundef %[[b]]) +// CHECK: call i32 @f43(i32 noundef 3, ptr dead_on_return noundef %[[a]], ptr dead_on_return noundef %[[b]]) return f43(3, g43, g43_2); } // passing aligned structs on stack __attribute__ ((noinline)) int f43_stack(int i, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, s43_with_align s1, s43_with_align s2) { -// CHECK: define{{.*}} i32 @f43_stack(i32 noundef %i, 
i32 noundef %i2, i32 noundef %i3, i32 noundef %i4, i32 noundef %i5, i32 noundef %i6, i32 noundef %i7, i32 noundef %i8, i32 noundef %i9, ptr noundef %s1, ptr noundef %s2) +// CHECK: define{{.*}} i32 @f43_stack(i32 noundef %i, i32 noundef %i2, i32 noundef %i3, i32 noundef %i4, i32 noundef %i5, i32 noundef %i6, i32 noundef %i7, i32 noundef %i8, i32 noundef %i9, ptr dead_on_return noundef %s1, ptr dead_on_return noundef %s2) // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s1, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s2, i32 0, i32 0 // CHECK: getelementptr inbounds nuw %struct.s43, ptr %s1, i32 0, i32 1 @@ -618,7 +618,7 @@ int caller43_stack() { // CHECK: %[[b:.*]] = alloca %struct.s43, align 16 // CHECK: call void @llvm.memcpy.p0.p0.i64 // CHECK: call void @llvm.memcpy.p0.p0.i64 -// CHECK: call i32 @f43_stack(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, i32 noundef 6, i32 noundef 7, i32 noundef 8, i32 noundef 9, ptr noundef %[[a]], ptr noundef %[[b]]) +// CHECK: call i32 @f43_stack(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, i32 noundef 6, i32 noundef 7, i32 noundef 8, i32 noundef 9, ptr dead_on_return noundef %[[a]], ptr dead_on_return noundef %[[b]]) return f43_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, g43, g43_2); } diff --git a/clang/test/CodeGen/arm64-microsoft-arguments.cpp b/clang/test/CodeGen/arm64-microsoft-arguments.cpp index 85472645acb3b..a0a81be54325f 100644 --- a/clang/test/CodeGen/arm64-microsoft-arguments.cpp +++ b/clang/test/CodeGen/arm64-microsoft-arguments.cpp @@ -29,7 +29,7 @@ S2 f2() { // Pass and return for type size > 16 bytes. 
// CHECK: define {{.*}} void @{{.*}}f3{{.*}}(ptr dead_on_unwind noalias writable sret(%struct.S3) align 4 %agg.result) -// CHECK: call void {{.*}}func3{{.*}}(ptr dead_on_unwind writable sret(%struct.S3) align 4 %agg.result, ptr noundef %agg.tmp) +// CHECK: call void {{.*}}func3{{.*}}(ptr dead_on_unwind writable sret(%struct.S3) align 4 %agg.result, ptr dead_on_return noundef %agg.tmp) struct S3 { int a[5]; }; diff --git a/clang/test/CodeGen/armv7k-abi.c b/clang/test/CodeGen/armv7k-abi.c index 872e6423a4a99..6a781bc04d042 100644 --- a/clang/test/CodeGen/armv7k-abi.c +++ b/clang/test/CodeGen/armv7k-abi.c @@ -39,7 +39,7 @@ typedef struct { double z; } BigStruct; -// CHECK: define{{.*}} void @big_struct_indirect(ptr noundef %b) +// CHECK: define{{.*}} void @big_struct_indirect(ptr dead_on_return noundef %b) void big_struct_indirect(BigStruct b) {} // CHECK: define{{.*}} void @return_big_struct_indirect(ptr dead_on_unwind noalias writable sret diff --git a/clang/test/CodeGen/atomic-arm64.c b/clang/test/CodeGen/atomic-arm64.c index d2a30a3b6e66f..d539cad0c6a7d 100644 --- a/clang/test/CodeGen/atomic-arm64.c +++ b/clang/test/CodeGen/atomic-arm64.c @@ -57,7 +57,7 @@ void test3(pointer_pair_t pair) { } // CHECK-LABEL:define{{.*}} void @test4( -// CHECK-SAME: ptr noundef [[QUAD:%.*]]) +// CHECK-SAME: ptr dead_on_return noundef [[QUAD:%.*]]) // CHECK: [[QUAD_INDIRECT_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[TEMP:%.*]] = alloca [[QUAD_T:%.*]], align 8 // CHECK-NEXT: store ptr [[QUAD]], ptr [[QUAD_INDIRECT_ADDR]] diff --git a/clang/test/CodeGen/attr-noundef.cpp b/clang/test/CodeGen/attr-noundef.cpp index abdf9496bd396..619dbec7678db 100644 --- a/clang/test/CodeGen/attr-noundef.cpp +++ b/clang/test/CodeGen/attr-noundef.cpp @@ -27,7 +27,7 @@ struct NoCopy { NoCopy ret_nocopy() { return {}; } void pass_nocopy(NoCopy e) {} // CHECK: [[DEF]] void @{{.*}}ret_nocopy{{.*}}(ptr dead_on_unwind noalias writable sret({{[^)]+}}) align 4 % -// CHECK: [[DEF]] void 
@{{.*}}pass_nocopy{{.*}}(ptr noundef % +// CHECK: [[DEF]] void @{{.*}}pass_nocopy{{.*}}(ptr dead_on_return noundef % struct Huge { int a[1024]; @@ -35,7 +35,8 @@ struct Huge { Huge ret_huge() { return {}; } void pass_huge(Huge h) {} // CHECK: [[DEF]] void @{{.*}}ret_huge{{.*}}(ptr dead_on_unwind noalias writable sret({{[^)]+}}) align 4 % -// CHECK: [[DEF]] void @{{.*}}pass_huge{{.*}}(ptr noundef +// CHECK-INTEL: [[DEF]] void @{{.*}}pass_huge{{.*}}(ptr noundef +// CHECK-AARCH: [[DEF]] void @{{.*}}pass_huge{{.*}}(ptr dead_on_return noundef } // namespace check_structs //************ Passing unions by value @@ -59,7 +60,7 @@ union NoCopy { NoCopy ret_nocopy() { return {}; } void pass_nocopy(NoCopy e) {} // CHECK: [[DEF]] void @{{.*}}ret_nocopy{{.*}}(ptr dead_on_unwind noalias writable sret({{[^)]+}}) align 4 % -// CHECK: [[DEF]] void @{{.*}}pass_nocopy{{.*}}(ptr noundef % +// CHECK: [[DEF]] void @{{.*}}pass_nocopy{{.*}}(ptr dead_on_return noundef % } // namespace check_unions //************ Passing `this` pointers diff --git a/clang/test/CodeGen/cx-complex-range.c b/clang/test/CodeGen/cx-complex-range.c index cf74ab2dcca3f..fba692cac4492 100644 --- a/clang/test/CodeGen/cx-complex-range.c +++ b/clang/test/CodeGen/cx-complex-range.c @@ -1520,7 +1520,7 @@ void mulassignf(_Complex float *a, _Complex float b) { // PRMTD-NEXT: ret { double, double } [[DOTFCA_1_INSERT]] // // X86WINPRMTD-LABEL: define dso_local void @divd( -// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[A_REAL:%.*]] = load double, ptr 
[[A_REALP]], align 8 @@ -1744,7 +1744,7 @@ void mulassignf(_Complex float *a, _Complex float b) { // PRMTD_FAST-NEXT: ret { double, double } [[DOTFCA_1_INSERT]] // // X86WINPRMTD_STRICT-LABEL: define dso_local void @divd( -// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -1938,7 +1938,7 @@ _Complex double divd(_Complex double a, _Complex double b) { // PRMTD-NEXT: ret void // // X86WINPRMTD-LABEL: define dso_local void @divassignd( -// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -2180,7 +2180,7 @@ _Complex double divd(_Complex double a, _Complex double b) { // PRMTD_FAST-NEXT: ret void // // X86WINPRMTD_STRICT-LABEL: define dso_local void @divassignd( -// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], 
align 8 @@ -2325,7 +2325,7 @@ void divassignd(_Complex double *a, _Complex double b) { // PRMTD-NEXT: ret { double, double } [[DOTFCA_1_INSERT]] // // X86WINPRMTD-LABEL: define dso_local void @muld( -// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -2457,7 +2457,7 @@ void divassignd(_Complex double *a, _Complex double b) { // PRMTD_FAST-NEXT: ret { double, double } [[DOTFCA_1_INSERT]] // // X86WINPRMTD_STRICT-LABEL: define dso_local void @muld( -// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -2594,7 +2594,7 @@ _Complex double muld(_Complex double a, _Complex double b) { // PRMTD-NEXT: ret void // // X86WINPRMTD-LABEL: define dso_local void @mulassignd( -// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: 
[[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -2744,7 +2744,7 @@ _Complex double muld(_Complex double a, _Complex double b) { // PRMTD_FAST-NEXT: ret void // // X86WINPRMTD_STRICT-LABEL: define dso_local void @mulassignd( -// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -2922,7 +2922,7 @@ void mulassignd(_Complex double *a, _Complex double b) { // PRMTD-NEXT: ret { x86_fp80, x86_fp80 } [[DOTFCA_1_INSERT]] // // X86WINPRMTD-LABEL: define dso_local void @divld( -// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -3190,7 +3190,7 @@ void mulassignd(_Complex double *a, _Complex double b) { // PRMTD_FAST-NEXT: ret { x86_fp80, x86_fp80 } [[DOTFCA_1_INSERT]] // // X86WINPRMTD_STRICT-LABEL: define dso_local void @divld( -// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable 
sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -3432,7 +3432,7 @@ _Complex long double divld(_Complex long double a, _Complex long double b) { // PRMTD-NEXT: ret void // // X86WINPRMTD-LABEL: define dso_local void @divassignld( -// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -3702,7 +3702,7 @@ _Complex long double divld(_Complex long double a, _Complex long double b) { // PRMTD_FAST-NEXT: ret void // // X86WINPRMTD_STRICT-LABEL: define dso_local void @divassignld( -// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -3895,7 +3895,7 @@ void divassignld(_Complex long double *a, _Complex long double b) { // PRMTD-NEXT: ret { x86_fp80, x86_fp80 } [[DOTFCA_1_INSERT]] // // X86WINPRMTD-LABEL: define dso_local void @mulld( -// X86WINPRMTD-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr dead_on_unwind noalias 
writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -4059,7 +4059,7 @@ void divassignld(_Complex long double *a, _Complex long double b) { // PRMTD_FAST-NEXT: ret { x86_fp80, x86_fp80 } [[DOTFCA_1_INSERT]] // // X86WINPRMTD_STRICT-LABEL: define dso_local void @mulld( -// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr dead_on_unwind noalias writable sret({ double, double }) align 8 [[AGG_RESULT:%.*]], ptr dead_on_return noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[A_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[A_REAL:%.*]] = load double, ptr [[A_REALP]], align 8 @@ -4220,7 +4220,7 @@ _Complex long double mulld(_Complex long double a, _Complex long double b) { // PRMTD-NEXT: ret void // // X86WINPRMTD-LABEL: define dso_local void @mulassignld( -// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -4386,7 +4386,7 @@ _Complex long double mulld(_Complex long double a, _Complex long double b) { // PRMTD_FAST-NEXT: ret void // // X86WINPRMTD_STRICT-LABEL: define dso_local void @mulassignld( -// 
X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: ptr noundef [[A:%.*]], ptr dead_on_return noundef [[B:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[B_REALP:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i32 0, i32 0 // X86WINPRMTD_STRICT-NEXT: [[B_REAL:%.*]] = load double, ptr [[B_REALP]], align 8 @@ -4644,7 +4644,7 @@ void mulassignld(_Complex long double *a, _Complex long double b) { // PRMTD-NEXT: ret <2 x float> [[RETVAL_SROA_0_4_VEC_INSERT]] // // X86WINPRMTD-LABEL: define dso_local i64 @f1( -// X86WINPRMTD-SAME: i64 noundef [[A_COERCE:%.*]], ptr noundef [[B:%.*]], i64 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { +// X86WINPRMTD-SAME: i64 noundef [[A_COERCE:%.*]], ptr dead_on_return noundef [[B:%.*]], i64 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { // X86WINPRMTD-NEXT: entry: // X86WINPRMTD-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE]] to i32 // X86WINPRMTD-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_SROA_0_0_EXTRACT_TRUNC]] to float @@ -5052,7 +5052,7 @@ void mulassignld(_Complex long double *a, _Complex long double b) { // PRMTD_FAST-NEXT: ret <2 x float> [[RETVAL_SROA_0_4_VEC_INSERT]] // // X86WINPRMTD_STRICT-LABEL: define dso_local i64 @f1( -// X86WINPRMTD_STRICT-SAME: i64 noundef [[A_COERCE:%.*]], ptr noundef [[B:%.*]], i64 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { +// X86WINPRMTD_STRICT-SAME: i64 noundef [[A_COERCE:%.*]], ptr dead_on_return noundef [[B:%.*]], i64 noundef [[C_COERCE:%.*]]) #[[ATTR0]] { // X86WINPRMTD_STRICT-NEXT: entry: // X86WINPRMTD_STRICT-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE]] to i32 // X86WINPRMTD_STRICT-NEXT: [[TMP0:%.*]] = bitcast i32 [[A_SROA_0_0_EXTRACT_TRUNC]] to float diff --git a/clang/test/CodeGen/ext-int-cc.c b/clang/test/CodeGen/ext-int-cc.c index fdca4012ee4a4..7cfd992fd48b4 100644 --- a/clang/test/CodeGen/ext-int-cc.c +++ b/clang/test/CodeGen/ext-int-cc.c @@ -33,9 +33,9 @@ // Make 
sure 128 and 64 bit versions are passed like integers. void ParamPassing(_BitInt(128) b, _BitInt(64) c) {} // LIN64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) -// WIN64: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) -// LIN32: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) -// WIN32: define dso_local void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) +// WIN64: define dso_local void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) +// LIN32: define{{.*}} void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) +// WIN32: define dso_local void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) // NVPTX64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // NVPTX: define{{.*}} void @ParamPassing(ptr byval(i128) align 8 %{{.+}}, i64 %{{.+}}) // SPARCV9: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) @@ -50,22 +50,22 @@ void ParamPassing(_BitInt(128) b, _BitInt(64) c) {} // ARC: define{{.*}} void @ParamPassing(ptr byval(i128) align 4 %{{.+}}, i64 inreg %{{.+}}) // XCORE: define{{.*}} void @ParamPassing(ptr byval(i128) align 4 %{{.+}}, i64 %{{.+}}) // RISCV64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) -// RISCV32: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) +// RISCV32: define{{.*}} void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) // WASM: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) -// SYSTEMZ: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) +// SYSTEMZ: define{{.*}} void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) // PPC64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // PPC32: define{{.*}} void @ParamPassing(ptr byval(i128) align 8 %{{.+}}, i64 %{{.+}}) // AARCH64: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // AARCH64DARWIN: define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) // ARM: define{{.*}} arm_aapcscc void @ParamPassing(ptr byval(i128) align 8 %{{.+}}, i64 %{{.+}}) // LA64: 
define{{.*}} void @ParamPassing(i128 %{{.+}}, i64 %{{.+}}) -// LA32: define{{.*}} void @ParamPassing(ptr %{{.+}}, i64 %{{.+}}) +// LA32: define{{.*}} void @ParamPassing(ptr dead_on_return %{{.+}}, i64 %{{.+}}) void ParamPassing2(_BitInt(127) b, _BitInt(63) c) {} // LIN64: define{{.*}} void @ParamPassing2(i64 %{{.+}}, i64 %{{.+}}, i64 %{{.+}}) -// WIN64: define dso_local void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) -// LIN32: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) -// WIN32: define dso_local void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) +// WIN64: define dso_local void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) +// LIN32: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) +// WIN32: define dso_local void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) // NVPTX64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) // NVPTX: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // SPARCV9: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) @@ -80,16 +80,16 @@ void ParamPassing2(_BitInt(127) b, _BitInt(63) c) {} // ARC: define{{.*}} void @ParamPassing2(ptr byval(i128) align 4 %{{.+}}, i63 inreg %{{.+}}) // XCORE: define{{.*}} void @ParamPassing2(ptr byval(i128) align 4 %{{.+}}, i63 %{{.+}}) // RISCV64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) -// RISCV32: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) +// RISCV32: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) // WASM: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) -// SYSTEMZ: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 signext %{{.+}}) +// SYSTEMZ: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 signext %{{.+}}) // PPC64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) // PPC32: define{{.*}} void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // AARCH64: define{{.*}} 
void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) // AARCH64DARWIN: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 %{{.+}}) // ARM: define{{.*}} arm_aapcscc void @ParamPassing2(ptr byval(i128) align 8 %{{.+}}, i63 %{{.+}}) // LA64: define{{.*}} void @ParamPassing2(i127 %{{.+}}, i63 signext %{{.+}}) -// LA32: define{{.*}} void @ParamPassing2(ptr %{{.+}}, i63 %{{.+}}) +// LA32: define{{.*}} void @ParamPassing2(ptr dead_on_return %{{.+}}, i63 %{{.+}}) // Make sure we follow the signext rules for promotable integer types. void ParamPassing3(_BitInt(15) a, _BitInt(31) b) {} @@ -129,10 +129,10 @@ void ParamPassing3(_BitInt(15) a, _BitInt(31) b) {} // _BitInt widths to alert us to enable the test. void ParamPassing4(_BitInt(129) a) {} // LIN64: define{{.*}} void @ParamPassing4(ptr byval([24 x i8]) align 8 %{{.+}}) -// WIN64: define dso_local void @ParamPassing4(ptr %{{.+}}) -// LIN32: define{{.*}} void @ParamPassing4(ptr %{{.+}}) -// WIN32: define dso_local void @ParamPassing4(ptr %{{.+}}) -// AARCH64: define{{.*}} void @ParamPassing4(ptr %{{.+}}) +// WIN64: define dso_local void @ParamPassing4(ptr dead_on_return %{{.+}}) +// LIN32: define{{.*}} void @ParamPassing4(ptr dead_on_return %{{.+}}) +// WIN32: define dso_local void @ParamPassing4(ptr dead_on_return %{{.+}}) +// AARCH64: define{{.*}} void @ParamPassing4(ptr dead_on_return %{{.+}}) // NVPTX64-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // NVPTX-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // SPARCV9-NOT: define{{.*}} void @ParamPassing4(ptr %{{.+}}) @@ -154,8 +154,8 @@ void ParamPassing4(_BitInt(129) a) {} // PPC32-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // AARCH64DARWIN-NOT: define{{.*}} void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) // ARM-NOT: define{{.*}} arm_aapcscc void @ParamPassing4(ptr byval(i129) align 8 %{{.+}}) -// LA64: define{{.*}} void @ParamPassing4(ptr %{{.+}}) -// LA32: define{{.*}} void @ParamPassing4(ptr 
%{{.+}}) +// LA64: define{{.*}} void @ParamPassing4(ptr dead_on_return %{{.+}}) +// LA32: define{{.*}} void @ParamPassing4(ptr dead_on_return %{{.+}}) #endif _BitInt(63) ReturnPassing(void) { return 0; } diff --git a/clang/test/CodeGen/isfpclass.c b/clang/test/CodeGen/isfpclass.c index 26dd846a2bf20..ee3a22b40fefd 100644 --- a/clang/test/CodeGen/isfpclass.c +++ b/clang/test/CodeGen/isfpclass.c @@ -160,7 +160,7 @@ int4 check_isfpclass_nan_strict_v4f32(float4 x) { } // CHECK-LABEL: define dso_local void @check_isfpclass_nan_v4f64 -// CHECK-SAME: (ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { +// CHECK-SAME: (ptr dead_on_unwind noalias writable writeonly sret(<4 x i64>) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[X:%.*]] = load <4 x double>, ptr [[TMP0]], align 16, !tbaa [[TBAA2:![0-9]+]] // CHECK-NEXT: [[TMP1:%.*]] = fcmp uno <4 x double> [[X]], zeroinitializer diff --git a/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c b/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c index eb706154300a2..0124cc5c06d43 100644 --- a/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c +++ b/clang/test/CodeGen/math-libcalls-tbaa-indirect-args.c @@ -30,37 +30,37 @@ long double powl(long double a, long double b); // // CHECK-PPC-LABEL: define dso_local ppc_fp128 @test_powl( // CHECK-PPC-SAME: ppc_fp128 noundef [[A:%.*]], ppc_fp128 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-PPC: [[CALL:%.*]] = tail call ppc_fp128 @powl(ppc_fp128 noundef [[A]], ppc_fp128 noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// CHECK-PPC: [[CALL:%.*]] = tail call ppc_fp128 @powl(ppc_fp128 noundef [[A]], ppc_fp128 noundef [[B]]) 
#[[ATTR4:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] // // CHECK-ARM-LABEL: define dso_local double @test_powl( // CHECK-ARM-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-ARM: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR2:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] +// CHECK-ARM: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] // // CHECK-ARM-HF-LABEL: define dso_local double @test_powl( // CHECK-ARM-HF-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR2:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] +// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] // // CHECK-THUMB-LABEL: define double @test_powl( // CHECK-THUMB-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-THUMB: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR2:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] +// CHECK-THUMB: [[CALL:%.*]] = tail call double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA3:![0-9]+]] // // CHECK-AARCH-LABEL: define dso_local fp128 @test_powl( // CHECK-AARCH-SAME: fp128 noundef [[A:%.*]], fp128 noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @powl(fp128 noundef [[A]], fp128 noundef [[B]]) #[[ATTR2:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @powl(fp128 noundef [[A]], fp128 noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] // // CHECK-SPIR-LABEL: define dso_local spir_func double @test_powl( // CHECK-SPIR-SAME: double noundef [[A:%.*]], double noundef [[B:%.*]]) 
local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR3:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] +// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func double @powl(double noundef [[A]], double noundef [[B]]) #[[ATTR4:[0-9]+]], !tbaa [[TBAA2:![0-9]+]] // // CHECK-MINGW32-LABEL: define dso_local void @test_powl( -// CHECK-MINGW32-SAME: ptr dead_on_unwind noalias writable writeonly sret(x86_fp80) align 16 captures(none) initializes((0, 10)) [[AGG_RESULT:%.*]], ptr noundef readonly captures(none) [[TMP0:%.*]], ptr noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[TBAA3:![0-9]+]] -// CHECK-MINGW32: [[B:%.*]] = load x86_fp80, ptr [[TMP1]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: store x86_fp80 [[B]], ptr [[BYVAL_TEMP1:%.*]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: call void @powl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr noundef nonnull [[BYVAL_TEMP]], ptr noundef nonnull [[BYVAL_TEMP1]]) #[[ATTR3:[0-9]+]] -// CHECK-MINGW32: [[TMP2:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: store x86_fp80 [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[TBAA3]] +// CHECK-MINGW32-SAME: ptr dead_on_unwind noalias writable writeonly sret(x86_fp80) align 16 captures(none) initializes((0, 10)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]], ptr dead_on_return noundef readonly captures(none) [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[TBAA6:![0-9]+]] +// CHECK-MINGW32: [[B:%.*]] = load x86_fp80, ptr [[TMP1]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], 
align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[B]], ptr [[BYVAL_TEMP1:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: call void @powl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP1]]) #[[ATTR3:[0-9]+]] +// CHECK-MINGW32: [[TMP2:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[TMP2]], ptr [[AGG_RESULT]], align 16, !tbaa [[TBAA6]] // long double test_powl(long double a, long double b) { return powl(a, b); @@ -93,51 +93,51 @@ long double test_powl(long double a, long double b) { // CHECK-I686: store x86_fp80 [[MUL_IR:%.*]], ptr [[AGG_RESULT_IMAGP:%.*]], align 4 // // CHECK-PPC-LABEL: define dso_local void @test_cargl( -// CHECK-PPC-SAME: ptr dead_on_unwind noalias writable writeonly sret({ ppc_fp128, ppc_fp128 }) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef readonly byval({ ppc_fp128, ppc_fp128 }) align 16 captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-PPC-SAME: ptr dead_on_unwind noalias writable writeonly sret({ ppc_fp128, ppc_fp128 }) align 16 captures(none) initializes((0, 32)) [[AGG_RESULT:%.*]], ptr noundef readonly byval({ ppc_fp128, ppc_fp128 }) align 16 captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-PPC: [[CLD_REAL:%.*]] = load ppc_fp128, ptr [[CLD]], align 16 // CHECK-PPC: [[CLD_IMAG:%.*]] = load ppc_fp128, ptr [[CLD_IMAGP:%.*]], align 16 // CHECK-PPC: store ppc_fp128 [[CLD_REAL]], ptr [[BYVAL_TEMP:%.*]], align 16 // CHECK-PPC: store ppc_fp128 [[CLD_IMAG]], ptr [[BYVAL_TEMP_IMAGP:%.*]], align 16 -// CHECK-PPC: [[CALL:%.*]] = tail call ppc_fp128 @cargl(ptr noundef nonnull byval({ ppc_fp128, ppc_fp128 }) align 16 [[BYVAL_TEMP]]) #[[ATTR3]] +// CHECK-PPC: [[CALL:%.*]] = tail call ppc_fp128 @cargl(ptr noundef nonnull byval({ ppc_fp128, ppc_fp128 }) align 16 [[BYVAL_TEMP]]) 
#[[ATTR4]] // CHECK-PPC: store ppc_fp128 [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 16 // CHECK-PPC: store ppc_fp128 [[MUL_IR:%.*]], ptr [[AGG_RESULT_IMAGP:%.*]], align 16 // // CHECK-ARM-LABEL: define dso_local void @test_cargl( -// CHECK-ARM-SAME: ptr dead_on_unwind noalias writable writeonly sret({ double, double }) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], [2 x i64] noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// CHECK-ARM: [[CALL:%.*]] = tail call double @cargl([2 x i64] noundef [[CLD_COERCE]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-ARM-SAME: ptr dead_on_unwind noalias writable writeonly sret({ double, double }) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], [2 x i64] noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-ARM: [[CALL:%.*]] = tail call double @cargl([2 x i64] noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA3]] // CHECK-ARM: store double [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 8 // CHECK-ARM: store double [[MUL_IR:%.*]], ptr [[AGG_RESULT_IMAGP:%.*]], align 8 // // CHECK-ARM-HF-LABEL: define dso_local { double, double } @test_cargl( -// CHECK-ARM-HF-SAME: { double, double } noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @cargl({ double, double } noundef [[CLD_COERCE]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-ARM-HF-SAME: { double, double } noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-ARM-HF: [[CALL:%.*]] = tail call double @cargl({ double, double } noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA3]] // // CHECK-THUMB-LABEL: define { double, double } @test_cargl( -// CHECK-THUMB-SAME: [2 x double] noundef [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// CHECK-THUMB: [[CALL:%.*]] = tail call double @cargl([2 x double] noundef [[CLD_COERCE]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-THUMB-SAME: [2 x double] noundef [[CLD_COERCE:%.*]]) local_unnamed_addr 
#[[ATTR2:[0-9]+]] { +// CHECK-THUMB: [[CALL:%.*]] = tail call double @cargl([2 x double] noundef [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA3]] // // CHECK-AARCH-LABEL: define dso_local { fp128, fp128 } @test_cargl( -// CHECK-AARCH-SAME: [2 x fp128] noundef alignstack(16) [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { -// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @cargl([2 x fp128] noundef alignstack(16) [[CLD_COERCE]]) #[[ATTR2]], !tbaa [[TBAA2]] +// CHECK-AARCH-SAME: [2 x fp128] noundef alignstack(16) [[CLD_COERCE:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +// CHECK-AARCH: [[CALL:%.*]] = tail call fp128 @cargl([2 x fp128] noundef alignstack(16) [[CLD_COERCE]]) #[[ATTR3]], !tbaa [[TBAA2]] // // CHECK-SPIR-LABEL: define dso_local spir_func void @test_cargl( -// CHECK-SPIR-SAME: ptr dead_on_unwind noalias writable writeonly sret({ double, double }) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr noundef readonly byval({ double, double }) align 8 captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-SPIR-SAME: ptr dead_on_unwind noalias writable writeonly sret({ double, double }) align 8 captures(none) initializes((0, 16)) [[AGG_RESULT:%.*]], ptr noundef readonly byval({ double, double }) align 8 captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // CHECK-SPIR: [[CLD_REAL:%.*]] = load double, ptr [[CLD]], align 8 // CHECK-SPIR: [[CLD_IMAG:%.*]] = load double, ptr [[CLD_IMAGP:%.*]], align 8 // CHECK-SPIR: store double [[CLD_REAL]], ptr [[BYVAL_TEMP:%.*]], align 8 // CHECK-SPIR: store double [[CLD_IMAG]], ptr [[BYVAL_TEMP_IMAGP:%.*]], align 8 -// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func double @cargl(ptr noundef nonnull byval({ double, double }) align 8 [[BYVAL_TEMP]]) #[[ATTR3]] +// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func double @cargl(ptr noundef nonnull byval({ double, double }) align 8 [[BYVAL_TEMP]]) #[[ATTR4]] // CHECK-SPIR: store double [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 
8 // CHECK-SPIR: store double [[MUL_IR:%.*]], ptr [[AGG_RESULT_IMAGP:%.*]], align 8 // // CHECK-MINGW32-LABEL: define dso_local void @test_cargl( -// CHECK-MINGW32-SAME: ptr dead_on_unwind noalias writable writeonly sret({ x86_fp80, x86_fp80 }) align 16 captures(none) initializes((0, 10), (16, 26)) [[AGG_RESULT:%.*]], ptr noundef readonly captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-MINGW32-SAME: ptr dead_on_unwind noalias writable writeonly sret({ x86_fp80, x86_fp80 }) align 16 captures(none) initializes((0, 10), (16, 26)) [[AGG_RESULT:%.*]], ptr dead_on_return noundef readonly captures(none) [[CLD:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-MINGW32: [[CLD_REAL:%.*]] = load x86_fp80, ptr [[CLD]], align 16 // CHECK-MINGW32: [[CLD_IMAG:%.*]] = load x86_fp80, ptr [[CLD_IMAGP:%.*]], align 16 // CHECK-MINGW32: store x86_fp80 [[CLD_REAL]], ptr [[BYVAL_TEMP:%.*]], align 16 // CHECK-MINGW32: store x86_fp80 [[CLD_IMAG]], ptr [[BYVAL_TEMP_IMAGP:%.*]], align 16 -// CHECK-MINGW32: call void @cargl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] -// CHECK-MINGW32: [[TMP0:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[TBAA3]] +// CHECK-MINGW32: call void @cargl(ptr dead_on_unwind nonnull writable sret(x86_fp80) align 16 [[TMP:%.*]], ptr dead_on_return noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] +// CHECK-MINGW32: [[TMP0:%.*]] = load x86_fp80, ptr [[TMP]], align 16, !tbaa [[TBAA6]] // CHECK-MINGW32: [[CLD_REAL3:%.*]] = load x86_fp80, ptr [[CLD]], align 16 // CHECK-MINGW32: [[CLD_IMAG5:%.*]] = load x86_fp80, ptr [[CLD_IMAGP]], align 16 // CHECK-MINGW32: store x86_fp80 [[MUL_RL:%.*]], ptr [[AGG_RESULT]], align 16 @@ -166,33 +166,33 @@ int ilogbl(long double a); // // CHECK-PPC-LABEL: define dso_local i32 @test_ilogb( // CHECK-PPC-SAME: ppc_fp128 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-PPC: [[CALL:%.*]] = tail call i32 @ilogbl(ppc_fp128 noundef [[A]]) 
#[[ATTR3]], !tbaa [[TBAA2]] +// CHECK-PPC: [[CALL:%.*]] = tail call i32 @ilogbl(ppc_fp128 noundef [[A]]) #[[ATTR4]], !tbaa [[TBAA2]] // // CHECK-ARM-LABEL: define dso_local i32 @test_ilogb( // CHECK-ARM-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-ARM: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-ARM: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA3]] // // CHECK-ARM-HF-LABEL: define dso_local i32 @test_ilogb( // CHECK-ARM-HF-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-ARM-HF: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-ARM-HF: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA3]] // // CHECK-THUMB-LABEL: define i32 @test_ilogb( // CHECK-THUMB-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-THUMB: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA3]] +// CHECK-THUMB: [[CALL:%.*]] = tail call i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA3]] // // CHECK-AARCH-LABEL: define dso_local i32 @test_ilogb( // CHECK-AARCH-SAME: fp128 noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-AARCH: [[CALL:%.*]] = tail call i32 @ilogbl(fp128 noundef [[A]]) #[[ATTR2]], !tbaa [[TBAA2]] +// CHECK-AARCH: [[CALL:%.*]] = tail call i32 @ilogbl(fp128 noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA2]] // // CHECK-SPIR-LABEL: define dso_local spir_func i32 @test_ilogb( // CHECK-SPIR-SAME: double noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func i32 @ilogbl(double noundef [[A]]) #[[ATTR3]], !tbaa [[TBAA2]] +// CHECK-SPIR: [[CALL:%.*]] = tail call spir_func i32 @ilogbl(double noundef [[A]]) #[[ATTR4]], !tbaa [[TBAA2]] // // CHECK-MINGW32-LABEL: define dso_local i32 @test_ilogb( -// CHECK-MINGW32-SAME: ptr noundef readonly captures(none) 
[[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { -// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[TBAA3]] -// CHECK-MINGW32: [[CALL:%.*]] = call i32 @ilogbl(ptr noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] +// CHECK-MINGW32-SAME: ptr dead_on_return noundef readonly captures(none) [[TMP0:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-MINGW32: [[A:%.*]] = load x86_fp80, ptr [[TMP0]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: store x86_fp80 [[A]], ptr [[BYVAL_TEMP:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-MINGW32: [[CALL:%.*]] = call i32 @ilogbl(ptr dead_on_return noundef nonnull [[BYVAL_TEMP]]) #[[ATTR3]] // int test_ilogb(long double a) { return ilogbl(a); @@ -243,8 +243,8 @@ int test_ilogb(long double a) { // CHECK-SPIR: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} // CHECK-SPIR: [[META5]] = !{!"Simple C/C++ TBAA"} //. -// CHECK-MINGW32: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0} -// CHECK-MINGW32: [[META4]] = !{!"long double", [[META5:![0-9]+]], i64 0} -// CHECK-MINGW32: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0} -// CHECK-MINGW32: [[META6]] = !{!"Simple C/C++ TBAA"} +// CHECK-MINGW32: [[TBAA6]] = !{[[META7:![0-9]+]], [[META7]], i64 0} +// CHECK-MINGW32: [[META7]] = !{!"long double", [[META8:![0-9]+]], i64 0} +// CHECK-MINGW32: [[META8]] = !{!"omnipotent char", [[META9:![0-9]+]], i64 0} +// CHECK-MINGW32: [[META9]] = !{!"Simple C/C++ TBAA"} //. 
diff --git a/clang/test/CodeGen/mingw-long-double.c b/clang/test/CodeGen/mingw-long-double.c index 0fc8f01509682..b98929701bc62 100644 --- a/clang/test/CodeGen/mingw-long-double.c +++ b/clang/test/CodeGen/mingw-long-double.c @@ -29,15 +29,15 @@ long double TestLD(long double x) { return x * x; } // GNU32: define dso_local x86_fp80 @TestLD(x86_fp80 noundef %x) -// GNU64: define dso_local void @TestLD(ptr dead_on_unwind noalias writable sret(x86_fp80) align 16 %agg.result, ptr noundef %0) +// GNU64: define dso_local void @TestLD(ptr dead_on_unwind noalias writable sret(x86_fp80) align 16 %agg.result, ptr dead_on_return noundef %0) // MSC64: define dso_local double @TestLD(double noundef %x) long double _Complex TestLDC(long double _Complex x) { return x * x; } // GNU32: define dso_local void @TestLDC(ptr dead_on_unwind noalias writable sret({ x86_fp80, x86_fp80 }) align 4 %agg.result, ptr noundef byval({ x86_fp80, x86_fp80 }) align 4 %x) -// GNU64: define dso_local void @TestLDC(ptr dead_on_unwind noalias writable sret({ x86_fp80, x86_fp80 }) align 16 %agg.result, ptr noundef %x) -// MSC64: define dso_local void @TestLDC(ptr dead_on_unwind noalias writable sret({ double, double }) align 8 %agg.result, ptr noundef %x) +// GNU64: define dso_local void @TestLDC(ptr dead_on_unwind noalias writable sret({ x86_fp80, x86_fp80 }) align 16 %agg.result, ptr dead_on_return noundef %x) +// MSC64: define dso_local void @TestLDC(ptr dead_on_unwind noalias writable sret({ double, double }) align 8 %agg.result, ptr dead_on_return noundef %x) // GNU32: declare dso_local void @__mulxc3 // GNU64: declare dso_local void @__mulxc3 diff --git a/clang/test/CodeGen/ms_abi.c b/clang/test/CodeGen/ms_abi.c index 528e546f315d5..5d58c9816da78 100644 --- a/clang/test/CodeGen/ms_abi.c +++ b/clang/test/CodeGen/ms_abi.c @@ -142,7 +142,7 @@ struct i128 { }; __attribute__((ms_abi)) struct i128 f7(struct i128 a) { - // WIN64: define dso_local void @f7(ptr dead_on_unwind noalias writable 
sret(%struct.i128) align 8 %agg.result, ptr noundef %a) - // FREEBSD: define{{.*}} win64cc void @f7(ptr dead_on_unwind noalias writable sret(%struct.i128) align 8 %agg.result, ptr noundef %a) + // WIN64: define dso_local void @f7(ptr dead_on_unwind noalias writable sret(%struct.i128) align 8 %agg.result, ptr dead_on_return noundef %a) + // FREEBSD: define{{.*}} win64cc void @f7(ptr dead_on_unwind noalias writable sret(%struct.i128) align 8 %agg.result, ptr dead_on_return noundef %a) return a; } diff --git a/clang/test/CodeGen/pass-by-value-noalias.c b/clang/test/CodeGen/pass-by-value-noalias.c index bc35d13c4df6a..e673ceb80bebe 100644 --- a/clang/test/CodeGen/pass-by-value-noalias.c +++ b/clang/test/CodeGen/pass-by-value-noalias.c @@ -11,6 +11,6 @@ struct Foo { int f; }; -// WITH_NOALIAS: define{{.*}} void @take(ptr noalias noundef %arg) -// NO_NOALIAS: define{{.*}} void @take(ptr noundef %arg) +// WITH_NOALIAS: define{{.*}} void @take(ptr dead_on_return noalias noundef %arg) +// NO_NOALIAS: define{{.*}} void @take(ptr dead_on_return noundef %arg) void take(struct Foo arg) {} diff --git a/clang/test/CodeGen/ptrauth-in-c-struct.c b/clang/test/CodeGen/ptrauth-in-c-struct.c index 2aec31ec3baf9..c74be17b4c837 100644 --- a/clang/test/CodeGen/ptrauth-in-c-struct.c +++ b/clang/test/CodeGen/ptrauth-in-c-struct.c @@ -115,7 +115,7 @@ void test_move_assignment_SA(SA *p) { *p = getSA(); } -// CHECK: define void @test_parameter_SA(ptr noundef %{{.*}}) +// CHECK: define void @test_parameter_SA(ptr dead_on_return noundef %{{.*}}) // CHECK-NOT: call // CHECK: ret void @@ -128,7 +128,7 @@ void test_parameter_SA(SA a) { // CHECK: store ptr %[[A]], ptr %[[A_ADDR]], align 8 // CHECK: %[[V0:.*]] = load ptr, ptr %[[A_ADDR]], align 8 // CHECK: call void @__copy_constructor_8_8_t0w4_pa1_50_8(ptr %[[AGG_TMP]], ptr %[[V0]]) -// CHECK: call void @calleeSA(ptr noundef %[[AGG_TMP]]) +// CHECK: call void @calleeSA(ptr dead_on_return noundef %[[AGG_TMP]]) // CHECK-NOT: call // CHECK: ret void 
diff --git a/clang/test/CodeGen/regcall.c b/clang/test/CodeGen/regcall.c index f10da87353fa1..d4b9f00d54d41 100644 --- a/clang/test/CodeGen/regcall.c +++ b/clang/test/CodeGen/regcall.c @@ -28,7 +28,7 @@ struct Large { int a[5]; }; void __regcall v4(int a, struct Large b, int c) {} // Win32: define dso_local x86_regcallcc void @__regcall3__v4(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 inreg noundef %c) // Lin32: define dso_local x86_regcallcc void @__regcall3__v4(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 noundef %c) -// Win64: define dso_local x86_regcallcc void @__regcall3__v4(i32 noundef %a, ptr noundef %b, i32 noundef %c) +// Win64: define dso_local x86_regcallcc void @__regcall3__v4(i32 noundef %a, ptr dead_on_return noundef %b, i32 noundef %c) // Lin64: define dso_local x86_regcallcc void @__regcall3__v4(i32 noundef %a, [5 x i32] %b.coerce, i32 noundef %c) void __regcall v5(long long a, int b, int c) {} @@ -47,7 +47,7 @@ void __regcall hfa1(int a, struct HFA4 b, int c) {} // indirectly. Additional vector arguments can consume the rest of the SSE // registers. 
void __regcall hfa2(struct HFA4 a, struct HFA4 b, double c) {} -// X86: define dso_local x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, ptr inreg noundef %0) +// X86: define dso_local x86_regcallcc void @__regcall3__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, ptr dead_on_return inreg noundef %0) // X64: define dso_local x86_regcallcc void @__regcall3__hfa2(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double noundef %c) // Ensure that we pass builtin types directly while counting them against the @@ -61,7 +61,7 @@ void __regcall hfa3(double a, double b, double c, double d, double e, struct HFA // handling to ensure alignment. void __regcall hfa4(struct HFA5 a) {} // X32: define dso_local x86_regcallcc void @__regcall3__hfa4(ptr noundef byval(%struct.HFA5) align 4 %{{.*}}) -// Win64: define dso_local x86_regcallcc void @__regcall3__hfa4(ptr noundef %a) +// Win64: define dso_local x86_regcallcc void @__regcall3__hfa4(ptr dead_on_return noundef %a) // Lin64: define dso_local x86_regcallcc void @__regcall3__hfa4(double %a.coerce0, double %a.coerce1, double %a.coerce2, double %a.coerce3, double %a.coerce4) // Return HFAs of 4 or fewer elements in registers. 
@@ -79,7 +79,7 @@ void __regcall hva1(int a, struct HVA4 b, int c) {} // X64: define dso_local x86_regcallcc void @__regcall3__hva1(i32 noundef %a, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 noundef %c) void __regcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {} -// X86: define dso_local x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr inreg noundef %0) +// X86: define dso_local x86_regcallcc void @__regcall3__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr dead_on_return inreg noundef %0) // X64: define dso_local x86_regcallcc void @__regcall3__hva2(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> noundef %c) void __regcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {} @@ -95,6 +95,6 @@ void __regcall odd_size_hva(struct OddSizeHVA a) {} struct HFA6 { __m128 f[4]; }; struct HFA6 __regcall ret_reg_reused(struct HFA6 a, struct HFA6 b, struct HFA6 c, struct HFA6 d){ struct HFA6 h; return h;} -// X86: define dso_local x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr inreg noundef %c, ptr inreg noundef %d) +// X86: define dso_local x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr dead_on_return inreg noundef %c, ptr dead_on_return inreg noundef %d) // Win64: define dso_local x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused(<4 x 
float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c.0, <4 x float> %c.1, <4 x float> %c.2, <4 x float> %c.3, <4 x float> %d.0, <4 x float> %d.1, <4 x float> %d.2, <4 x float> %d.3) // Lin64: define dso_local x86_regcallcc %struct.HFA6 @__regcall3__ret_reg_reused([4 x <4 x float>] %a.coerce, [4 x <4 x float>] %b.coerce, [4 x <4 x float>] %c.coerce, [4 x <4 x float>] %d.coerce) diff --git a/clang/test/CodeGen/regcall2.c b/clang/test/CodeGen/regcall2.c index c88d4e485b104..42619369677b2 100644 --- a/clang/test/CodeGen/regcall2.c +++ b/clang/test/CodeGen/regcall2.c @@ -20,7 +20,7 @@ double __regcall bar(__sVector a) { // FIXME: Do we need to change for Windows? // Win: define dso_local x86_regcallcc void @__regcall3__foo(ptr dead_on_unwind noalias writable sret(%struct.__sVector) align 64 %agg.result, i32 noundef %a) #0 -// Win: define dso_local x86_regcallcc double @__regcall3__bar(ptr noundef %a) #0 +// Win: define dso_local x86_regcallcc double @__regcall3__bar(ptr dead_on_return noundef %a) #0 // Win: attributes #0 = { noinline nounwind optnone "min-legal-vector-width"="0" "no-builtins" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+avx,+avx2,+avx512f,+avx512vl,+crc32,+cx8,+evex512,+f16c,+fma,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" } // Lin: define dso_local x86_regcallcc %struct.__sVector @__regcall3__foo(i32 noundef %a) #0 diff --git a/clang/test/CodeGen/regcall4.c b/clang/test/CodeGen/regcall4.c index 5fbe77fbc7d76..d5fe5d88a0e8c 100644 --- a/clang/test/CodeGen/regcall4.c +++ b/clang/test/CodeGen/regcall4.c @@ -28,7 +28,7 @@ struct Large { int a[5]; }; void __regcall v4(int a, struct Large b, int c) {} // Win32: define dso_local x86_regcallcc void @__regcall4__v4(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 inreg noundef %c) // Lin32: define dso_local x86_regcallcc void 
@__regcall4__v4(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 noundef %c) -// Win64: define dso_local x86_regcallcc void @__regcall4__v4(i32 noundef %a, ptr noundef %b, i32 noundef %c) +// Win64: define dso_local x86_regcallcc void @__regcall4__v4(i32 noundef %a, ptr dead_on_return noundef %b, i32 noundef %c) // Lin64: define dso_local x86_regcallcc void @__regcall4__v4(i32 noundef %a, [5 x i32] %b.coerce, i32 noundef %c) void __regcall v5(long long a, int b, int c) {} @@ -47,7 +47,7 @@ void __regcall hfa1(int a, struct HFA4 b, int c) {} // indirectly. Additional vector arguments can consume the rest of the SSE // registers. void __regcall hfa2(struct HFA4 a, struct HFA4 b, double c) {} -// X86: define dso_local x86_regcallcc void @__regcall4__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, ptr inreg noundef %0) +// X86: define dso_local x86_regcallcc void @__regcall4__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, ptr dead_on_return inreg noundef %0) // X64: define dso_local x86_regcallcc void @__regcall4__hfa2(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double noundef %c) // Ensure that we pass builtin types directly while counting them against the @@ -61,7 +61,7 @@ void __regcall hfa3(double a, double b, double c, double d, double e, struct HFA // handling to ensure alignment. 
void __regcall hfa4(struct HFA5 a) {} // X32: define dso_local x86_regcallcc void @__regcall4__hfa4(ptr noundef byval(%struct.HFA5) align 4 %{{.*}}) -// Win64: define dso_local x86_regcallcc void @__regcall4__hfa4(ptr noundef %a) +// Win64: define dso_local x86_regcallcc void @__regcall4__hfa4(ptr dead_on_return noundef %a) // Lin64: define dso_local x86_regcallcc void @__regcall4__hfa4(double %a.coerce0, double %a.coerce1, double %a.coerce2, double %a.coerce3, double %a.coerce4) // Return HFAs of 4 or fewer elements in registers. @@ -79,7 +79,7 @@ void __regcall hva1(int a, struct HVA4 b, int c) {} // X64: define dso_local x86_regcallcc void @__regcall4__hva1(i32 noundef %a, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 noundef %c) void __regcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {} -// X86: define dso_local x86_regcallcc void @__regcall4__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr inreg noundef %0) +// X86: define dso_local x86_regcallcc void @__regcall4__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr dead_on_return inreg noundef %0) // X64: define dso_local x86_regcallcc void @__regcall4__hva2(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> noundef %c) void __regcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {} @@ -95,6 +95,6 @@ void __regcall odd_size_hva(struct OddSizeHVA a) {} struct HFA6 { __m128 f[4]; }; struct HFA6 __regcall ret_reg_reused(struct HFA6 a, struct HFA6 b, struct HFA6 c, struct HFA6 d){ struct HFA6 h; return h;} -// X86: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, 
<4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr inreg noundef %c, ptr inreg noundef %d) +// X86: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr dead_on_return inreg noundef %c, ptr dead_on_return inreg noundef %d) // Win64: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c.0, <4 x float> %c.1, <4 x float> %c.2, <4 x float> %c.3, <4 x float> %d.0, <4 x float> %d.1, <4 x float> %d.2, <4 x float> %d.3) // Lin64: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused([4 x <4 x float>] %a.coerce, [4 x <4 x float>] %b.coerce, [4 x <4 x float>] %c.coerce, [4 x <4 x float>] %d.coerce) diff --git a/clang/test/CodeGen/sparcv9-abi.c b/clang/test/CodeGen/sparcv9-abi.c index 616e24e7c519d..5a3d64fd37889 100644 --- a/clang/test/CodeGen/sparcv9-abi.c +++ b/clang/test/CodeGen/sparcv9-abi.c @@ -80,7 +80,7 @@ struct medium { int *c, *d; }; -// CHECK-LABEL: define{{.*}} %struct.medium @f_medium(ptr noundef %x) +// CHECK-LABEL: define{{.*}} %struct.medium @f_medium(ptr dead_on_return noundef %x) struct medium f_medium(struct medium x) { x.a += *x.b; x.b = 0; @@ -94,7 +94,7 @@ struct large { int x; }; -// CHECK-LABEL: define{{.*}} void @f_large(ptr dead_on_unwind noalias writable sret(%struct.large) align 8 %agg.result, ptr noundef %x) +// CHECK-LABEL: define{{.*}} void @f_large(ptr dead_on_unwind noalias writable sret(%struct.large) align 8 %agg.result, ptr dead_on_return noundef %x) struct large f_large(struct large x) { x.a += *x.b; x.b = 0; diff --git a/clang/test/CodeGen/vectorcall.c b/clang/test/CodeGen/vectorcall.c index cab7fc0972d7b..09b3310c7c4c8 
100644 --- a/clang/test/CodeGen/vectorcall.c +++ b/clang/test/CodeGen/vectorcall.c @@ -17,7 +17,7 @@ void __vectorcall v3(int a, struct Small b, int c) {} struct Large { int a[5]; }; void __vectorcall v4(int a, struct Large b, int c) {} // X86: define dso_local x86_vectorcallcc void @"\01v4@@28"(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 inreg noundef %c) -// X64: define dso_local x86_vectorcallcc void @"\01v4@@40"(i32 noundef %a, ptr noundef %b, i32 noundef %c) +// X64: define dso_local x86_vectorcallcc void @"\01v4@@40"(i32 noundef %a, ptr dead_on_return noundef %b, i32 noundef %c) void __vectorcall v5(long long a, int b, int c) {} // X86: define dso_local x86_vectorcallcc void @"\01v5@@16"(i64 noundef %a, i32 inreg noundef %b, i32 inreg noundef %c) @@ -35,21 +35,21 @@ void __vectorcall hfa1(int a, struct HFA4 b, int c) {} // indirectly. Additional vector arguments can consume the rest of the SSE // registers. void __vectorcall hfa2(struct HFA4 a, struct HFA4 b, double c) {} -// X86: define dso_local x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, ptr inreg noundef %b, double inreg noundef %c) -// X64: define dso_local x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, ptr noundef %b, double noundef %c) +// X86: define dso_local x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, ptr dead_on_return inreg noundef %b, double inreg noundef %c) +// X64: define dso_local x86_vectorcallcc void @"\01hfa2@@72"(%struct.HFA4 inreg %a.coerce, ptr dead_on_return noundef %b, double noundef %c) // Ensure that we pass builtin types directly while counting them against the // SSE register usage. 
void __vectorcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {} -// X86: define dso_local x86_vectorcallcc void @"\01hfa3@@56"(double inreg noundef %a, double inreg noundef %b, double inreg noundef %c, double inreg noundef %d, double inreg noundef %e, ptr inreg noundef %f) -// X64: define dso_local x86_vectorcallcc void @"\01hfa3@@56"(double noundef %a, double noundef %b, double noundef %c, double noundef %d, double noundef %e, ptr noundef %f) +// X86: define dso_local x86_vectorcallcc void @"\01hfa3@@56"(double inreg noundef %a, double inreg noundef %b, double inreg noundef %c, double inreg noundef %d, double inreg noundef %e, ptr dead_on_return inreg noundef %f) +// X64: define dso_local x86_vectorcallcc void @"\01hfa3@@56"(double noundef %a, double noundef %b, double noundef %c, double noundef %d, double noundef %e, ptr dead_on_return noundef %f) // Aggregates with more than four elements are not HFAs and are passed byval. // Because they are not classified as homogeneous, they don't get special // handling to ensure alignment. void __vectorcall hfa4(struct HFA5 a) {} // X86: define dso_local x86_vectorcallcc void @"\01hfa4@@40"(ptr noundef byval(%struct.HFA5) align 4 %0) -// X64: define dso_local x86_vectorcallcc void @"\01hfa4@@40"(ptr noundef %a) +// X64: define dso_local x86_vectorcallcc void @"\01hfa4@@40"(ptr dead_on_return noundef %a) // Return HFAs of 4 or fewer elements in registers. 
static struct HFA2 g_hfa2; @@ -68,26 +68,26 @@ v4f32 __vectorcall hva1(int a, struct HVA4 b, int c) {return b.w;} // X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva1@@80"(i32 noundef %a, %struct.HVA4 inreg %b.coerce, i32 noundef %c) v4f32 __vectorcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {return c;} -// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, ptr inreg noundef %b, <4 x float> inreg noundef %c) -// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, ptr noundef %b, <4 x float> noundef %c) +// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return inreg noundef %b, <4 x float> inreg noundef %c) +// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva2@@144"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return noundef %b, <4 x float> noundef %c) v4f32 __vectorcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {return f.x;} -// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> inreg noundef %a, <4 x float> inreg noundef %b, <4 x float> inreg noundef %c, <4 x float> inreg noundef %d, <4 x float> inreg noundef %e, ptr inreg noundef %f) -// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %c, <4 x float> noundef %d, <4 x float> noundef %e, ptr noundef %f) +// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> inreg noundef %a, <4 x float> inreg noundef %b, <4 x float> inreg noundef %c, <4 x float> inreg noundef %d, <4 x float> inreg noundef %e, ptr dead_on_return inreg noundef %f) +// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva3@@112"(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %c, <4 x float> noundef %d, <4 x float> noundef %e, ptr dead_on_return noundef %f) // vector types have higher 
priority then HVA structures, So vector types are allocated first // and HVAs are allocated if enough registers are available v4f32 __vectorcall hva4(struct HVA4 a, struct HVA2 b, v4f32 c) {return b.y;} -// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, ptr inreg noundef %b, <4 x float> inreg noundef %c) -// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, ptr noundef %b, <4 x float> noundef %c) +// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return inreg noundef %b, <4 x float> inreg noundef %c) +// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva4@@112"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return noundef %b, <4 x float> noundef %c) v4f32 __vectorcall hva5(struct HVA3 a, struct HVA3 b, v4f32 c, struct HVA2 d) {return d.y;} -// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, ptr inreg noundef %b, <4 x float> inreg noundef %c, %struct.HVA2 inreg %d.coerce) -// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, ptr noundef %b, <4 x float> noundef %c, %struct.HVA2 inreg %d.coerce) +// X86: define dso_local x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, ptr dead_on_return inreg noundef %b, <4 x float> inreg noundef %c, %struct.HVA2 inreg %d.coerce) +// X64: define dso_local x86_vectorcallcc <4 x float> @"\01hva5@@144"(%struct.HVA3 inreg %a.coerce, ptr dead_on_return noundef %b, <4 x float> noundef %c, %struct.HVA2 inreg %d.coerce) struct HVA4 __vectorcall hva6(struct HVA4 a, struct HVA4 b) { return b;} -// X86: define dso_local x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, ptr inreg noundef %b) -// X64: define dso_local x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, ptr noundef %b) +// X86: define dso_local 
x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return inreg noundef %b) +// X64: define dso_local x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, ptr dead_on_return noundef %b) struct HVA5 __vectorcall hva7(void) {struct HVA5 a = {}; return a;} // X86: define dso_local x86_vectorcallcc void @"\01hva7@@0"(ptr dead_on_unwind noalias writable sret(%struct.HVA5) align 16 %agg.result) @@ -108,8 +108,8 @@ void __vectorcall odd_size_hva(struct OddSizeHVA a) {} // consider 'p7' as a register. Instead p5 gets put into the register on the second pass. // x86 should pass p2, p6 and p7 in registers, then p1 in the second pass. struct HFA2 __vectorcall AddParticles(struct HFA2 p1, float p2, struct HFA4 p3, int p4, struct HFA2 p5, float p6, float p7, int p8){ return p1;} -// X86: define dso_local x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@84"(%struct.HFA2 inreg %p1.coerce, float inreg noundef %p2, ptr inreg noundef %p3, i32 inreg noundef %p4, ptr noundef %p5, float inreg noundef %p6, float inreg noundef %p7, i32 noundef %p8) -// X64: define dso_local x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@104"(%struct.HFA2 inreg %p1.coerce, float noundef %p2, ptr noundef %p3, i32 noundef %p4, %struct.HFA2 inreg %p5.coerce, float noundef %p6, float noundef %p7, i32 noundef %p8) +// X86: define dso_local x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@84"(%struct.HFA2 inreg %p1.coerce, float inreg noundef %p2, ptr dead_on_return inreg noundef %p3, i32 inreg noundef %p4, ptr dead_on_return noundef %p5, float inreg noundef %p6, float inreg noundef %p7, i32 noundef %p8) +// X64: define dso_local x86_vectorcallcc %struct.HFA2 @"\01AddParticles@@104"(%struct.HFA2 inreg %p1.coerce, float noundef %p2, ptr dead_on_return noundef %p3, i32 noundef %p4, %struct.HFA2 inreg %p5.coerce, float noundef %p6, float noundef %p7, i32 noundef %p8) // Vectorcall in both architectures allows passing of an HVA as long as there is room, 
// even if it is not one of the first 6 arguments. First pass puts p4 into a @@ -117,8 +117,8 @@ struct HFA2 __vectorcall AddParticles(struct HFA2 p1, float p2, struct HFA4 p3, // in a register, does NOT put p7 in a register (since there's no room), then puts // p8 in a register. void __vectorcall HVAAnywhere(struct HFA2 p1, int p2, int p3, float p4, int p5, int p6, struct HFA4 p7, struct HFA2 p8, float p9){} -// X86: define dso_local x86_vectorcallcc void @"\01HVAAnywhere@@88"(%struct.HFA2 inreg %p1.coerce, i32 inreg noundef %p2, i32 inreg noundef %p3, float inreg noundef %p4, i32 noundef %p5, i32 noundef %p6, ptr noundef %p7, %struct.HFA2 inreg %p8.coerce, float inreg noundef %p9) -// X64: define dso_local x86_vectorcallcc void @"\01HVAAnywhere@@112"(%struct.HFA2 inreg %p1.coerce, i32 noundef %p2, i32 noundef %p3, float noundef %p4, i32 noundef %p5, i32 noundef %p6, ptr noundef %p7, %struct.HFA2 inreg %p8.coerce, float noundef %p9) +// X86: define dso_local x86_vectorcallcc void @"\01HVAAnywhere@@88"(%struct.HFA2 inreg %p1.coerce, i32 inreg noundef %p2, i32 inreg noundef %p3, float inreg noundef %p4, i32 noundef %p5, i32 noundef %p6, ptr dead_on_return noundef %p7, %struct.HFA2 inreg %p8.coerce, float inreg noundef %p9) +// X64: define dso_local x86_vectorcallcc void @"\01HVAAnywhere@@112"(%struct.HFA2 inreg %p1.coerce, i32 noundef %p2, i32 noundef %p3, float noundef %p4, i32 noundef %p5, i32 noundef %p6, ptr dead_on_return noundef %p7, %struct.HFA2 inreg %p8.coerce, float noundef %p9) #ifndef __x86_64__ // This covers the three ways XMM values can be passed on 32-bit x86: @@ -137,9 +137,9 @@ void __vectorcall vectorcall_indirect_vec( // X86-SAME: double inreg noundef %xmm3, // X86-SAME: double inreg noundef %xmm4, // X86-SAME: <4 x float> inreg noundef %xmm5, -// X86-SAME: ptr inreg noundef %0, +// X86-SAME: ptr dead_on_return inreg noundef %0, // X86-SAME: i32 inreg noundef %edx, -// X86-SAME: ptr noundef %1) +// X86-SAME: ptr dead_on_return noundef %1) void 
__vectorcall vectorcall_indirect_fp( double xmm0, double xmm1, double xmm2, double xmm3, double xmm4, @@ -153,7 +153,7 @@ void __vectorcall vectorcall_indirect_fp( // X86-SAME: double inreg noundef %xmm3, // X86-SAME: double inreg noundef %xmm4, // X86-SAME: <4 x float> inreg noundef %xmm5, -// X86-SAME: ptr inreg noundef %0, +// X86-SAME: ptr dead_on_return inreg noundef %0, // X86-SAME: i32 inreg noundef %edx, // X86-SAME: double noundef %mem) #endif diff --git a/clang/test/CodeGen/win-fp128.c b/clang/test/CodeGen/win-fp128.c index 328a7aaa7df57..8d223741bc93e 100644 --- a/clang/test/CodeGen/win-fp128.c +++ b/clang/test/CodeGen/win-fp128.c @@ -6,7 +6,7 @@ __float128 fp128_ret(void) { return 0; } // CHECK-GNU64: define dso_local <2 x i64> @fp128_ret() __float128 fp128_args(__float128 a, __float128 b) { return a * b; } -// CHECK-GNU64: define dso_local <2 x i64> @fp128_args(ptr noundef %0, ptr noundef %1) +// CHECK-GNU64: define dso_local <2 x i64> @fp128_args(ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) void fp128_vararg(int a, ...) { // CHECK-GNU64-LABEL: define dso_local void @fp128_vararg diff --git a/clang/test/CodeGen/win64-i128.c b/clang/test/CodeGen/win64-i128.c index e10b2be0530eb..2d83889d8f89b 100644 --- a/clang/test/CodeGen/win64-i128.c +++ b/clang/test/CodeGen/win64-i128.c @@ -12,8 +12,8 @@ int128_t foo(void) { return 0; } int128_t bar(int128_t a, int128_t b) { return a * b; } -// GNU64: define dso_local <2 x i64> @bar(ptr noundef %0, ptr noundef %1) -// MSC64: define dso_local <2 x i64> @bar(ptr noundef %0, ptr noundef %1) +// GNU64: define dso_local <2 x i64> @bar(ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) +// MSC64: define dso_local <2 x i64> @bar(ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) void vararg(int a, ...) 
{ // GNU64-LABEL: define{{.*}} void @vararg diff --git a/clang/test/CodeGen/windows-swiftcall.c b/clang/test/CodeGen/windows-swiftcall.c index 41569c2606622..8716f25b9ddfb 100644 --- a/clang/test/CodeGen/windows-swiftcall.c +++ b/clang/test/CodeGen/windows-swiftcall.c @@ -219,7 +219,7 @@ TEST(struct_big_1) // CHECK-LABEL: define {{.*}} void @return_struct_big_1({{.*}} dead_on_unwind noalias writable sret // Should not be byval. -// CHECK-LABEL: define {{.*}} void @take_struct_big_1(ptr noundef{{( %.*)?}}) +// CHECK-LABEL: define {{.*}} void @take_struct_big_1(ptr dead_on_return noundef{{( %.*)?}}) /*****************************************************************************/ /********************************* TYPE MERGING ******************************/ diff --git a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp index 152be26948f28..1709c88563267 100644 --- a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp +++ b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp @@ -576,7 +576,7 @@ void f(__clang_svmfloat8x4_t, __clang_svmfloat8x4_t); // CHECK-NEXT: [[COERCE74_EXTRACT2:%.*]] = extractvalue { , , , } [[COERCE74_TUPLE]], 2 // CHECK-NEXT: [[COERCE74_EXTRACT3:%.*]] = extractvalue { , , , } [[COERCE74_TUPLE]], 3 // CHECK-NEXT: store { , , , } zeroinitializer, ptr [[BYVAL_TEMP]], align 2 -// CHECK-NEXT: call void @_Z1f10svboolx4_tS_( [[COERCE74_EXTRACT0]], [[COERCE74_EXTRACT1]], [[COERCE74_EXTRACT2]], [[COERCE74_EXTRACT3]], ptr noundef [[BYVAL_TEMP]]) +// CHECK-NEXT: call void @_Z1f10svboolx4_tS_( [[COERCE74_EXTRACT0]], [[COERCE74_EXTRACT1]], [[COERCE74_EXTRACT2]], [[COERCE74_EXTRACT3]], ptr dead_on_return noundef [[BYVAL_TEMP]]) // CHECK-NEXT: store { , } zeroinitializer, ptr [[COERCE75]], align 16 // CHECK-NEXT: [[COERCE75_TUPLE:%.*]] = load { , }, ptr [[COERCE75]], align 16 // CHECK-NEXT: [[COERCE75_EXTRACT0:%.*]] = extractvalue { , } [[COERCE75_TUPLE]], 0 @@ -1125,7 +1125,7 @@ void 
f(__clang_svmfloat8x4_t, __clang_svmfloat8x4_t); // COMPAT_17-NEXT: [[COERCE74_EXTRACT2:%.*]] = extractvalue { , , , } [[COERCE74_TUPLE]], 2 // COMPAT_17-NEXT: [[COERCE74_EXTRACT3:%.*]] = extractvalue { , , , } [[COERCE74_TUPLE]], 3 // COMPAT_17-NEXT: store { , , , } zeroinitializer, ptr [[BYVAL_TEMP]], align 2 -// COMPAT_17-NEXT: call void @_Z1f10svboolx4_t10svboolx4_t( [[COERCE74_EXTRACT0]], [[COERCE74_EXTRACT1]], [[COERCE74_EXTRACT2]], [[COERCE74_EXTRACT3]], ptr noundef [[BYVAL_TEMP]]) +// COMPAT_17-NEXT: call void @_Z1f10svboolx4_t10svboolx4_t( [[COERCE74_EXTRACT0]], [[COERCE74_EXTRACT1]], [[COERCE74_EXTRACT2]], [[COERCE74_EXTRACT3]], ptr dead_on_return noundef [[BYVAL_TEMP]]) // COMPAT_17-NEXT: store { , } zeroinitializer, ptr [[COERCE75]], align 16 // COMPAT_17-NEXT: [[COERCE75_TUPLE:%.*]] = load { , }, ptr [[COERCE75]], align 16 // COMPAT_17-NEXT: [[COERCE75_EXTRACT0:%.*]] = extractvalue { , } [[COERCE75_TUPLE]], 0 diff --git a/clang/test/CodeGenCXX/arm-cc.cpp b/clang/test/CodeGenCXX/arm-cc.cpp index 68e1b7e4e1e46..939615fcc69e0 100644 --- a/clang/test/CodeGenCXX/arm-cc.cpp +++ b/clang/test/CodeGenCXX/arm-cc.cpp @@ -17,4 +17,4 @@ void baz() { } // CHECK: declare void @_Z3fooPv(ptr dead_on_unwind writable sret(%class.SMLoc) align 4, ptr noundef) -// CHECK: declare void @_Z3zed5SMLoc(ptr noundef) +// CHECK: declare void @_Z3zed5SMLoc(ptr dead_on_return noundef) diff --git a/clang/test/CodeGenCXX/attr-target-mv-inalloca.cpp b/clang/test/CodeGenCXX/attr-target-mv-inalloca.cpp index c341bd2b855ff..f2d602b3b523e 100644 --- a/clang/test/CodeGenCXX/attr-target-mv-inalloca.cpp +++ b/clang/test/CodeGenCXX/attr-target-mv-inalloca.cpp @@ -55,18 +55,18 @@ void usage() { // WINDOWS-NEXT: ret i32 %[[RET]] -// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z"(ptr noundef %[[O:[0-9a-zA-Z]+]]) +// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z"(ptr dead_on_return noundef %[[O:[0-9a-zA-Z]+]]) // WINDOWS64: %[[X:[0-9a-zA-Z]+]] = getelementptr inbounds 
nuw %struct.Foo, ptr %[[O]], i32 0, i32 0 // WINDOWS64: %[[LOAD:[0-9a-zA-Z]+]] = load i32, ptr %[[X]] // WINDOWS64: ret i32 %[[LOAD]] -// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z.sse4.2"(ptr noundef %[[O:[0-9a-zA-Z]+]]) +// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z.sse4.2"(ptr dead_on_return noundef %[[O:[0-9a-zA-Z]+]]) // WINDOWS64: %[[X:[0-9a-zA-Z]+]] = getelementptr inbounds nuw %struct.Foo, ptr %[[O]], i32 0, i32 0 // WINDOWS64: %[[LOAD:[0-9a-zA-Z]+]] = load i32, ptr %[[X]] // WINDOWS64: %[[ADD:[0-9a-zA-Z]+]] = add nsw i32 %[[LOAD]], 1 // WINDOWS64: ret i32 %[[ADD]] -// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z.arch_ivybridge"(ptr noundef %[[O:[0-9a-zA-Z]+]]) +// WINDOWS64: define dso_local noundef i32 @"?bar@@YAHUFoo@@@Z.arch_ivybridge"(ptr dead_on_return noundef %[[O:[0-9a-zA-Z]+]]) // WINDOWS64: %[[X:[0-9a-zA-Z]+]] = getelementptr inbounds nuw %struct.Foo, ptr %[[O]], i32 0, i32 0 // WINDOWS64: %[[LOAD:[0-9a-zA-Z]+]] = load i32, ptr %[[X]] // WINDOWS64: %[[ADD:[0-9a-zA-Z]+]] = add nsw i32 %[[LOAD]], 2 @@ -75,7 +75,7 @@ void usage() { // WINDOWS64: define dso_local void @"?usage@@YAXXZ"() // WINDOWS64: %[[F:[0-9a-zA-Z]+]] = alloca %struct.Foo // WINDOWS64: %[[ARG:[0-9a-zA-Z.]+]] = alloca %struct.Foo -// WINDOWS64: %[[CALL:[0-9a-zA-Z]+]] = call noundef i32 @"?bar@@YAHUFoo@@@Z.resolver"(ptr noundef %[[ARG]]) +// WINDOWS64: %[[CALL:[0-9a-zA-Z]+]] = call noundef i32 @"?bar@@YAHUFoo@@@Z.resolver"(ptr dead_on_return noundef %[[ARG]]) // WINDOWS64: define weak_odr dso_local i32 @"?bar@@YAHUFoo@@@Z.resolver"(ptr %0) // WINDOWS64: %[[RET:[0-9a-zA-Z]+]] = musttail call i32 @"?bar@@YAHUFoo@@@Z.arch_ivybridge"(ptr %0) diff --git a/clang/test/CodeGenCXX/copy-initialization.cpp b/clang/test/CodeGenCXX/copy-initialization.cpp index aa0c6395f158d..4e6194cc040c2 100644 --- a/clang/test/CodeGenCXX/copy-initialization.cpp +++ b/clang/test/CodeGenCXX/copy-initialization.cpp @@ -12,7 +12,7 @@ struct Bar { void f(Foo); -// 
CHECK-LABEL: define{{.*}} void @_Z1g3Foo(ptr noundef %foo) +// CHECK-LABEL: define{{.*}} void @_Z1g3Foo(ptr dead_on_return noundef %foo) void g(Foo foo) { // CHECK: call void @_ZN3BarC1Ev // CHECK: @_ZNK3BarcvRK3FooEv diff --git a/clang/test/CodeGenCXX/debug-info.cpp b/clang/test/CodeGenCXX/debug-info.cpp index 8594a897ef7c0..9cf26ba83ba3e 100644 --- a/clang/test/CodeGenCXX/debug-info.cpp +++ b/clang/test/CodeGenCXX/debug-info.cpp @@ -4,7 +4,7 @@ // CHECK: @_ZN6pr96081xE ={{.*}} global ptr null, align 8, !dbg [[X:![0-9]+]] // CHECK: define{{.*}} void @_ZN7pr147634funcENS_3fooE -// CHECK-SAME: ptr noundef [[param:%.*]]) +// CHECK-SAME: ptr dead_on_return noundef [[param:%.*]]) // CHECK-NEXT: entry: // CHECK-NEXT: alloca ptr, align 8 // CHECK-NEXT: [[param_addr_storage:%.*]] = alloca ptr, align 8 diff --git a/clang/test/CodeGenCXX/empty-nontrivially-copyable.cpp b/clang/test/CodeGenCXX/empty-nontrivially-copyable.cpp index c7d3a017414ef..c8f5a0f7c2ea6 100644 --- a/clang/test/CodeGenCXX/empty-nontrivially-copyable.cpp +++ b/clang/test/CodeGenCXX/empty-nontrivially-copyable.cpp @@ -13,7 +13,7 @@ struct Empty { }; bool foo(Empty e) { -// CHECK: @_Z3foo5Empty(ptr noundef %e) +// CHECK: @_Z3foo5Empty(ptr dead_on_return noundef %e) // CHECK: call {{.*}} @_ZN5Empty5checkEv(ptr {{[^,]*}} %e) return e.check(); } @@ -21,6 +21,6 @@ bool foo(Empty e) { void caller(Empty &e) { // CHECK: @_Z6callerR5Empty(ptr noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %e) // CHECK: call {{.*}} @_ZN5EmptyC1ERKS_(ptr {{[^,]*}} [[NEWTMP:%.*]], ptr -// CHECK: call {{.*}} @_Z3foo5Empty(ptr noundef [[NEWTMP]]) +// CHECK: call {{.*}} @_Z3foo5Empty(ptr dead_on_return noundef [[NEWTMP]]) foo(e); } diff --git a/clang/test/CodeGenCXX/fastcall.cpp b/clang/test/CodeGenCXX/fastcall.cpp index 4c94c1623ee16..405917f7a14bb 100644 --- a/clang/test/CodeGenCXX/fastcall.cpp +++ b/clang/test/CodeGenCXX/fastcall.cpp @@ -15,6 +15,6 @@ struct S1 { void __attribute__((fastcall)) foo2(S1 a, int b); void 
bar2(S1 a, int b) { // CHECK-LABEL: define{{.*}} void @_Z4bar22S1i - // CHECK: call x86_fastcallcc void @_Z4foo22S1i(ptr inreg %{{.*}}, i32 inreg % + // CHECK: call x86_fastcallcc void @_Z4foo22S1i(ptr dead_on_return inreg %{{.*}}, i32 inreg % foo2(a, b); } diff --git a/clang/test/CodeGenCXX/homogeneous-aggregates.cpp b/clang/test/CodeGenCXX/homogeneous-aggregates.cpp index 63ffc6b5bfac8..5ebeb8aad4c18 100644 --- a/clang/test/CodeGenCXX/homogeneous-aggregates.cpp +++ b/clang/test/CodeGenCXX/homogeneous-aggregates.cpp @@ -41,8 +41,8 @@ struct D5 : I1, I2, I3 {}; // homogeneous aggregate // PPC: define{{.*}} void @_Z7func_D12D1(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, [3 x i64] %x.coerce) // ARM32: define{{.*}} arm_aapcs_vfpcc void @_Z7func_D12D1(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, [3 x i64] %x.coerce) -// ARM64: define{{.*}} void @_Z7func_D12D1(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, ptr noundef %x) -// X64: define dso_local x86_vectorcallcc void @"\01_Z7func_D12D1@@24"(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, ptr noundef %x) +// ARM64: define{{.*}} void @_Z7func_D12D1(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, ptr dead_on_return noundef %x) +// X64: define dso_local x86_vectorcallcc void @"\01_Z7func_D12D1@@24"(ptr dead_on_unwind noalias writable sret(%struct.D1) align 8 %agg.result, ptr dead_on_return noundef %x) D1 CC func_D1(D1 x) { return x; } // PPC: define{{.*}} [3 x double] @_Z7func_D22D2([3 x double] %x.coerce) @@ -53,7 +53,7 @@ D2 CC func_D2(D2 x) { return x; } // PPC: define{{.*}} void @_Z7func_D32D3(ptr dead_on_unwind noalias writable sret(%struct.D3) align 8 %agg.result, [4 x i64] %x.coerce) // ARM32: define{{.*}} arm_aapcs_vfpcc void @_Z7func_D32D3(ptr dead_on_unwind noalias writable sret(%struct.D3) align 8 %agg.result, [4 x i64] %x.coerce) -// ARM64: define{{.*}} void 
@_Z7func_D32D3(ptr dead_on_unwind noalias writable sret(%struct.D3) align 8 %agg.result, ptr noundef %x) +// ARM64: define{{.*}} void @_Z7func_D32D3(ptr dead_on_unwind noalias writable sret(%struct.D3) align 8 %agg.result, ptr dead_on_return noundef %x) D3 CC func_D3(D3 x) { return x; } // PPC: define{{.*}} [4 x double] @_Z7func_D42D4([4 x double] %x.coerce) @@ -201,7 +201,7 @@ struct NonHFA { virtual void f1(); }; double foo(NonHFA v) { return v.x + v.y; } -// WOA64: define dso_local noundef double @"?foo@polymorphic@@YANUNonHFA@1@@Z"(ptr noundef %{{.*}}) +// WOA64: define dso_local noundef double @"?foo@polymorphic@@YANUNonHFA@1@@Z"(ptr dead_on_return noundef %{{.*}}) } namespace trivial_copy_assignment { struct HFA { @@ -221,7 +221,7 @@ struct NonHFA { NonHFA &operator=(const NonHFA&); }; double foo(NonHFA v) { return v.x + v.y; } -// WOA64: define dso_local noundef double @"?foo@non_trivial_copy_assignment@@YANUNonHFA@1@@Z"(ptr noundef %{{.*}}) +// WOA64: define dso_local noundef double @"?foo@non_trivial_copy_assignment@@YANUNonHFA@1@@Z"(ptr dead_on_return noundef %{{.*}}) } namespace user_provided_ctor { struct HFA { @@ -251,7 +251,7 @@ struct NonHFA { ~NonHFA(); }; double foo(NonHFA v) { return v.x + v.y; } -// WOA64: define dso_local noundef double @"?foo@non_trivial_dtor@@YANUNonHFA@1@@Z"(ptr noundef %{{.*}}) +// WOA64: define dso_local noundef double @"?foo@non_trivial_dtor@@YANUNonHFA@1@@Z"(ptr dead_on_return noundef %{{.*}}) } namespace non_empty_base { struct non_empty_base { double d; }; @@ -272,7 +272,7 @@ struct NonHFA { empty e; }; double foo(NonHFA v) { return v.x + v.y; } -// WOA64: define dso_local noundef double @"?foo@empty_field@@YANUNonHFA@1@@Z"(ptr noundef %{{.*}}) +// WOA64: define dso_local noundef double @"?foo@empty_field@@YANUNonHFA@1@@Z"(ptr dead_on_return noundef %{{.*}}) } namespace non_empty_field { struct non_empty { double d; }; diff --git a/clang/test/CodeGenCXX/inalloca-lambda.cpp b/clang/test/CodeGenCXX/inalloca-lambda.cpp 
index dc78aa2773f89..0d527e1a02e5f 100644 --- a/clang/test/CodeGenCXX/inalloca-lambda.cpp +++ b/clang/test/CodeGenCXX/inalloca-lambda.cpp @@ -22,7 +22,7 @@ void test() { // CHECK: %[[V:.*]] = getelementptr inbounds nuw <{ %struct.A }>, ptr %[[ARG]], i32 0, i32 0 // CHECK: %call = call x86_thiscallcc noundef i32 // CHECK-SAME: @"?__impl@@?0??test@@YAXXZ@QBE?A?@@UA@@@Z" -// CHECK-SAME: (ptr noundef %this, ptr noundef %[[V]]) +// CHECK-SAME: (ptr noundef %this, ptr dead_on_return noundef %[[V]]) // CHECK: define internal noundef i32 // CHECK-SAME: @"?__invoke@@?0??test@@YAXXZ@CA?A?@@UA@@@Z" @@ -31,12 +31,12 @@ void test() { // CHECK: %[[VAR:.*]] = getelementptr inbounds nuw <{ %struct.A }>, ptr %[[ARG]], i32 0, i32 0 // CHECK: %call = call x86_thiscallcc noundef i32 // CHECK-SAME: @"?__impl@@?0??test@@YAXXZ@QBE?A?@@UA@@@Z" -// CHECK-SAME: (ptr noundef %unused.capture, ptr noundef %[[VAR]]) +// CHECK-SAME: (ptr noundef %unused.capture, ptr dead_on_return noundef %[[VAR]]) // CHECK: ret i32 %call // CHECK: define internal x86_thiscallcc noundef i32 // CHECK-SAME: @"?__impl@@?0??test@@YAXXZ@QBE?A?@@UA@@@Z" -// CHECK-SAME: (ptr noundef %this, ptr noundef %[[ARG:.*]]) +// CHECK-SAME: (ptr noundef %this, ptr dead_on_return noundef %[[ARG:.*]]) // CHECK: %this.addr = alloca ptr, align 4 // CHECK: store ptr %this, ptr %this.addr, align 4 // CHECK: %this1 = load ptr, ptr %this.addr, align 4 diff --git a/clang/test/CodeGenCXX/inalloca-overaligned.cpp b/clang/test/CodeGenCXX/inalloca-overaligned.cpp index 3751751ec0bca..305b8c5dccd10 100644 --- a/clang/test/CodeGenCXX/inalloca-overaligned.cpp +++ b/clang/test/CodeGenCXX/inalloca-overaligned.cpp @@ -57,7 +57,7 @@ int receive_both(Both o) { } // CHECK-LABEL: define dso_local noundef i32 @"?receive_both@@Y{{.*}}" -// CHECK-SAME: (ptr noundef %o) +// CHECK-SAME: (ptr dead_on_return noundef %o) int pass_both() { gvi32 = receive_both(Both()); @@ -67,7 +67,7 @@ int pass_both() { // CHECK-LABEL: define dso_local noundef i32 
@"?pass_both@@Y{{.*}}" // CHECK: [[TMP:%[^ ]*]] = alloca %struct.Both, align 8 // CHECK: call x86_thiscallcc noundef ptr @"??0Both@@QAE@XZ"(ptr {{[^,]*}} [[TMP]]) -// CHECK: call noundef i32 @"?receive_both@@Y{{.*}}"(ptr noundef [[TMP]]) +// CHECK: call noundef i32 @"?receive_both@@Y{{.*}}"(ptr dead_on_return noundef [[TMP]]) int receive_inalloca_both(NonTrivial nt, Both o) { return nt.x + o.x + o.y; @@ -101,11 +101,11 @@ struct [[trivial_abi]] alignas(8) MyPtr { int receiveMyPtr(MyPtr o) { return *o.ptr; } // CHECK-LABEL: define dso_local noundef i32 @"?receiveMyPtr@@Y{{.*}}" -// CHECK-SAME: (ptr noundef %o) +// CHECK-SAME: (ptr dead_on_return noundef %o) int passMyPtr() { return receiveMyPtr(MyPtr()); } // CHECK-LABEL: define dso_local noundef i32 @"?passMyPtr@@Y{{.*}}" // CHECK: [[TMP:%[^ ]*]] = alloca %struct.MyPtr, align 8 // CHECK: call x86_thiscallcc noundef ptr @"??0MyPtr@@QAE@XZ"(ptr {{[^,]*}} [[TMP]]) -// CHECK: call noundef i32 @"?receiveMyPtr@@Y{{.*}}"(ptr noundef [[TMP]]) +// CHECK: call noundef i32 @"?receiveMyPtr@@Y{{.*}}"(ptr dead_on_return noundef [[TMP]]) diff --git a/clang/test/CodeGenCXX/inalloca-vector.cpp b/clang/test/CodeGenCXX/inalloca-vector.cpp index d1bacb4f0dc8c..2db4c49df116a 100644 --- a/clang/test/CodeGenCXX/inalloca-vector.cpp +++ b/clang/test/CodeGenCXX/inalloca-vector.cpp @@ -56,7 +56,7 @@ void __fastcall fastcall_receive_vec(__m128 x, __m128 y, __m128 z, __m128 w, int // CHECK-SAME: (<4 x float> inreg noundef %x, // CHECK-SAME: <4 x float> inreg noundef %y, // CHECK-SAME: <4 x float> inreg noundef %z, -// CHECK-SAME: ptr inreg noundef %0, +// CHECK-SAME: ptr dead_on_return inreg noundef %0, // CHECK-SAME: i32 inreg noundef %edx, // CHECK-SAME: ptr inalloca(<{ ptr, %struct.NonTrivial }>) %1) @@ -73,6 +73,6 @@ void __vectorcall vectorcall_receive_vec(double xmm0, double xmm1, double xmm2, // CHECK-SAME: <4 x float> inreg noundef %x, // CHECK-SAME: <4 x float> inreg noundef %y, // CHECK-SAME: <4 x float> inreg noundef %z, -// 
CHECK-SAME: ptr inreg noundef %0, +// CHECK-SAME: ptr dead_on_return inreg noundef %0, // CHECK-SAME: i32 inreg noundef %edx, // CHECK-SAME: ptr inalloca(<{ ptr, %struct.NonTrivial }>) %1) diff --git a/clang/test/CodeGenCXX/inheriting-constructor.cpp b/clang/test/CodeGenCXX/inheriting-constructor.cpp index 100ca269d7f3c..21751bea055dc 100644 --- a/clang/test/CodeGenCXX/inheriting-constructor.cpp +++ b/clang/test/CodeGenCXX/inheriting-constructor.cpp @@ -166,7 +166,7 @@ namespace inalloca_nonvirt { // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG3]], i32 3) // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG1]], i32 1) // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( - // WIN64: call {{.*}} @"??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr %[[ARG1]], i32 2, ptr %[[ARG3]], ptr{{.*}} %[[TMP]]) + // WIN64: call {{.*}} @"??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr dead_on_return %[[ARG1]], i32 2, ptr dead_on_return %[[ARG3]], ptr{{.*}} %[[TMP]]) // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( // WIN64: call void @"??1Q@@QEAA@XZ"(ptr {{[^,]*}} %[[TMP]]) @@ -202,7 +202,7 @@ namespace inalloca_nonvirt { // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG3]], i32 3) // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG1]], i32 1) // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( - // WIN64: call {{.*}} @"??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr %[[ARG1]], i32 2, ptr %[[ARG3]], ptr{{.*}} %[[TMP]]) + // WIN64: call {{.*}} @"??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr dead_on_return %[[ARG1]], i32 2, ptr dead_on_return %[[ARG3]], ptr{{.*}} %[[TMP]]) // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( // WIN64: call void @"??1Q@@QEAA@XZ"(ptr {{[^,]*}} %[[TMP]]) } @@ -253,7 +253,7 @@ namespace inalloca_virt { // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG3]], i32 3) // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG1]], i32 1) // WIN64: br i1 - // WIN64: call {{.*}} 
@"??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr %[[ARG1]], i32 2, ptr %[[ARG3]], ptr{{.*}} %[[TMP]]) + // WIN64: call {{.*}} @"??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr dead_on_return %[[ARG1]], i32 2, ptr dead_on_return %[[ARG3]], ptr{{.*}} %[[TMP]]) // WIN64: br // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( @@ -302,7 +302,7 @@ namespace inalloca_virt { // WIN64: call {{.*}} @"??0Q@@QEAA@H@Z"(ptr {{[^,]*}} %[[ARG1]], i32 1) // WIN64: br i1 // WIN64: store {{.*}} @"??_8C@inalloca_virt@@7B@" - // WIN64: call {{.*}} @"??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr %[[ARG1]], i32 2, ptr %[[ARG3]], ptr{{.*}} %[[TMP]]) + // WIN64: call {{.*}} @"??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(ptr{{.*}}, ptr dead_on_return %[[ARG1]], i32 2, ptr dead_on_return %[[ARG3]], ptr{{.*}} %[[TMP]]) // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( // WIN64: call {{.*}} @"??0Z@@QEAA@XZ"( // WIN64: call void @"??1Q@@QEAA@XZ"(ptr {{[^,]*}} %[[TMP]]) diff --git a/clang/test/CodeGenCXX/member-function-pointer-calls.cpp b/clang/test/CodeGenCXX/member-function-pointer-calls.cpp index ff511c0243801..f06cda8b7684e 100644 --- a/clang/test/CodeGenCXX/member-function-pointer-calls.cpp +++ b/clang/test/CodeGenCXX/member-function-pointer-calls.cpp @@ -16,7 +16,7 @@ int f(A* a, int (A::*fp)()) { // CHECK-NOT: } // CHECK: ret i32 1 // MINGW64-LABEL: define dso_local noundef i32 @_Z2g1v() -// MINGW64: call noundef i32 @_Z1fP1AMS_FivE(ptr noundef %{{.*}}, ptr noundef %{{.*}}) +// MINGW64: call noundef i32 @_Z1fP1AMS_FivE(ptr noundef %{{.*}}, ptr dead_on_return noundef %{{.*}}) int g1() { A a; return f(&a, &A::vf1); @@ -26,7 +26,7 @@ int g1() { // CHECK-NOT: } // CHECK: ret i32 2 // MINGW64-LABEL: define dso_local noundef i32 @_Z2g2v() -// MINGW64: call noundef i32 @_Z1fP1AMS_FivE(ptr noundef %{{.*}}, ptr noundef %{{.*}}) +// MINGW64: call noundef i32 @_Z1fP1AMS_FivE(ptr noundef %{{.*}}, ptr dead_on_return noundef %{{.*}}) int g2() { A a; 
return f(&a, &A::vf2); diff --git a/clang/test/CodeGenCXX/microsoft-abi-arg-order.cpp b/clang/test/CodeGenCXX/microsoft-abi-arg-order.cpp index b551df747c073..63a4d5525336b 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-arg-order.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-arg-order.cpp @@ -24,7 +24,7 @@ void foo(A a, A b, A c) { // X86: ret void // X64-LABEL: define dso_local void @"?foo@@YAXUA@@00@Z" -// X64: (ptr noundef %[[a:[^,]*]], ptr noundef %[[b:[^,]*]], ptr noundef %[[c:[^)]*]]) +// X64: (ptr dead_on_return noundef %[[a:[^,]*]], ptr dead_on_return noundef %[[b:[^,]*]], ptr dead_on_return noundef %[[c:[^)]*]]) // X64: call void @"??1A@@QEAA@XZ"(ptr {{[^,]*}} %[[a]]) // X64: call void @"??1A@@QEAA@XZ"(ptr {{[^,]*}} %[[b]]) // X64: call void @"??1A@@QEAA@XZ"(ptr {{[^,]*}} %[[c]]) @@ -64,7 +64,7 @@ void call_foo() { // X64: invoke noundef ptr @"??0A@@QEAA@H@Z"(ptr {{[^,]*}} %[[arg2:[^,]*]], i32 noundef 2) // X64: invoke noundef ptr @"??0A@@QEAA@H@Z"(ptr {{[^,]*}} %[[arg1:[^,]*]], i32 noundef 1) // X64: call void @"?foo@@YAXUA@@00@Z" -// X64: (ptr noundef %[[arg1]], ptr noundef %[[arg2]], ptr noundef %[[arg3]]) +// X64: (ptr dead_on_return noundef %[[arg1]], ptr dead_on_return noundef %[[arg2]], ptr dead_on_return noundef %[[arg3]]) // X64: ret void // // lpad2: diff --git a/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp b/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp index 5654db3ba8151..813abb03a7810 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp @@ -22,10 +22,10 @@ C::C() {} // force emission // CHECK32-NEXT: ret void // CHECK64-LABEL: define linkonce_odr dso_local void @"?foo@C@byval_thunk@@W7EAAXUAgg@2@@Z" -// CHECK64: (ptr noundef %this, ptr noundef %x) +// CHECK64: (ptr noundef %this, ptr dead_on_return noundef %x) // CHECK64: getelementptr i8, ptr %{{.*}}, i32 -8 // CHECK64: call void @"?foo@C@byval_thunk@@UEAAXUAgg@2@@Z" -// CHECK64: (ptr {{[^,]*}} %{{.*}}, 
ptr noundef %x) +// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr dead_on_return noundef %x) // CHECK64-NOT: call // CHECK64: ret void } @@ -54,10 +54,10 @@ C::C() {} // force emission // CHECK32-NEXT: ret void // CHECK64-LABEL: define linkonce_odr dso_local void @"?foo@C@stdcall_thunk@@W7EAAXUAgg@2@@Z" -// CHECK64: (ptr noundef %this, ptr noundef %x) +// CHECK64: (ptr noundef %this, ptr dead_on_return noundef %x) // CHECK64: getelementptr i8, ptr %{{.*}}, i32 -8 // CHECK64: call void @"?foo@C@stdcall_thunk@@UEAAXUAgg@2@@Z" -// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr noundef %x) +// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr dead_on_return noundef %x) // CHECK64-NOT: call // CHECK64: ret void } @@ -86,10 +86,10 @@ C::C() {} // force emission // CHECK32-NEXT: ret ptr %[[rv]] // CHECK64-LABEL: define linkonce_odr dso_local void @"?foo@C@sret_thunk@@W7EAA?AUAgg@2@U32@@Z" -// CHECK64: (ptr noundef %this, ptr dead_on_unwind noalias writable sret(%"struct.sret_thunk::Agg") align 4 %agg.result, ptr noundef %x) +// CHECK64: (ptr noundef %this, ptr dead_on_unwind noalias writable sret(%"struct.sret_thunk::Agg") align 4 %agg.result, ptr dead_on_return noundef %x) // CHECK64: getelementptr i8, ptr %{{.*}}, i32 -8 // CHECK64: call void @"?foo@C@sret_thunk@@UEAA?AUAgg@2@U32@@Z" -// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr dead_on_unwind writable sret(%"struct.sret_thunk::Agg") align 4 %agg.result, ptr noundef %x) +// CHECK64: (ptr {{[^,]*}} %{{.*}}, ptr dead_on_unwind writable sret(%"struct.sret_thunk::Agg") align 4 %agg.result, ptr dead_on_return noundef %x) // CHECK64-NOT: call // CHECK64: ret void } diff --git a/clang/test/CodeGenCXX/microsoft-abi-member-pointers.cpp b/clang/test/CodeGenCXX/microsoft-abi-member-pointers.cpp index 806bc5b63ef02..8defb68c668b2 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-member-pointers.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-member-pointers.cpp @@ -428,7 +428,7 @@ bool nullTestDataUnspecified(int Unspecified::*mp) { // Pass this large type indirectly. 
// X64-LABEL: define dso_local noundef zeroext i1 @"?nullTestDataUnspecified@@ -// X64: (ptr noundef %0) +// X64: (ptr dead_on_return noundef %0) } bool nullTestFunctionUnspecified(void (Unspecified::*mp)()) { @@ -590,7 +590,7 @@ bool unspecFuncMemptrEq(void (Unspecified::*l)(), void (Unspecified::*r)()) { // CHECK: } // X64-LABEL: define dso_local noundef zeroext i1 @"?unspecFuncMemptrEq@@ -// X64: (ptr noundef %0, ptr noundef %1) +// X64: (ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) } bool unspecFuncMemptrNeq(void (Unspecified::*l)(), void (Unspecified::*r)()) { @@ -635,7 +635,7 @@ bool unspecDataMemptrEq(int Unspecified::*l, int Unspecified::*r) { // CHECK: } // X64-LABEL: define dso_local noundef zeroext i1 @"?unspecDataMemptrEq@@ -// X64: (ptr noundef %0, ptr noundef %1) +// X64: (ptr dead_on_return noundef %0, ptr dead_on_return noundef %1) } void (Multiple::*convertB2FuncToMultiple(void (B2::*mp)()))() { diff --git a/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp b/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp index 6e8ba3953b2cf..767bf168633ae 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp @@ -175,7 +175,7 @@ void multibyte_arg(Multibyte s) {} void packed_arg(Packed s) {} // LINUX-LABEL: define{{.*}} void @_Z10packed_arg6Packed(ptr noundef byval(%struct.Packed) align 4 %s) // WIN32: define dso_local void @"?packed_arg@@YAXUPacked@@@Z"(ptr noundef byval(%struct.Packed) align 4 %s) -// WIN64: define dso_local void @"?packed_arg@@YAXUPacked@@@Z"(ptr noundef %s) +// WIN64: define dso_local void @"?packed_arg@@YAXUPacked@@@Z"(ptr dead_on_return noundef %s) // Test that dtors are invoked in the callee. void small_arg_with_dtor(SmallWithDtor s) {} @@ -190,7 +190,7 @@ void small_arg_with_dtor(SmallWithDtor s) {} // WOA64: } // FIXME: MSVC incompatible! 
-// WOA: define dso_local arm_aapcs_vfpcc void @"?small_arg_with_dtor@@YAXUSmallWithDtor@@@Z"(ptr noundef %s) {{.*}} { +// WOA: define dso_local arm_aapcs_vfpcc void @"?small_arg_with_dtor@@YAXUSmallWithDtor@@@Z"(ptr dead_on_return noundef %s) {{.*}} { // WOA: call arm_aapcs_vfpcc void @"??1SmallWithDtor@@QAA@XZ"(ptr {{[^,]*}} %s) // WOA: } @@ -220,7 +220,7 @@ void ref_small_arg_with_dtor(const SmallWithDtor &s) { } // WIN64-LABEL: define dso_local void @"?ref_small_arg_with_dtor@@YAXAEBUSmallWithDtor@@@Z"(ptr noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %s) void big_arg_with_dtor(BigWithDtor s) {} -// WIN64-LABEL: define dso_local void @"?big_arg_with_dtor@@YAXUBigWithDtor@@@Z"(ptr noundef %s) +// WIN64-LABEL: define dso_local void @"?big_arg_with_dtor@@YAXUBigWithDtor@@@Z"(ptr dead_on_return noundef %s) // WIN64: call void @"??1BigWithDtor@@QEAA@XZ" // WIN64: } @@ -231,7 +231,7 @@ void call_big_arg_with_dtor() { // larger than 8 bytes and is passed indirectly. // WIN64-LABEL: define dso_local void @"?call_big_arg_with_dtor@@YAXXZ"() // WIN64: call noundef ptr @"??0BigWithDtor@@QEAA@XZ" -// WIN64: call void @"?big_arg_with_dtor@@YAXUBigWithDtor@@@Z"(ptr noundef %{{.*}}) +// WIN64: call void @"?big_arg_with_dtor@@YAXUBigWithDtor@@@Z"(ptr dead_on_return noundef %{{.*}}) // WIN64-NOT: call void @"??1BigWithDtor@@QEAA@XZ" // WIN64: ret void @@ -259,22 +259,22 @@ void eh_cleanup_arg_with_dtor() { // WIN32: } void small_arg_with_vftable(SmallWithVftable s) {} -// LINUX-LABEL: define{{.*}} void @_Z22small_arg_with_vftable16SmallWithVftable(ptr noundef %s) +// LINUX-LABEL: define{{.*}} void @_Z22small_arg_with_vftable16SmallWithVftable(ptr dead_on_return noundef %s) // WIN32: define dso_local void @"?small_arg_with_vftable@@YAXUSmallWithVftable@@@Z"(ptr inalloca(<{ %struct.SmallWithVftable }>) %0) -// WIN64: define dso_local void @"?small_arg_with_vftable@@YAXUSmallWithVftable@@@Z"(ptr noundef %s) -// WOA64: define dso_local void 
@"?small_arg_with_vftable@@YAXUSmallWithVftable@@@Z"(ptr noundef %s) +// WIN64: define dso_local void @"?small_arg_with_vftable@@YAXUSmallWithVftable@@@Z"(ptr dead_on_return noundef %s) +// WOA64: define dso_local void @"?small_arg_with_vftable@@YAXUSmallWithVftable@@@Z"(ptr dead_on_return noundef %s) void medium_arg_with_copy_ctor(MediumWithCopyCtor s) {} -// LINUX-LABEL: define{{.*}} void @_Z25medium_arg_with_copy_ctor18MediumWithCopyCtor(ptr noundef %s) +// LINUX-LABEL: define{{.*}} void @_Z25medium_arg_with_copy_ctor18MediumWithCopyCtor(ptr dead_on_return noundef %s) // WIN32: define dso_local void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr inalloca(<{ %struct.MediumWithCopyCtor }>) %0) -// WIN64: define dso_local void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr noundef %s) -// WOA: define dso_local arm_aapcs_vfpcc void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr noundef %s) -// WOA64: define dso_local void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr noundef %s) +// WIN64: define dso_local void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr dead_on_return noundef %s) +// WOA: define dso_local arm_aapcs_vfpcc void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr dead_on_return noundef %s) +// WOA64: define dso_local void @"?medium_arg_with_copy_ctor@@YAXUMediumWithCopyCtor@@@Z"(ptr dead_on_return noundef %s) void big_arg(Big s) {} // LINUX-LABEL: define{{.*}} void @_Z7big_arg3Big(ptr noundef byval(%struct.Big) align 4 %s) // WIN32: define dso_local void @"?big_arg@@YAXUBig@@@Z"(ptr noundef byval(%struct.Big) align 4 %s) -// WIN64: define dso_local void @"?big_arg@@YAXUBig@@@Z"(ptr noundef %s) +// WIN64: define dso_local void @"?big_arg@@YAXUBig@@@Z"(ptr dead_on_return noundef %s) // PR27607: We would attempt to load i32 value out of the reference instead of // just loading the pointer from the struct during argument expansion. 
@@ -346,7 +346,7 @@ class Class { void thiscall_method_arg(Big s) {} // LINUX: define {{.*}} void @_ZN5Class19thiscall_method_argE3Big(ptr {{[^,]*}} %this, ptr noundef byval(%struct.Big) align 4 %s) // WIN32: define {{.*}} void @"?thiscall_method_arg@Class@@QAEXUBig@@@Z"(ptr {{[^,]*}} %this, ptr noundef byval(%struct.Big) align 4 %s) - // WIN64: define linkonce_odr dso_local void @"?thiscall_method_arg@Class@@QEAAXUBig@@@Z"(ptr {{[^,]*}} %this, ptr noundef %s) + // WIN64: define linkonce_odr dso_local void @"?thiscall_method_arg@Class@@QEAAXUBig@@@Z"(ptr {{[^,]*}} %this, ptr dead_on_return noundef %s) }; void use_class() { diff --git a/clang/test/CodeGenCXX/microsoft-abi-unknown-arch.cpp b/clang/test/CodeGenCXX/microsoft-abi-unknown-arch.cpp index 9e37e71e257fd..b7653632cf882 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-unknown-arch.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-unknown-arch.cpp @@ -18,4 +18,4 @@ A B::foo(A x) { return x; } -// CHECK-LABEL: define{{.*}} void @"?foo@B@@QEAA?AUA@@U2@@Z"(ptr {{[^,]*}} %this, ptr dead_on_unwind noalias writable sret(%struct.A) align 4 %agg.result, ptr noundef %x) +// CHECK-LABEL: define{{.*}} void @"?foo@B@@QEAA?AUA@@U2@@Z"(ptr {{[^,]*}} %this, ptr dead_on_unwind noalias writable sret(%struct.A) align 4 %agg.result, ptr dead_on_return noundef %x) diff --git a/clang/test/CodeGenCXX/ms-property.cpp b/clang/test/CodeGenCXX/ms-property.cpp index 744de224b2f9a..d9fbf46dacb25 100644 --- a/clang/test/CodeGenCXX/ms-property.cpp +++ b/clang/test/CodeGenCXX/ms-property.cpp @@ -105,7 +105,7 @@ int main(int argc, char **argv) { // CHECK: [[ARGC:%.+]] = load i32, ptr % // CHECK: [[P1_X_ARGC_0:%.+]] = call noundef i32 @"?GetX@S@@QEAAHHH@Z"(ptr {{[^,]*}} [[P1]], i32 noundef [[ARGC]], i32 noundef 0) // CHECK: [[CAST:%.+]] = trunc i32 [[P1_X_ARGC_0]] to i8 - // CHECK: [[P2_Y_p1_X_ARGC_0_T:%.+]] = call noundef i8 @"?GetY@?$St@M@@QEAADDVTest1@@@Z"(ptr {{[^,]*}} [[P2_2]], i8 noundef [[CAST]], ptr noundef %{{.+}}) + // CHECK: 
[[P2_Y_p1_X_ARGC_0_T:%.+]] = call noundef i8 @"?GetY@?$St@M@@QEAADDVTest1@@@Z"(ptr {{[^,]*}} [[P2_2]], i8 noundef [[CAST]], ptr dead_on_return noundef %{{.+}}) // CHECK: [[CAST:%.+]] = sitofp i8 [[P2_Y_p1_X_ARGC_0_T]] to float // CHECK: [[J:%.+]] = load i32, ptr % // CHECK: [[CAST1:%.+]] = sitofp i32 [[J]] to float @@ -124,6 +124,6 @@ int main(int argc, char **argv) { // CHECK: call noundef i32 @"?GetX@?$St@H@@QEAAHHH@Z"(ptr {{[^,]*}} [[BAR]], i32 noundef %{{.+}} i32 noundef %{{.+}}) // CHECK: call void @"?PutY@?$St@H@@QEAAXDHN@Z"(ptr {{[^,]*}} [[BAR]], i8 noundef %{{.+}}, i32 noundef %{{.+}}, double noundef %{{.+}} // CHECK: call noundef i32 @"?GetX@?$St@H@@QEAAHHH@Z"(ptr {{[^,]*}} [[BAR]], i32 noundef %{{.+}} i32 noundef %{{.+}}) -// CHECK: call noundef i8 @"?GetY@?$St@H@@QEAADDVTest1@@@Z"(ptr {{[^,]*}} [[BAR]], i8 noundef %{{.+}}, ptr noundef %{{.+}}) +// CHECK: call noundef i8 @"?GetY@?$St@H@@QEAADDVTest1@@@Z"(ptr {{[^,]*}} [[BAR]], i8 noundef %{{.+}}, ptr dead_on_return noundef %{{.+}}) // CHECK: call noundef i32 @"?PutX@?$St@H@@QEAAHHHH@Z"(ptr {{[^,]*}} [[BAR]], i32 noundef %{{.+}}, i32 noundef %{{.+}}, i32 noundef %{{.+}}) #endif //HEADER diff --git a/clang/test/CodeGenCXX/nrvo.cpp b/clang/test/CodeGenCXX/nrvo.cpp index 1141bc35de582..5b0fc914120e3 100644 --- a/clang/test/CodeGenCXX/nrvo.cpp +++ b/clang/test/CodeGenCXX/nrvo.cpp @@ -2197,7 +2197,7 @@ void test16() { // http://wg21.link/p2025r2#ex-9 // CHECK-EH-11-NEXT: br i1 [[CMP9]], label [[IF_THEN10:%.*]], label [[IF_END11:%.*]] // CHECK-EH-11: if.then10: // CHECK-EH-11-NEXT: store i32 3, ptr [[CLEANUP_DEST_SLOT]], align 4 -// CHECK-EH-11-NEXT: br label [[CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK-EH-11-NEXT: br label [[CLEANUP]], !llvm.loop [[LOOP4:![0-9]+]] // CHECK-EH-11: if.end11: // CHECK-EH-11-NEXT: [[TMP4:%.*]] = load i32, ptr [[I_ADDR]], align 4 // CHECK-EH-11-NEXT: [[CMP12:%.*]] = icmp eq i32 [[TMP4]], 3 @@ -2239,7 +2239,7 @@ void test16() { // http://wg21.link/p2025r2#ex-9 // 
CHECK-EH-11-NEXT: i32 2, label [[IMPOSSIBLE]] // CHECK-EH-11-NEXT: ] // CHECK-EH-11: cleanup.cont: -// CHECK-EH-11-NEXT: br label [[WHILE_BODY]], !llvm.loop [[LOOP3]] +// CHECK-EH-11-NEXT: br label [[WHILE_BODY]], !llvm.loop [[LOOP4]] // CHECK-EH-11: while.end: // CHECK-EH-11-NEXT: call void @_ZN1XC1Ev(ptr noundef nonnull align 1 dereferenceable(1) [[AGG_RESULT]]) // CHECK-EH-11-NEXT: br label [[RETURN]] diff --git a/clang/test/CodeGenCXX/pass-by-value-noalias.cpp b/clang/test/CodeGenCXX/pass-by-value-noalias.cpp index 773cf6b81c3b2..947379d9b8b92 100644 --- a/clang/test/CodeGenCXX/pass-by-value-noalias.cpp +++ b/clang/test/CodeGenCXX/pass-by-value-noalias.cpp @@ -14,8 +14,8 @@ struct Foo { // Make sure noalias is added to indirect arguments with trivially copyable types // if -fpass-by-value-is-noalias is provided. -// WITH_NOALIAS: define{{.*}} void @_Z4take3Foo(ptr noalias noundef %arg) -// NO_NOALIAS: define{{.*}} void @_Z4take3Foo(ptr noundef %arg) +// WITH_NOALIAS: define{{.*}} void @_Z4take3Foo(ptr dead_on_return noalias noundef %arg) +// NO_NOALIAS: define{{.*}} void @_Z4take3Foo(ptr dead_on_return noundef %arg) void take(Foo arg) {} int G; @@ -38,8 +38,8 @@ struct NonTrivial { // Make sure noalias is not added to indirect arguments that are not trivially // copyable even if -fpass-by-value-is-noalias is provided. -// WITH_NOALIAS: define{{.*}} void @_Z4take10NonTrivial(ptr noundef %arg) -// NO_NOALIAS: define{{.*}} void @_Z4take10NonTrivial(ptr noundef %arg) +// WITH_NOALIAS: define{{.*}} void @_Z4take10NonTrivial(ptr dead_on_return noundef %arg) +// NO_NOALIAS: define{{.*}} void @_Z4take10NonTrivial(ptr dead_on_return noundef %arg) void take(NonTrivial arg) {} // Escape examples. Pointers to the objects passed to take() may escape, depending on whether a temporary copy is created or not (e.g. due to NRVO). 
@@ -54,8 +54,8 @@ struct A { }; A *p; -// WITH_NOALIAS: define{{.*}} void @_Z4take1A(ptr noalias noundef %arg) -// NO_NOALIAS: define{{.*}} void @_Z4take1A(ptr noundef %arg) +// WITH_NOALIAS: define{{.*}} void @_Z4take1A(ptr dead_on_return noalias noundef %arg) +// NO_NOALIAS: define{{.*}} void @_Z4take1A(ptr dead_on_return noundef %arg) void take(A arg) {} // WITH_NOALIAS: define{{.*}} void @_Z7CreateAPP1A(ptr dead_on_unwind noalias writable sret(%struct.A) align 1 %agg.result, ptr noundef %where) diff --git a/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp b/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp index daeea77774ec8..0310535362e3d 100644 --- a/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp +++ b/clang/test/CodeGenCXX/ptrauth-qualifier-struct.cpp @@ -24,7 +24,7 @@ struct __attribute__((trivial_abi)) TrivialSA { // Check that TrivialSA is passed indirectly despite being annotated with // 'trivial_abi'. -// CHECK: define {{.*}}void @_Z18testParamTrivialSA9TrivialSA(ptr noundef %{{.*}}) +// CHECK: define {{.*}}void @_Z18testParamTrivialSA9TrivialSA(ptr dead_on_return noundef %{{.*}}) void testParamTrivialSA(TrivialSA a) { } diff --git a/clang/test/CodeGenCXX/regparm.cpp b/clang/test/CodeGenCXX/regparm.cpp index b9735485db8de..a31394016fb55 100644 --- a/clang/test/CodeGenCXX/regparm.cpp +++ b/clang/test/CodeGenCXX/regparm.cpp @@ -11,7 +11,7 @@ struct S1 { }; void __attribute__((regparm(3))) foo2(S1 a, int b); -// CHECK: declare void @_Z4foo22S1i(ptr inreg noundef, i32 inreg noundef) +// CHECK: declare void @_Z4foo22S1i(ptr dead_on_return inreg noundef, i32 inreg noundef) void bar2(S1 a, int b) { foo2(a, b); } diff --git a/clang/test/CodeGenCXX/trivial_abi.cpp b/clang/test/CodeGenCXX/trivial_abi.cpp index b8cc0d1cc6528..eacbde594e517 100644 --- a/clang/test/CodeGenCXX/trivial_abi.cpp +++ b/clang/test/CodeGenCXX/trivial_abi.cpp @@ -140,7 +140,7 @@ void testIgnoredSmall() { testReturnSmall(); } -// CHECK: define{{.*}} void @_Z14testParamLarge5Large(ptr 
noundef %[[A:.*]]) +// CHECK: define{{.*}} void @_Z14testParamLarge5Large(ptr dead_on_return noundef %[[A:.*]]) // CHECK: %[[CALL:.*]] = call noundef ptr @_ZN5LargeD1Ev(ptr {{[^,]*}} %[[A]]) // CHECK: ret void // CHECK: } @@ -163,7 +163,7 @@ Large testReturnLarge() { // CHECK: %[[AGG_TMP:.*]] = alloca %[[STRUCT_LARGE]], align 8 // CHECK: %[[CALL:.*]] = call noundef ptr @_ZN5LargeC1Ev(ptr {{[^,]*}} %[[T]]) // CHECK: %[[CALL1:.*]] = call noundef ptr @_ZN5LargeC1ERKS_(ptr {{[^,]*}} %[[AGG_TMP]], ptr noundef nonnull align 8 dereferenceable(520) %[[T]]) -// CHECK: call void @_Z14testParamLarge5Large(ptr noundef %[[AGG_TMP]]) +// CHECK: call void @_Z14testParamLarge5Large(ptr dead_on_return noundef %[[AGG_TMP]]) // CHECK: %[[CALL2:.*]] = call noundef ptr @_ZN5LargeD1Ev(ptr {{[^,]*}} %[[T]]) // CHECK: ret void // CHECK: } @@ -176,7 +176,7 @@ void testCallLarge0() { // CHECK: define{{.*}} void @_Z14testCallLarge1v() // CHECK: %[[AGG_TMP:.*]] = alloca %[[STRUCT_LARGE:.*]], align 8 // CHECK: call void @_Z15testReturnLargev(ptr dead_on_unwind writable sret(%[[STRUCT_LARGE]]) align 8 %[[AGG_TMP]]) -// CHECK: call void @_Z14testParamLarge5Large(ptr noundef %[[AGG_TMP]]) +// CHECK: call void @_Z14testParamLarge5Large(ptr dead_on_return noundef %[[AGG_TMP]]) // CHECK: ret void // CHECK: } @@ -244,7 +244,7 @@ void testExceptionSmall() { // CHECK: call noundef ptr @_ZN5LargeC1Ev(ptr {{[^,]*}} %[[AGG_TMP]]) // CHECK: invoke noundef ptr @_ZN5LargeC1Ev(ptr {{[^,]*}} %[[AGG_TMP1]]) -// CHECK: call void @_Z20calleeExceptionLarge5LargeS_(ptr noundef %[[AGG_TMP]], ptr noundef %[[AGG_TMP1]]) +// CHECK: call void @_Z20calleeExceptionLarge5LargeS_(ptr dead_on_return noundef %[[AGG_TMP]], ptr dead_on_return noundef %[[AGG_TMP1]]) // CHECK-NEXT: ret void // CHECK: landingpad { ptr, i32 } diff --git a/clang/test/CodeGenCXX/uncopyable-args.cpp b/clang/test/CodeGenCXX/uncopyable-args.cpp index 31192b65cc362..2d09732f9d3c2 100644 --- a/clang/test/CodeGenCXX/uncopyable-args.cpp +++ 
b/clang/test/CodeGenCXX/uncopyable-args.cpp @@ -59,12 +59,12 @@ void bar() { // CHECK-LABEL: define{{.*}} void @_ZN9move_ctor3barEv() // CHECK: call void @_Z{{.*}}C1Ev( // CHECK-NOT: call -// NEWABI: call void @_ZN9move_ctor3fooENS_1AE(ptr noundef %{{.*}}) +// NEWABI: call void @_ZN9move_ctor3fooENS_1AE(ptr dead_on_return noundef %{{.*}}) // OLDABI: call void @_ZN9move_ctor3fooENS_1AE(ptr %{{.*}}) -// NEWABI-LABEL: declare void @_ZN9move_ctor3fooENS_1AE(ptr noundef) +// NEWABI-LABEL: declare void @_ZN9move_ctor3fooENS_1AE(ptr dead_on_return noundef) // OLDABI-LABEL: declare void @_ZN9move_ctor3fooENS_1AE(ptr) -// WIN64-LABEL: declare dso_local void @"?foo@move_ctor@@YAXUA@1@@Z"(ptr noundef) +// WIN64-LABEL: declare dso_local void @"?foo@move_ctor@@YAXUA@1@@Z"(ptr dead_on_return noundef) } namespace all_deleted { @@ -81,12 +81,12 @@ void bar() { // CHECK-LABEL: define{{.*}} void @_ZN11all_deleted3barEv() // CHECK: call void @_Z{{.*}}C1Ev( // CHECK-NOT: call -// NEWABI: call void @_ZN11all_deleted3fooENS_1AE(ptr noundef %{{.*}}) +// NEWABI: call void @_ZN11all_deleted3fooENS_1AE(ptr dead_on_return noundef %{{.*}}) // OLDABI: call void @_ZN11all_deleted3fooENS_1AE(ptr %{{.*}}) -// NEWABI-LABEL: declare void @_ZN11all_deleted3fooENS_1AE(ptr noundef) +// NEWABI-LABEL: declare void @_ZN11all_deleted3fooENS_1AE(ptr dead_on_return noundef) // OLDABI-LABEL: declare void @_ZN11all_deleted3fooENS_1AE(ptr) -// WIN64-LABEL: declare dso_local void @"?foo@all_deleted@@YAXUA@1@@Z"(ptr noundef) +// WIN64-LABEL: declare dso_local void @"?foo@all_deleted@@YAXUA@1@@Z"(ptr dead_on_return noundef) } namespace implicitly_deleted { @@ -102,14 +102,14 @@ void bar() { // CHECK-LABEL: define{{.*}} void @_ZN18implicitly_deleted3barEv() // CHECK: call void @_Z{{.*}}C1Ev( // CHECK-NOT: call -// NEWABI: call void @_ZN18implicitly_deleted3fooENS_1AE(ptr noundef %{{.*}}) +// NEWABI: call void @_ZN18implicitly_deleted3fooENS_1AE(ptr dead_on_return noundef %{{.*}}) // OLDABI: call void 
@_ZN18implicitly_deleted3fooENS_1AE(ptr %{{.*}}) -// NEWABI-LABEL: declare void @_ZN18implicitly_deleted3fooENS_1AE(ptr noundef) +// NEWABI-LABEL: declare void @_ZN18implicitly_deleted3fooENS_1AE(ptr dead_on_return noundef) // OLDABI-LABEL: declare void @_ZN18implicitly_deleted3fooENS_1AE(ptr) // In MSVC 2013, the copy ctor is not deleted by a move assignment. In MSVC 2015, it is. // WIN64-18-LABEL: declare dso_local void @"?foo@implicitly_deleted@@YAXUA@1@@Z"(i64 -// WIN64-19-LABEL: declare dso_local void @"?foo@implicitly_deleted@@YAXUA@1@@Z"(ptr noundef) +// WIN64-19-LABEL: declare dso_local void @"?foo@implicitly_deleted@@YAXUA@1@@Z"(ptr dead_on_return noundef) } namespace one_deleted { @@ -125,12 +125,12 @@ void bar() { // CHECK-LABEL: define{{.*}} void @_ZN11one_deleted3barEv() // CHECK: call void @_Z{{.*}}C1Ev( // CHECK-NOT: call -// NEWABI: call void @_ZN11one_deleted3fooENS_1AE(ptr noundef %{{.*}}) +// NEWABI: call void @_ZN11one_deleted3fooENS_1AE(ptr dead_on_return noundef %{{.*}}) // OLDABI: call void @_ZN11one_deleted3fooENS_1AE(ptr %{{.*}}) -// NEWABI-LABEL: declare void @_ZN11one_deleted3fooENS_1AE(ptr noundef) +// NEWABI-LABEL: declare void @_ZN11one_deleted3fooENS_1AE(ptr dead_on_return noundef) // OLDABI-LABEL: declare void @_ZN11one_deleted3fooENS_1AE(ptr) -// WIN64-LABEL: declare dso_local void @"?foo@one_deleted@@YAXUA@1@@Z"(ptr noundef) +// WIN64-LABEL: declare dso_local void @"?foo@one_deleted@@YAXUA@1@@Z"(ptr dead_on_return noundef) } namespace copy_defaulted { @@ -170,7 +170,7 @@ void bar() { // CHECK: call void @_ZN14move_defaulted3fooENS_1AE(ptr %{{.*}}) // CHECK-LABEL: declare void @_ZN14move_defaulted3fooENS_1AE(ptr) -// WIN64-LABEL: declare dso_local void @"?foo@move_defaulted@@YAXUA@1@@Z"(ptr noundef) +// WIN64-LABEL: declare dso_local void @"?foo@move_defaulted@@YAXUA@1@@Z"(ptr dead_on_return noundef) } namespace trivial_defaulted { @@ -207,12 +207,12 @@ void bar() { } // CHECK-LABEL: define{{.*}} void @_ZN14two_copy_ctors3barEv() // 
CHECK: call void @_Z{{.*}}C1Ev( -// NEWABI: call void @_ZN14two_copy_ctors3fooENS_1BE(ptr noundef %{{.*}}) +// NEWABI: call void @_ZN14two_copy_ctors3fooENS_1BE(ptr dead_on_return noundef %{{.*}}) // OLDABI: call void @_ZN14two_copy_ctors3fooENS_1BE(ptr noundef byval -// NEWABI-LABEL: declare void @_ZN14two_copy_ctors3fooENS_1BE(ptr noundef) +// NEWABI-LABEL: declare void @_ZN14two_copy_ctors3fooENS_1BE(ptr dead_on_return noundef) // OLDABI-LABEL: declare void @_ZN14two_copy_ctors3fooENS_1BE(ptr noundef byval -// WIN64-LABEL: declare dso_local void @"?foo@two_copy_ctors@@YAXUB@1@@Z"(ptr noundef) +// WIN64-LABEL: declare dso_local void @"?foo@two_copy_ctors@@YAXUB@1@@Z"(ptr dead_on_return noundef) } namespace definition_only { diff --git a/clang/test/CodeGenCXX/wasm-args-returns.cpp b/clang/test/CodeGenCXX/wasm-args-returns.cpp index fbb152ac1bb3d..7b1c27bf0808d 100644 --- a/clang/test/CodeGenCXX/wasm-args-returns.cpp +++ b/clang/test/CodeGenCXX/wasm-args-returns.cpp @@ -46,17 +46,17 @@ struct copy_ctor { copy_ctor(copy_ctor const &); }; test(copy_ctor); -// CHECK: define void @_Z7forward9copy_ctor(ptr dead_on_unwind noalias {{[^,]*}} sret(%struct.copy_ctor) align 8 %{{.*}}, ptr nonnull %{{.*}}) +// CHECK: define void @_Z7forward9copy_ctor(ptr dead_on_unwind noalias {{[^,]*}} sret(%struct.copy_ctor) align 8 %{{.*}}, ptr dead_on_return nonnull %{{.*}}) // // CHECK: declare ptr @_ZN9copy_ctorC1ERKS_(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align 8 dereferenceable(8)) // // CHECK: define void @_Z14test_copy_ctorv() // CHECK: %[[tmp:.*]] = alloca %struct.copy_ctor, align 8 // CHECK: call void @_Z13def_copy_ctorv(ptr dead_on_unwind nonnull writable sret(%struct.copy_ctor) align 8 %[[tmp]]) -// CHECK: call void @_Z3use9copy_ctor(ptr nonnull %[[tmp]]) +// CHECK: call void @_Z3use9copy_ctor(ptr dead_on_return nonnull %[[tmp]]) // CHECK: ret void // -// CHECK: declare void @_Z3use9copy_ctor(ptr) +// CHECK: declare void @_Z3use9copy_ctor(ptr dead_on_return) // CHECK: 
declare void @_Z13def_copy_ctorv(ptr dead_on_unwind writable sret(%struct.copy_ctor) align 8) struct __attribute__((aligned(16))) aligned_copy_ctor { @@ -64,17 +64,17 @@ struct __attribute__((aligned(16))) aligned_copy_ctor { aligned_copy_ctor(aligned_copy_ctor const &); }; test(aligned_copy_ctor); -// CHECK: define void @_Z7forward17aligned_copy_ctor(ptr dead_on_unwind noalias {{[^,]*}} sret(%struct.aligned_copy_ctor) align 16 %{{.*}}, ptr nonnull %{{.*}}) +// CHECK: define void @_Z7forward17aligned_copy_ctor(ptr dead_on_unwind noalias {{[^,]*}} sret(%struct.aligned_copy_ctor) align 16 %{{.*}}, ptr dead_on_return nonnull %{{.*}}) // // CHECK: declare ptr @_ZN17aligned_copy_ctorC1ERKS_(ptr {{[^,]*}} returned {{[^,]*}}, ptr nonnull align 16 dereferenceable(16)) // // CHECK: define void @_Z22test_aligned_copy_ctorv() // CHECK: %[[tmp:.*]] = alloca %struct.aligned_copy_ctor, align 16 // CHECK: call void @_Z21def_aligned_copy_ctorv(ptr dead_on_unwind nonnull writable sret(%struct.aligned_copy_ctor) align 16 %[[tmp]]) -// CHECK: call void @_Z3use17aligned_copy_ctor(ptr nonnull %[[tmp]]) +// CHECK: call void @_Z3use17aligned_copy_ctor(ptr dead_on_return nonnull %[[tmp]]) // CHECK: ret void // -// CHECK: declare void @_Z3use17aligned_copy_ctor(ptr) +// CHECK: declare void @_Z3use17aligned_copy_ctor(ptr dead_on_return) // CHECK: declare void @_Z21def_aligned_copy_ctorv(ptr dead_on_unwind writable sret(%struct.aligned_copy_ctor) align 16) struct empty {}; diff --git a/clang/test/CodeGenCXX/windows-x86-swiftcall.cpp b/clang/test/CodeGenCXX/windows-x86-swiftcall.cpp index 9927d0b24799e..78c45753b5bbd 100644 --- a/clang/test/CodeGenCXX/windows-x86-swiftcall.cpp +++ b/clang/test/CodeGenCXX/windows-x86-swiftcall.cpp @@ -18,7 +18,7 @@ struct NonTrivial { SWIFTCALL int receiveNonTrivial(NonTrivial o) { return o.o; } -// CHECK-LABEL: define dso_local swiftcc noundef i32 @"?receiveNonTrivial@@YSHUNonTrivial@@@Z"(ptr noundef %o) +// CHECK-LABEL: define dso_local swiftcc noundef i32 
@"?receiveNonTrivial@@YSHUNonTrivial@@@Z"(ptr dead_on_return noundef %o) int passNonTrivial() { return receiveNonTrivial({}); @@ -26,4 +26,4 @@ int passNonTrivial() { // CHECK-LABEL: define dso_local noundef i32 @"?passNonTrivial@@YAHXZ"() // CHECK-NOT: stacksave -// CHECK: call swiftcc noundef i32 @"?receiveNonTrivial@@YSHUNonTrivial@@@Z"(ptr noundef %{{.*}}) +// CHECK: call swiftcc noundef i32 @"?receiveNonTrivial@@YSHUNonTrivial@@@Z"(ptr dead_on_return noundef %{{.*}}) diff --git a/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m b/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m index d2a954ae26a04..ba8a04b52716e 100644 --- a/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m +++ b/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m @@ -42,7 +42,7 @@ void testStrongException(void) { // CHECK: call void @genWeak(ptr dead_on_unwind writable sret(%[[STRUCT_WEAK]]) align 8 %[[AGG_TMP]]) // CHECK: invoke void @genWeak(ptr dead_on_unwind writable sret(%[[STRUCT_WEAK]]) align 8 %[[AGG_TMP1]]) -// CHECK: call void @calleeWeak(ptr noundef %[[AGG_TMP]], ptr noundef %[[AGG_TMP1]]) +// CHECK: call void @calleeWeak(ptr dead_on_return noundef %[[AGG_TMP]], ptr dead_on_return noundef %[[AGG_TMP1]]) // CHECK: ret void // CHECK: landingpad { ptr, i32 } diff --git a/clang/test/CodeGenObjC/pass-by-value-noalias.m b/clang/test/CodeGenObjC/pass-by-value-noalias.m index ed94d4c80b525..86a4ba36a1b8f 100644 --- a/clang/test/CodeGenObjC/pass-by-value-noalias.m +++ b/clang/test/CodeGenObjC/pass-by-value-noalias.m @@ -17,6 +17,6 @@ @interface Bar Bar *__weak f; }; -// WITH_NOALIAS: define{{.*}} void @take(ptr noundef %arg) -// NO_NOALIAS: define{{.*}} void @take(ptr noundef %arg) +// WITH_NOALIAS: define{{.*}} void @take(ptr dead_on_return noundef %arg) +// NO_NOALIAS: define{{.*}} void @take(ptr dead_on_return noundef %arg) void take(struct Foo arg) {} diff --git a/clang/test/CodeGenObjC/weak-in-c-struct.m b/clang/test/CodeGenObjC/weak-in-c-struct.m index 
be80edd1ff11d..6809360d03da1 100644 --- a/clang/test/CodeGenObjC/weak-in-c-struct.m +++ b/clang/test/CodeGenObjC/weak-in-c-struct.m @@ -130,7 +130,7 @@ void test_move_assignment_Weak(Weak *p) { *p = getWeak(); } -// COMMON: define{{.*}} void @test_parameter_Weak(ptr noundef %[[A:.*]]) +// COMMON: define{{.*}} void @test_parameter_Weak(ptr dead_on_return noundef %[[A:.*]]) // COMMON: call void @__destructor_{{.*}}(ptr %[[A]]) void test_parameter_Weak(Weak a) { @@ -142,7 +142,7 @@ void test_parameter_Weak(Weak a) { // COMMON: store ptr %[[A]], ptr %[[A_ADDR]] // COMMON: %[[V0:.*]] = load ptr, ptr %[[A_ADDR]] // COMMON: call void @__copy_constructor_{{.*}}(ptr %[[AGG_TMP]], ptr %[[V0]]) -// COMMON: call void @calleeWeak(ptr noundef %[[AGG_TMP]]) +// COMMON: call void @calleeWeak(ptr dead_on_return noundef %[[AGG_TMP]]) // COMMON-NEXT: ret void test_argument_Weak(Weak *a) { @@ -164,7 +164,7 @@ Weak test_return_Weak(Weak *a) { // COMMON: %[[AGG_TMP:.*]] = alloca %[[STRUCT_WEAK]] // COMMON: br i1 -// COMMON: call void @objc_msgSend({{.*}}, ptr noundef %[[AGG_TMP]]) +// COMMON: call void @objc_msgSend({{.*}}, ptr dead_on_return noundef %[[AGG_TMP]]) // COMMON: br // COMMON: call void @__destructor_{{.*}}(ptr %[[AGG_TMP]]) diff --git a/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm b/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm index 4d76796d86d1f..3a043c4892981 100644 --- a/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm +++ b/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm @@ -75,7 +75,7 @@ - (void)passStrongWeak:(StrongWeak)a; - (void)passNonTrivial:(NonTrivial)a; @end -// CHECK: define{{.*}} void @_Z19testParamStrongWeak10StrongWeak(ptr noundef %{{.*}}) +// CHECK: define{{.*}} void @_Z19testParamStrongWeak10StrongWeak(ptr dead_on_return noundef %{{.*}}) // CHECK: call noundef ptr @_ZN10StrongWeakD1Ev( // CHECK-NEXT: ret void @@ -88,7 +88,7 @@ void testParamStrongWeak(StrongWeak a) { // CHECK: store ptr %[[A]], ptr %[[A_ADDR]], align 8 // CHECK: %[[V0:.*]] = load 
ptr, ptr %[[A_ADDR]], align 8 // CHECK: %[[CALL:.*]] = call noundef ptr @_ZN10StrongWeakC1ERKS_(ptr {{[^,]*}} %[[AGG_TMP]], ptr noundef nonnull align 8 dereferenceable(16) %[[V0]]) -// CHECK: call void @_Z19testParamStrongWeak10StrongWeak(ptr noundef %[[AGG_TMP]]) +// CHECK: call void @_Z19testParamStrongWeak10StrongWeak(ptr dead_on_return noundef %[[AGG_TMP]]) // CHECK-NOT: call // CHECK: ret void @@ -107,13 +107,13 @@ StrongWeak testReturnStrongWeak(StrongWeak *a) { return *a; } -// CHECK: define{{.*}} void @_Z27testParamContainsStrongWeak18ContainsStrongWeak(ptr noundef %[[A:.*]]) +// CHECK: define{{.*}} void @_Z27testParamContainsStrongWeak18ContainsStrongWeak(ptr dead_on_return noundef %[[A:.*]]) // CHECK: call noundef ptr @_ZN18ContainsStrongWeakD1Ev(ptr {{[^,]*}} %[[A]]) void testParamContainsStrongWeak(ContainsStrongWeak a) { } -// CHECK: define{{.*}} void @_Z26testParamDerivedStrongWeak17DerivedStrongWeak(ptr noundef %[[A:.*]]) +// CHECK: define{{.*}} void @_Z26testParamDerivedStrongWeak17DerivedStrongWeak(ptr dead_on_return noundef %[[A:.*]]) // CHECK: call noundef ptr @_ZN17DerivedStrongWeakD1Ev(ptr {{[^,]*}} %[[A]]) void testParamDerivedStrongWeak(DerivedStrongWeak a) { @@ -163,7 +163,7 @@ Strong testReturnStrong(Strong *a) { return *a; } -// CHECK: define{{.*}} void @_Z21testParamWeakTemplate1SIU6__weakP11objc_objectE(ptr noundef %{{.*}}) +// CHECK: define{{.*}} void @_Z21testParamWeakTemplate1SIU6__weakP11objc_objectE(ptr dead_on_return noundef %{{.*}}) // CHECK: call noundef ptr @_ZN1SIU6__weakP11objc_objectED1Ev( // CHECK-NEXT: ret void @@ -237,7 +237,7 @@ void test0(C *c) { // CHECK: %[[AGG_TMP:.*]] = alloca %[[STRUCT_STRONGWEAK]], align 8 // CHECK: br i1 -// CHECK: call void @objc_msgSend({{.*}}, ptr noundef %[[AGG_TMP]]) +// CHECK: call void @objc_msgSend({{.*}}, ptr dead_on_return noundef %[[AGG_TMP]]) // CHECK: br // CHECK: %[[CALL1:.*]] = call noundef ptr @_ZN10StrongWeakD1Ev(ptr noundef nonnull align 8 dereferenceable(16) %[[AGG_TMP]]) diff 
--git a/clang/test/CodeGenObjCXX/property-objects.mm b/clang/test/CodeGenObjCXX/property-objects.mm index 7ae20f66177bd..8354794254933 100644 --- a/clang/test/CodeGenObjCXX/property-objects.mm +++ b/clang/test/CodeGenObjCXX/property-objects.mm @@ -60,7 +60,7 @@ - (struct CGRect)extent {return bounds;} // CHECK-LABEL: define{{.*}} i32 @main // CHECK: call void @_ZN1SC1ERKS_(ptr {{[^,]*}} [[AGGTMP:%[a-zA-Z0-9\.]+]], ptr noundef nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) {{%[a-zA-Z0-9\.]+}}) -// CHECK: call void @objc_msgSend(ptr noundef {{%[a-zA-Z0-9\.]+}}, ptr noundef {{%[a-zA-Z0-9\.]+}}, ptr noundef [[AGGTMP]]) +// CHECK: call void @objc_msgSend(ptr noundef {{%[a-zA-Z0-9\.]+}}, ptr noundef {{%[a-zA-Z0-9\.]+}}, ptr dead_on_return noundef [[AGGTMP]]) // CHECK-NEXT: ret i32 0 int main() { I *i; diff --git a/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm b/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm index e5cb71bad47c0..9428940d6da48 100644 --- a/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm +++ b/clang/test/CodeGenObjCXX/ptrauth-struct-cxx-abi.mm @@ -15,7 +15,7 @@ // Check that AddrDiscStrong0 is destructed in the callee. 
-// CHECK: define void @_Z24testParamAddrDiscStrong015AddrDiscStrong0(ptr noundef %[[A:.*]]) +// CHECK: define void @_Z24testParamAddrDiscStrong015AddrDiscStrong0(ptr dead_on_return noundef %[[A:.*]]) // CHECK: call noundef ptr @_ZN15AddrDiscStrong0D1Ev(ptr noundef nonnull align {{[0-9]+}} dereferenceable(16) %[[A]]) // CHECK: ret void diff --git a/clang/test/Headers/stdarg.cpp b/clang/test/Headers/stdarg.cpp index 20bf17caf15f7..bfc3af11a23b6 100644 --- a/clang/test/Headers/stdarg.cpp +++ b/clang/test/Headers/stdarg.cpp @@ -15,8 +15,8 @@ #include -// AARCH64-C: define {{.*}} @f(i32 noundef %n, ptr noundef %list) -// AARCH64-CXX: define {{.*}} @_Z1fiSt9__va_list(i32 noundef %n, ptr noundef %list) +// AARCH64-C: define {{.*}} @f(i32 noundef %n, ptr dead_on_return noundef %list) +// AARCH64-CXX: define {{.*}} @_Z1fiSt9__va_list(i32 noundef %n, ptr dead_on_return noundef %list) // X86_64-C: define {{.*}} @f(i32 noundef %n, ptr noundef %list) // X86_64-CXX: define {{.*}} @_Z1fiP13__va_list_tag(i32 noundef %n, ptr noundef %list) // PPC64-C: define {{.*}} @f(i32 noundef signext %n, ptr noundef %list) diff --git a/clang/test/OpenMP/for_firstprivate_codegen.cpp b/clang/test/OpenMP/for_firstprivate_codegen.cpp index 0255e1e3d4aea..83b5939799642 100644 --- a/clang/test/OpenMP/for_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/for_firstprivate_codegen.cpp @@ -427,7 +427,7 @@ int main() { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) // CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB2]], i32 4, ptr @_Z5tmainIiET_v.omp_outlined, ptr [[T_VAR]], ptr [[VEC]], ptr [[S_ARR]], ptr [[TMP0]]) // CHECK1-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR]], i32 0, i32 0 @@ -469,7 +469,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -543,12 +543,12 @@ int main() { // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 -// CHECK1-NEXT: 
[[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -572,7 +572,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP6]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE6]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done6: -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP8]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR7]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP7]], ptr noundef [[AGG_TMP8]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP8]]) #[[ATTR2]] @@ -611,7 +611,7 @@ int main() { // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC4]], i64 0, i64 [[IDXPROM]] // CHECK1-NEXT: store i32 [[TMP16]], ptr [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP9]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP9]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[TMP19:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM11:%.*]] = sext i32 [[TMP19]] to i64 // CHECK1-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR5]], i64 0, i64 [[IDXPROM11]] @@ -711,7 +711,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // 
CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1044,8 +1044,8 @@ int main() { // CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8 -// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr @g1, align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] +// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr @g1, align 8, !nonnull [[META3]], !align [[META4]] // CHECK4-NEXT: store ptr [[TMP1]], ptr [[TMP]], align 8 // CHECK4-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK4-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -1086,7 +1086,7 @@ int main() { // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK4-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK4-NEXT: store i32 1, ptr [[G]], align 4 -// CHECK4-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP13:%.*]] = load ptr, ptr [[_TMP2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK4-NEXT: store volatile i32 1, ptr [[TMP13]], align 4 // CHECK4-NEXT: store i32 2, ptr [[SIVAR3]], align 4 // CHECK4-NEXT: [[BLOCK_ISA:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[BLOCK]], i32 0, i32 0 @@ -1103,7 +1103,7 @@ int main() { // CHECK4-NEXT: [[TMP14:%.*]] = load 
volatile i32, ptr [[G]], align 4 // CHECK4-NEXT: store volatile i32 [[TMP14]], ptr [[BLOCK_CAPTURED]], align 8 // CHECK4-NEXT: [[BLOCK_CAPTURED5:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[BLOCK]], i32 0, i32 5 -// CHECK4-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP2]], align 8 +// CHECK4-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK4-NEXT: store ptr [[TMP15]], ptr [[BLOCK_CAPTURED5]], align 8 // CHECK4-NEXT: [[BLOCK_CAPTURED6:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[BLOCK]], i32 0, i32 7 // CHECK4-NEXT: [[TMP16:%.*]] = load i32, ptr [[SIVAR3]], align 4 @@ -1137,7 +1137,7 @@ int main() { // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 // CHECK4-NEXT: store i32 2, ptr [[BLOCK_CAPTURE_ADDR]], align 8 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR1:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR1]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK4-NEXT: store i32 2, ptr [[TMP0]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, i32, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 7 // CHECK4-NEXT: store i32 4, ptr [[BLOCK_CAPTURE_ADDR2]], align 4 diff --git a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp index 93e71b9a8312e..441e809dc59e5 100644 --- a/clang/test/OpenMP/parallel_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/parallel_firstprivate_codegen.cpp @@ -350,9 +350,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store ptr [[S_ARR]], 
ptr [[S_ARR_ADDR]], align 4 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META4:![0-9]+]], !align [[META5:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC1]], ptr align 4 [[TMP0]], i32 8, i1 false) // CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2 @@ -524,7 +524,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK1-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK1-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: store ptr [[TMP0]], ptr [[C]], align 4 // CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A3]], ptr [[A2]], align 4 @@ -535,22 +535,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK1-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK1-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// 
CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: store ptr [[TMP1]], ptr [[C7]], align 4 // CHECK1-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK1-NEXT: store ptr [[E9]], ptr [[E]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK1-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4 -// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4 +// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK1-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[C_CASTED]], align 4 -// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4 +// CHECK1-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP9]], ptr [[TMP10]]) // CHECK1-NEXT: ret void // @@ -578,25 +578,25 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store i32 [[C]], ptr [[C_ADDR]], align 4 // CHECK1-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 // CHECK1-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 4 // CHECK1-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4 +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[E3]], ptr align 4 [[TMP2]], i32 16, i1 false) // CHECK1-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 4 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK1-NEXT: store i32 [[INC]], ptr [[TMP3]], align 4 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK1-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP5]], -1 // CHECK1-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[_TMP1]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 // CHECK1-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP7]], 1 // CHECK1-NEXT: store i32 [[DIV]], ptr [[TMP6]], align 4 -// CHECK1-NEXT: 
[[TMP8:%.*]] = load ptr, ptr [[_TMP4]], align 4 +// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP4]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP8]], i32 0, i32 2 // CHECK1-NEXT: store i32 1111, ptr [[ARRAYIDX]], align 4 // CHECK1-NEXT: ret void @@ -656,7 +656,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -738,9 +738,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i32 8, i1 false) 
// CHECK1-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 @@ -840,7 +840,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store i32 0, ptr [[A]], align 4 // CHECK1-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SST]], ptr [[THIS1]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[A3]], ptr [[A2]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A2]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[A_CASTED]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[A_CASTED]], align 4 @@ -862,7 +862,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4 // CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK1-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK1-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 @@ -896,7 +896,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr 
inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -965,7 +965,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK3-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META4:![0-9]+]], !align [[META5:![0-9]+]] // CHECK3-NEXT: store ptr [[TMP0]], ptr [[C]], align 4 // CHECK3-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[A3]], ptr [[A2]], align 4 @@ -976,22 +976,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK3-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK3-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: store ptr [[TMP1]], ptr [[C7]], align 4 // CHECK3-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK3-NEXT: store ptr [[E9]], ptr [[E]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 
// CHECK3-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4 -// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4 +// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK3-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[C_CASTED]], align 4 -// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4 +// CHECK3-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP9]], ptr [[TMP10]]) // CHECK3-NEXT: ret void // @@ -1020,22 +1020,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: store i32 [[C]], ptr [[C_ADDR]], align 4 // CHECK3-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 // CHECK3-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 4 // CHECK3-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4 +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[E3]], ptr align 4 [[TMP2]], i32 16, i1 false) // CHECK3-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 4 // 
CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: store ptr [[TMP5]], ptr [[TMP4]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK3-NEXT: store ptr [[B_ADDR]], ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP1]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4 // CHECK3-NEXT: call void @_ZZN2SSC1ERiENKUlvE_clEv(ptr nonnull align 4 dereferenceable(16) [[REF_TMP]]) // CHECK3-NEXT: ret void @@ -1053,32 +1053,32 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0:%.*]], ptr [[THIS1]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK3-NEXT: store i32 [[INC]], ptr [[TMP3]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 // 
CHECK3-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP7]], -1 // CHECK3-NEXT: store i32 [[DEC]], ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4 +// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 // CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 1 // CHECK3-NEXT: store i32 [[DIV]], ptr [[TMP9]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 1 -// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 4 +// CHECK3-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 // CHECK3-NEXT: store i32 [[TMP13]], ptr [[A_CASTED]], align 4 // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[A_CASTED]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 2 -// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 // CHECK3-NEXT: store i32 [[TMP17]], ptr [[B_CASTED]], align 4 // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[B_CASTED]], align 4 // CHECK3-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 3 -// CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4 +// CHECK3-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 // CHECK3-NEXT: store i32 [[TMP21]], ptr [[C_CASTED]], align 4 // CHECK3-NEXT: [[TMP22:%.*]] = load i32, ptr [[C_CASTED]], align 4 @@ -1106,14 +1106,14 @@ void 
array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 // CHECK3-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK3-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK3-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK3-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP3]], -1 // CHECK3-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 4, !nonnull [[META4]], !align [[META5]] // CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK3-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 1 // CHECK3-NEXT: store i32 [[DIV]], ptr [[TMP4]], align 4 @@ -1252,7 +1252,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK4-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK4-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 4, !nonnull [[META6:![0-9]+]], !align [[META7:![0-9]+]] // CHECK4-NEXT: store ptr [[TMP0]], ptr [[C]], align 4 // CHECK4-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK4-NEXT: store ptr [[A3]], ptr [[A2]], align 4 @@ -1263,22 +1263,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK4-NEXT: store i32 [[BF_CAST]], ptr 
[[B4]], align 4 // CHECK4-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: store ptr [[TMP1]], ptr [[C7]], align 4 // CHECK4-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK4-NEXT: store ptr [[E9]], ptr [[E]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK4-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_CASTED]], align 4 // CHECK4-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK4-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK4-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_CASTED]], align 4 -// CHECK4-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4 +// CHECK4-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK4-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK4-NEXT: [[TMP9:%.*]] = load i32, ptr [[C_CASTED]], align 4 -// CHECK4-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i32 [[TMP4]], i32 [[TMP6]], i32 [[TMP9]], ptr [[TMP10]]) // CHECK4-NEXT: ret void // @@ -1307,11 +1307,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: store i32 [[C]], ptr [[C_ADDR]], align 4 // CHECK4-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 4 // CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 4 // CHECK4-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 4 // CHECK4-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 4 -// CHECK4-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4 +// CHECK4-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[E3]], ptr align 4 [[TMP2]], i32 16, i1 false) // CHECK4-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 4 // CHECK4-NEXT: [[BLOCK_ISA:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[BLOCK]], i32 0, i32 0 @@ -1327,13 +1327,13 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[BLOCK_CAPTURED_THIS_ADDR:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[BLOCK]], i32 0, i32 5 // CHECK4-NEXT: store ptr [[TMP0]], ptr [[BLOCK_CAPTURED_THIS_ADDR]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[BLOCK]], i32 0, i32 6 -// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: store ptr [[TMP3]], ptr [[BLOCK_CAPTURED]], align 4 // CHECK4-NEXT: 
[[BLOCK_CAPTURED5:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[BLOCK]], i32 0, i32 7 // CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK4-NEXT: store i32 [[TMP4]], ptr [[BLOCK_CAPTURED5]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURED6:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[BLOCK]], i32 0, i32 8 -// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: store ptr [[TMP5]], ptr [[BLOCK_CAPTURED6]], align 4 // CHECK4-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], ptr [[BLOCK]], i32 0, i32 3 // CHECK4-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4 @@ -1354,7 +1354,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[BLOCK_CAPTURED_THIS:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 // CHECK4-NEXT: [[THIS:%.*]] = load ptr, ptr [[BLOCK_CAPTURED_THIS]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR]], align 4 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 // CHECK4-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4 @@ -1363,12 +1363,12 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP2]], -1 // CHECK4-NEXT: store i32 [[DEC]], ptr [[BLOCK_CAPTURE_ADDR1]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ 
ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 8 -// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR2]], align 4 +// CHECK4-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR2]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP4]], 1 // CHECK4-NEXT: store i32 [[DIV]], ptr [[TMP3]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR3]], align 4 +// CHECK4-NEXT: [[TMP5:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR3]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 // CHECK4-NEXT: store i32 [[TMP6]], ptr [[A_CASTED]], align 4 // CHECK4-NEXT: [[TMP7:%.*]] = load i32, ptr [[A_CASTED]], align 4 @@ -1377,7 +1377,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: store i32 [[TMP8]], ptr [[B_CASTED]], align 4 // CHECK4-NEXT: [[TMP9:%.*]] = load i32, ptr [[B_CASTED]], align 4 // CHECK4-NEXT: [[BLOCK_CAPTURE_ADDR5:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, i32, ptr }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 8 -// CHECK4-NEXT: [[TMP10:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR5]], align 4 +// CHECK4-NEXT: [[TMP10:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR5]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 // CHECK4-NEXT: store i32 [[TMP11]], ptr [[C_CASTED]], align 4 // CHECK4-NEXT: [[TMP12:%.*]] = load i32, ptr [[C_CASTED]], align 4 @@ -1405,14 +1405,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK4-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], 
align 4 // CHECK4-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK4-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK4-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 // CHECK4-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK4-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP3]], -1 // CHECK4-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 4 +// CHECK4-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 4, !nonnull [[META6]], !align [[META7]] // CHECK4-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK4-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 1 // CHECK4-NEXT: store i32 [[DIV]], ptr [[TMP4]], align 4 @@ -1530,9 +1530,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK9-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC1]], ptr align 4 [[TMP0]], i64 8, i1 false) // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK9-NEXT: [[TMP3:%.*]] = 
getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2 @@ -1704,7 +1704,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK9-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK9-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: store ptr [[TMP0]], ptr [[C]], align 8 // CHECK9-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK9-NEXT: store ptr [[A3]], ptr [[A2]], align 8 @@ -1715,22 +1715,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK9-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK9-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: store ptr [[TMP1]], ptr [[C7]], align 8 // CHECK9-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK9-NEXT: store ptr [[E9]], ptr [[E]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8 // CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK9-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK9-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8 -// CHECK9-NEXT: [[TMP7:%.*]] = 
load ptr, ptr [[C7]], align 8 +// CHECK9-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK9-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK9-NEXT: [[TMP9:%.*]] = load i64, ptr [[C_CASTED]], align 8 -// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8 +// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP9]], ptr [[TMP10]]) // CHECK9-NEXT: ret void // @@ -1758,25 +1758,25 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store i64 [[C]], ptr [[C_ADDR]], align 8 // CHECK9-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 // CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 // CHECK9-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 8 // CHECK9-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[E3]], ptr align 4 [[TMP2]], i64 16, i1 false) // CHECK9-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 8 -// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK9-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK9-NEXT: store i32 [[INC]], ptr [[TMP3]], align 4 
// CHECK9-NEXT: [[TMP5:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK9-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP5]], -1 // CHECK9-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK9-NEXT: [[TMP6:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK9-NEXT: [[TMP6:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 // CHECK9-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP7]], 1 // CHECK9-NEXT: store i32 [[DIV]], ptr [[TMP6]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP4]], align 8 +// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP4]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i32], ptr [[TMP8]], i64 0, i64 2 // CHECK9-NEXT: store i32 1111, ptr [[ARRAYIDX]], align 4 // CHECK9-NEXT: ret void @@ -1836,7 +1836,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK9-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1918,9 +1918,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = 
load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i64 8, i1 false) // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 @@ -2020,7 +2020,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store i32 0, ptr [[A]], align 4 // CHECK9-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SST]], ptr [[THIS1]], i32 0, i32 0 // CHECK9-NEXT: store ptr [[A3]], ptr [[A2]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A2]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK9-NEXT: store i32 [[TMP1]], ptr [[A_CASTED]], align 4 // CHECK9-NEXT: [[TMP2:%.*]] = load i64, ptr [[A_CASTED]], align 8 @@ -2042,7 +2042,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 // CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK9-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK9-NEXT: store i32 [[INC]], ptr [[TMP1]], align 
4 @@ -2076,7 +2076,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK9-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -2145,7 +2145,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK11-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK11-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK11-NEXT: store ptr [[TMP0]], ptr [[C]], align 8 // CHECK11-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK11-NEXT: store ptr [[A3]], ptr [[A2]], align 8 @@ -2156,22 +2156,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK11-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK11-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: store ptr [[TMP1]], ptr [[C7]], align 8 // 
CHECK11-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK11-NEXT: store ptr [[E9]], ptr [[E]], align 8 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8 +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK11-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK11-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8 // CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK11-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK11-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8 -// CHECK11-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8 +// CHECK11-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK11-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK11-NEXT: [[TMP9:%.*]] = load i64, ptr [[C_CASTED]], align 8 -// CHECK11-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8 +// CHECK11-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB1:[0-9]+]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP9]], ptr [[TMP10]]) // CHECK11-NEXT: ret void // @@ -2200,22 +2200,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store i64 [[C]], ptr [[C_ADDR]], align 8 // CHECK11-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 // CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 // CHECK11-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 8 // CHECK11-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 8 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8 +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[E3]], ptr align 4 [[TMP2]], i64 16, i1 false) // CHECK11-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 8 // CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK11-NEXT: store ptr [[TMP0]], ptr [[TMP3]], align 8 // CHECK11-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK11-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: store ptr [[TMP5]], ptr [[TMP4]], align 8 // CHECK11-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK11-NEXT: store ptr [[B_ADDR]], ptr [[TMP6]], align 8 // CHECK11-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// 
CHECK11-NEXT: [[TMP8:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8 // CHECK11-NEXT: call void @_ZZN2SSC1ERiENKUlvE_clEv(ptr nonnull align 8 dereferenceable(32) [[REF_TMP]]) // CHECK11-NEXT: ret void @@ -2233,32 +2233,32 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0:%.*]], ptr [[THIS1]], i32 0, i32 0 // CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // CHECK11-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 1 -// CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CHECK11-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK11-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK11-NEXT: store i32 [[INC]], ptr [[TMP3]], align 4 // CHECK11-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8 +// CHECK11-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP5]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 // CHECK11-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP7]], -1 // CHECK11-NEXT: store i32 [[DEC]], ptr [[TMP6]], align 4 // CHECK11-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8 +// CHECK11-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP8]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 // CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP10]], 1 // CHECK11-NEXT: store i32 [[DIV]], ptr [[TMP9]], align 4 // CHECK11-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], 
i32 0, i32 1 -// CHECK11-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 8 +// CHECK11-NEXT: [[TMP12:%.*]] = load ptr, ptr [[TMP11]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 // CHECK11-NEXT: store i32 [[TMP13]], ptr [[A_CASTED]], align 4 // CHECK11-NEXT: [[TMP14:%.*]] = load i64, ptr [[A_CASTED]], align 8 // CHECK11-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 2 -// CHECK11-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8 +// CHECK11-NEXT: [[TMP16:%.*]] = load ptr, ptr [[TMP15]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 // CHECK11-NEXT: store i32 [[TMP17]], ptr [[B_CASTED]], align 4 // CHECK11-NEXT: [[TMP18:%.*]] = load i64, ptr [[B_CASTED]], align 8 // CHECK11-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[THIS1]], i32 0, i32 3 -// CHECK11-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8 +// CHECK11-NEXT: [[TMP20:%.*]] = load ptr, ptr [[TMP19]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 // CHECK11-NEXT: store i32 [[TMP21]], ptr [[C_CASTED]], align 4 // CHECK11-NEXT: [[TMP22:%.*]] = load i64, ptr [[C_CASTED]], align 8 @@ -2286,14 +2286,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK11-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 // CHECK11-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 8 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK11-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK11-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 // CHECK11-NEXT: [[TMP3:%.*]] = load 
i32, ptr [[B_ADDR]], align 4 // CHECK11-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP3]], -1 // CHECK11-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK11-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK11-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK11-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 1 // CHECK11-NEXT: store i32 [[DIV]], ptr [[TMP4]], align 4 @@ -2432,7 +2432,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[BF_SET:%.*]] = or i8 [[BF_CLEAR]], 0 // CHECK12-NEXT: store i8 [[BF_SET]], ptr [[B]], align 4 // CHECK12-NEXT: [[C:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[D_ADDR]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] // CHECK12-NEXT: store ptr [[TMP0]], ptr [[C]], align 8 // CHECK12-NEXT: [[A3:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 0 // CHECK12-NEXT: store ptr [[A3]], ptr [[A2]], align 8 @@ -2443,22 +2443,22 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[BF_CAST:%.*]] = sext i8 [[BF_ASHR]] to i32 // CHECK12-NEXT: store i32 [[BF_CAST]], ptr [[B4]], align 4 // CHECK12-NEXT: [[C8:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 2 -// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8 +// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[C8]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: store ptr [[TMP1]], ptr [[C7]], align 8 // CHECK12-NEXT: [[E9:%.*]] = getelementptr inbounds nuw [[STRUCT_SS]], ptr [[THIS1]], i32 0, i32 3 // CHECK12-NEXT: store ptr [[E9]], ptr [[E]], align 8 -// CHECK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], align 8 +// CHECK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[A2]], 
align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 // CHECK12-NEXT: store i32 [[TMP3]], ptr [[A_CASTED]], align 4 // CHECK12-NEXT: [[TMP4:%.*]] = load i64, ptr [[A_CASTED]], align 8 // CHECK12-NEXT: [[TMP5:%.*]] = load i32, ptr [[B4]], align 4 // CHECK12-NEXT: store i32 [[TMP5]], ptr [[B_CASTED]], align 4 // CHECK12-NEXT: [[TMP6:%.*]] = load i64, ptr [[B_CASTED]], align 8 -// CHECK12-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8 +// CHECK12-NEXT: [[TMP7:%.*]] = load ptr, ptr [[C7]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 // CHECK12-NEXT: store i32 [[TMP8]], ptr [[C_CASTED]], align 4 // CHECK12-NEXT: [[TMP9:%.*]] = load i64, ptr [[C_CASTED]], align 8 -// CHECK12-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8 +// CHECK12-NEXT: [[TMP10:%.*]] = load ptr, ptr [[E]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB1]], i32 5, ptr @_ZN2SSC2ERi.omp_outlined, ptr [[THIS1]], i64 [[TMP4]], i64 [[TMP6]], i64 [[TMP9]], ptr [[TMP10]]) // CHECK12-NEXT: ret void // @@ -2487,11 +2487,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: store i64 [[C]], ptr [[C_ADDR]], align 8 // CHECK12-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 // CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[E_ADDR]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 // CHECK12-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 8 // CHECK12-NEXT: store ptr [[TMP1]], ptr [[_TMP2]], align 8 -// CHECK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8 +// CHECK12-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_TMP2]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[E3]], ptr align 4 [[TMP2]], i64 16, i1 false) // CHECK12-NEXT: store ptr [[E3]], ptr [[_TMP4]], align 8 // CHECK12-NEXT: [[BLOCK_ISA:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[BLOCK]], i32 0, i32 0 @@ -2507,13 +2507,13 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[BLOCK_CAPTURED_THIS_ADDR:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[BLOCK]], i32 0, i32 5 // CHECK12-NEXT: store ptr [[TMP0]], ptr [[BLOCK_CAPTURED_THIS_ADDR]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURED:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[BLOCK]], i32 0, i32 6 -// CHECK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: store ptr [[TMP3]], ptr [[BLOCK_CAPTURED]], align 8 // 
CHECK12-NEXT: [[BLOCK_CAPTURED5:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[BLOCK]], i32 0, i32 8 // CHECK12-NEXT: [[TMP4:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK12-NEXT: store i32 [[TMP4]], ptr [[BLOCK_CAPTURED5]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURED6:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[BLOCK]], i32 0, i32 7 -// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: store ptr [[TMP5]], ptr [[BLOCK_CAPTURED6]], align 8 // CHECK12-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___BLOCK_LITERAL_GENERIC:%.*]], ptr [[BLOCK]], i32 0, i32 3 // CHECK12-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8 @@ -2534,7 +2534,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[BLOCK_CAPTURED_THIS:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 5 // CHECK12-NEXT: [[THIS:%.*]] = load ptr, ptr [[BLOCK_CAPTURED_THIS]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURE_ADDR:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR]], align 8 +// CHECK12-NEXT: [[TMP0:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 // CHECK12-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1 // CHECK12-NEXT: store i32 [[INC]], ptr [[TMP0]], align 4 @@ -2543,12 +2543,12 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP2]], -1 // CHECK12-NEXT: store i32 [[DEC]], ptr [[BLOCK_CAPTURE_ADDR1]], align 8 // CHECK12-NEXT: 
[[BLOCK_CAPTURE_ADDR2:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 7 -// CHECK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR2]], align 8 +// CHECK12-NEXT: [[TMP3:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR2]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4 // CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP4]], 1 // CHECK12-NEXT: store i32 [[DIV]], ptr [[TMP3]], align 4 // CHECK12-NEXT: [[BLOCK_CAPTURE_ADDR3:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 6 -// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR3]], align 8 +// CHECK12-NEXT: [[TMP5:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR3]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 // CHECK12-NEXT: store i32 [[TMP6]], ptr [[A_CASTED]], align 4 // CHECK12-NEXT: [[TMP7:%.*]] = load i64, ptr [[A_CASTED]], align 8 @@ -2557,7 +2557,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: store i32 [[TMP8]], ptr [[B_CASTED]], align 4 // CHECK12-NEXT: [[TMP9:%.*]] = load i64, ptr [[B_CASTED]], align 8 // CHECK12-NEXT: [[BLOCK_CAPTURE_ADDR5:%.*]] = getelementptr inbounds nuw <{ ptr, i32, i32, ptr, ptr, ptr, ptr, ptr, i32 }>, ptr [[DOTBLOCK_DESCRIPTOR]], i32 0, i32 7 -// CHECK12-NEXT: [[TMP10:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR5]], align 8 +// CHECK12-NEXT: [[TMP10:%.*]] = load ptr, ptr [[BLOCK_CAPTURE_ADDR5]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 // CHECK12-NEXT: store i32 [[TMP11]], ptr [[C_CASTED]], align 4 // CHECK12-NEXT: [[TMP12:%.*]] = load i64, ptr [[C_CASTED]], align 8 @@ -2585,14 +2585,14 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK12-NEXT: [[TMP0:%.*]] = load 
ptr, ptr [[THIS_ADDR]], align 8 // CHECK12-NEXT: store ptr [[A_ADDR]], ptr [[TMP]], align 8 // CHECK12-NEXT: store ptr [[C_ADDR]], ptr [[_TMP1]], align 8 -// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK12-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK12-NEXT: [[INC:%.*]] = add nsw i32 [[TMP2]], 1 // CHECK12-NEXT: store i32 [[INC]], ptr [[TMP1]], align 4 // CHECK12-NEXT: [[TMP3:%.*]] = load i32, ptr [[B_ADDR]], align 4 // CHECK12-NEXT: [[DEC:%.*]] = add nsw i32 [[TMP3]], -1 // CHECK12-NEXT: store i32 [[DEC]], ptr [[B_ADDR]], align 4 -// CHECK12-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK12-NEXT: [[TMP4:%.*]] = load ptr, ptr [[_TMP1]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK12-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK12-NEXT: [[DIV:%.*]] = sdiv i32 [[TMP5]], 1 // CHECK12-NEXT: store i32 [[DIV]], ptr [[TMP4]], align 4 @@ -2660,11 +2660,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: store i64 [[VLA2]], ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: store i64 [[VLA4]], ptr [[VLA_ADDR5]], align 8 // CHECK17-NEXT: store ptr [[VLA26]], ptr [[VLA2_ADDR]], align 8 -// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8 +// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP3:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 +// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META3]], !align [[META5:![0-9]+]] // CHECK17-NEXT: [[TMP5:%.*]] = call ptr @llvm.stacksave.p0() // CHECK17-NEXT: store ptr [[TMP5]], ptr 
[[SAVED_STACK]], align 8 // CHECK17-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP2]], [[TMP3]] @@ -2751,8 +2751,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: [[TMP1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP3:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 -// CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 8 +// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META3]], !align [[META5]] +// CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK17-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP2]], [[TMP3]] // CHECK17-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8 // CHECK17-NEXT: [[TMP8:%.*]] = add nuw i64 [[TMP7]], 127 diff --git a/clang/test/OpenMP/sections_firstprivate_codegen.cpp b/clang/test/OpenMP/sections_firstprivate_codegen.cpp index 7c6d1839fb10e..32c5826e6f75d 100644 --- a/clang/test/OpenMP/sections_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/sections_firstprivate_codegen.cpp @@ -448,7 +448,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -517,10 +517,10 @@ int main() { // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // 
CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_SECTIONS_LB_]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_SECTIONS_UB_]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_SECTIONS_ST_]], align 4 @@ -668,7 +668,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -996,7 +996,7 @@ int main() { // CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// 
CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK4-NEXT: store i32 0, ptr [[DOTOMP_SECTIONS_LB_]], align 4 // CHECK4-NEXT: store i32 1, ptr [[DOTOMP_SECTIONS_UB_]], align 4 // CHECK4-NEXT: store i32 1, ptr [[DOTOMP_SECTIONS_ST_]], align 4 diff --git a/clang/test/OpenMP/single_firstprivate_codegen.cpp b/clang/test/OpenMP/single_firstprivate_codegen.cpp index 27cd220adf225..31ea1ca4952fc 100644 --- a/clang/test/OpenMP/single_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/single_firstprivate_codegen.cpp @@ -403,7 +403,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -467,10 +467,10 @@ int main() { // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[T_VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: 
[[TMP1:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_single(ptr @[[GLOB1]], i32 [[TMP5]]) @@ -585,7 +585,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META3]], !align [[META4]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -908,7 +908,7 @@ int main() { // CHECK4-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 // CHECK4-NEXT: store ptr [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8 +// CHECK4-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SIVAR_ADDR]], align 8, !nonnull [[META3:![0-9]+]], !align [[META4:![0-9]+]] // CHECK4-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 // CHECK4-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 // CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_single(ptr @[[GLOB1]], i32 [[TMP2]]) diff --git a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp 
b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp index aa50d8fb3aabd..a171827a18646 100644 --- a/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_firstprivate_codegen.cpp @@ -354,9 +354,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -396,9 +396,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: 
store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -550,12 +550,12 @@ int main() { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2) // CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VEC]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -655,7 +655,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw 
[[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -711,14 +711,14 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB2]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK1-NEXT: ret void // @@ -752,9 +752,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -776,7 +776,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done4: -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -815,7 +815,7 @@ int main() { // CHECK1-NEXT: 
[[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i64 0, i64 [[IDXPROM]] // CHECK1-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP16]] to i64 // CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i64 0, i64 [[IDXPROM9]] @@ -912,7 +912,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1155,9 +1155,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// 
CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1197,9 +1197,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -1349,12 +1349,12 @@ int main() { // CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1 // CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) // CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: 
[[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[VEC]], ptr [[TMP4]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -1454,7 +1454,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1510,14 +1510,14 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // 
CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB2]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK3-NEXT: ret void // @@ -1551,9 +1551,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -1575,7 +1575,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) 
// CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -1613,7 +1613,7 @@ int main() { // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 [[TMP14]] // CHECK3-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 [[TMP16]] // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX9]], ptr align 4 [[TMP15]], i32 4, i1 false) @@ -1709,7 +1709,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1877,7 +1877,7 @@ int main() { // CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 // CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr 
[[TMP]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] // CHECK9-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[TMP2]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[G1_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -1943,13 +1943,13 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: store i32 1, ptr [[G_ADDR]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store volatile i32 1, ptr [[TMP8]], align 4 // CHECK9-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK9-NEXT: store ptr [[G_ADDR]], ptr [[TMP9]], align 8 // CHECK9-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store ptr [[TMP11]], ptr [[TMP10]], align 8 // CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK9-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP12]], align 8 diff --git a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp index 78e40e54671ac..678770520f677 100644 --- a/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/target_teams_distribute_parallel_for_firstprivate_codegen.cpp @@ -415,9 +415,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr 
[[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -459,9 +459,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -616,9 +616,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], 
align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 @@ -741,12 +741,12 @@ int main() { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2) // CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store ptr [[VEC]], ptr [[TMP4]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -846,7 +846,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], 
align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -902,14 +902,14 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK1-NEXT: ret void // @@ -944,9 +944,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -968,7 +968,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done4: -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -1005,7 +1005,7 @@ int main() { // 
CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP16]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined.omp_outlined, i64 [[TMP13]], i64 [[TMP15]], ptr [[VEC2]], i64 [[TMP17]], ptr [[S_ARR3]], ptr [[TMP18]]) // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: @@ -1082,9 +1082,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -1112,7 +1112,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE5]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done5: -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: 
[[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP7]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) #[[ATTR2]] @@ -1151,7 +1151,7 @@ int main() { // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP16]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC3]], i64 0, i64 [[IDXPROM]] // CHECK1-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP18]] to i64 // CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i64 0, i64 [[IDXPROM10]] @@ -1233,7 +1233,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1476,9 +1476,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr 
[[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1520,9 +1520,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -1675,9 +1675,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// 
CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 @@ -1796,12 +1796,12 @@ int main() { // CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1 // CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) // CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store ptr [[VEC]], ptr [[TMP4]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -1901,7 +1901,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // 
CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1957,14 +1957,14 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK3-NEXT: ret void // @@ -1999,9 +1999,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -2023,7 +2023,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -2058,7 +2058,7 @@ int main() { // 
CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP14]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined.omp_outlined, i32 [[TMP12]], i32 [[TMP13]], ptr [[VEC2]], i32 [[TMP15]], ptr [[S_ARR3]], ptr [[TMP16]]) // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: @@ -2135,9 +2135,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -2163,7 +2163,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: 
[[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -2201,7 +2201,7 @@ int main() { // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 [[TMP16]] // CHECK3-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 [[TMP18]] // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX9]], ptr align 4 [[TMP17]], i32 4, i1 false) @@ -2282,7 +2282,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -2450,7 +2450,7 @@ int main() { // CHECK5-NEXT: [[TMP0:%.*]] = load i32, ptr [[G_ADDR]], 
align 4 // CHECK5-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 // CHECK5-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK5-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK5-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] // CHECK5-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[TMP2]], align 4 // CHECK5-NEXT: store i32 [[TMP3]], ptr [[G1_CASTED]], align 4 // CHECK5-NEXT: [[TMP4:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -2520,7 +2520,7 @@ int main() { // CHECK5-NEXT: [[TMP11:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK5-NEXT: store i32 [[TMP11]], ptr [[G_CASTED]], align 4 // CHECK5-NEXT: [[TMP12:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK5-NEXT: [[TMP14:%.*]] = load volatile i32, ptr [[TMP13]], align 4 // CHECK5-NEXT: store i32 [[TMP14]], ptr [[G1_CASTED]], align 4 // CHECK5-NEXT: [[TMP15:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -2607,13 +2607,13 @@ int main() { // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK5-NEXT: store i32 1, ptr [[G_ADDR]], align 4 -// CHECK5-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK5-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK5-NEXT: store volatile i32 1, ptr [[TMP10]], align 4 // CHECK5-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK5-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK5-NEXT: store ptr [[G_ADDR]], ptr [[TMP11]], align 8 // CHECK5-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK5-NEXT: [[TMP13:%.*]] = load ptr, ptr 
[[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK5-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8 // CHECK5-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK5-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP14]], align 8 @@ -2659,9 +2659,9 @@ int main() { // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK13-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK13-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -2703,9 +2703,9 @@ int main() { // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK13-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, 
!nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK13-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK13-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -2860,9 +2860,9 @@ int main() { // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK13-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK13-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK13-NEXT: [[TMP3:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 @@ -2987,14 +2987,14 @@ int main() { // CHECK13-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 
// CHECK13-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK13-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK13-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK13-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK13-NEXT: ret void // @@ -3029,9 +3029,9 @@ int main() { // CHECK13-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK13-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -3053,7 +3053,7 @@ int main() { // CHECK13-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK13-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK13: omp.arraycpy.done4: -// CHECK13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK13-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META8]], !align 
[[META9]] // CHECK13-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK13-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK13-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR3]] @@ -3090,7 +3090,7 @@ int main() { // CHECK13-NEXT: [[TMP16:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK13-NEXT: store i32 [[TMP16]], ptr [[T_VAR_CASTED]], align 4 // CHECK13-NEXT: [[TMP17:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK13-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8 +// CHECK13-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined.omp_outlined, i64 [[TMP13]], i64 [[TMP15]], ptr [[VEC2]], i64 [[TMP17]], ptr [[S_ARR3]], ptr [[TMP18]]) // CHECK13-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK13: omp.inner.for.inc: @@ -3167,9 +3167,9 @@ int main() { // CHECK13-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK13-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK13-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 
8 // CHECK13-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK13-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -3197,7 +3197,7 @@ int main() { // CHECK13-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK13-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE5]], label [[OMP_ARRAYCPY_BODY]] // CHECK13: omp.arraycpy.done5: -// CHECK13-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK13-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) // CHECK13-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP7]]) // CHECK13-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) #[[ATTR3]] @@ -3236,7 +3236,7 @@ int main() { // CHECK13-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP16]] to i64 // CHECK13-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC3]], i64 0, i64 [[IDXPROM]] // CHECK13-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK13-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8 +// CHECK13-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK13-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP18]] to i64 // CHECK13-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i64 0, i64 [[IDXPROM10]] @@ -3321,7 +3321,7 @@ int main() { // CHECK13-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK13-NEXT: 
[[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK13-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK13-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -3352,7 +3352,7 @@ int main() { // CHECK13-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK13-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK13-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK13-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK13-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK13-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK13-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -3379,9 +3379,9 @@ int main() { // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9:![0-9]+]], !align [[META10:![0-9]+]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK15-NEXT: 
[[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -3423,9 +3423,9 @@ int main() { // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK15-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK15-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -3578,9 +3578,9 @@ int main() { // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK15-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK15-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 @@ -3701,14 +3701,14 @@ int main() { 
// CHECK15-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK15-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK15-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK15-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK15-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK15-NEXT: ret void // @@ -3743,9 +3743,9 @@ int main() { // CHECK15-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK15-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -3767,7 +3767,7 @@ int main() { // CHECK15-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK15-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK15: omp.arraycpy.done4: -// CHECK15-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK15-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK15-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK15-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR3]] @@ -3802,7 
+3802,7 @@ int main() { // CHECK15-NEXT: [[TMP14:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store i32 [[TMP14]], ptr [[T_VAR_CASTED]], align 4 // CHECK15-NEXT: [[TMP15:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK15-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK15-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l81.omp_outlined.omp_outlined, i32 [[TMP12]], i32 [[TMP13]], ptr [[VEC2]], i32 [[TMP15]], ptr [[S_ARR3]], ptr [[TMP16]]) // CHECK15-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK15: omp.inner.for.inc: @@ -3879,9 +3879,9 @@ int main() { // CHECK15-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK15-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] +// CHECK15-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK15-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK15-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -3907,7 +3907,7 @@ int main() { // CHECK15-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK15-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK15: omp.arraycpy.done4: -// CHECK15-NEXT: [[TMP6:%.*]] = 
load ptr, ptr [[TMP]], align 4 +// CHECK15-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK15-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP6]]) // CHECK15-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR3]] @@ -3945,7 +3945,7 @@ int main() { // CHECK15-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK15-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 [[TMP16]] // CHECK15-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK15-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK15-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK15-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 [[TMP18]] // CHECK15-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX9]], ptr align 4 [[TMP17]], i32 4, i1 false) @@ -4029,7 +4029,7 @@ int main() { // CHECK15-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK15-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK15-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -4060,7 +4060,7 @@ 
int main() { // CHECK15-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK15-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK15-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK15-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META9]], !align [[META10]] // CHECK15-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK15-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK15-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -4089,7 +4089,7 @@ int main() { // CHECK17-NEXT: [[TMP0:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK17-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 // CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK17-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK17-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META6:![0-9]+]], !align [[META7:![0-9]+]] // CHECK17-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[TMP2]], align 4 // CHECK17-NEXT: store i32 [[TMP3]], ptr [[G1_CASTED]], align 4 // CHECK17-NEXT: [[TMP4:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -4159,7 +4159,7 @@ int main() { // CHECK17-NEXT: [[TMP11:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK17-NEXT: store i32 [[TMP11]], ptr [[G_CASTED]], align 4 // CHECK17-NEXT: [[TMP12:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META6]], !align [[META7]] // CHECK17-NEXT: [[TMP14:%.*]] = load volatile i32, ptr [[TMP13]], align 4 // CHECK17-NEXT: store i32 [[TMP14]], ptr [[G1_CASTED]], align 4 // CHECK17-NEXT: [[TMP15:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -4246,13 +4246,13 @@ int main() { // CHECK17-NEXT: [[ADD:%.*]] = add nsw 
i32 0, [[MUL]] // CHECK17-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK17-NEXT: store i32 1, ptr [[G_ADDR]], align 4 -// CHECK17-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK17-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META6]], !align [[META7]] // CHECK17-NEXT: store volatile i32 1, ptr [[TMP10]], align 4 // CHECK17-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK17-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[CLASS_ANON]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK17-NEXT: store ptr [[G_ADDR]], ptr [[TMP11]], align 8 // CHECK17-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[CLASS_ANON]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK17-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META6]], !align [[META7]] // CHECK17-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8 // CHECK17-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[CLASS_ANON]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK17-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP14]], align 8 diff --git a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp index 294fcba7872b3..f3c9565a17656 100644 --- a/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_firstprivate_codegen.cpp @@ -357,9 +357,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr 
[[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -399,9 +399,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -553,14 +553,14 @@ int main() { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2) // CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: 
[[TMP2:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[TMP6]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -660,7 +660,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -716,14 +716,14 @@ int main() { // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], 
align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB2]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK1-NEXT: ret void // @@ -757,9 +757,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -781,7 +781,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // 
CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done4: -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -820,7 +820,7 @@ int main() { // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP14]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i64 0, i64 [[IDXPROM]] // CHECK1-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8 +// CHECK1-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM9:%.*]] = sext i32 [[TMP16]] to i64 // CHECK1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i64 0, i64 [[IDXPROM9]] @@ -917,7 +917,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = 
getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1160,9 +1160,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1202,9 +1202,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -1354,14 +1354,14 @@ int 
main() { // CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1 // CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) // CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store i32 [[TMP2]], ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -1461,7 +1461,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr 
[[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1517,14 +1517,14 @@ int main() { // CHECK3-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB2]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK3-NEXT: ret void // @@ -1558,9 +1558,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -1582,7 +1582,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -1620,7 +1620,7 @@ int main() { // 
CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 [[TMP14]] // CHECK3-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP15:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 [[TMP16]] // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX9]], ptr align 4 [[TMP15]], i32 4, i1 false) @@ -1716,7 +1716,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1884,7 +1884,7 @@ int main() { // CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 // CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] // CHECK9-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[TMP2]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[G1_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[G1_CASTED]], align 8 
@@ -1950,13 +1950,13 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: store i32 1, ptr [[G_ADDR]], align 4 -// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store volatile i32 1, ptr [[TMP8]], align 4 // CHECK9-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK9-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK9-NEXT: store ptr [[G_ADDR]], ptr [[TMP9]], align 8 // CHECK9-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store ptr [[TMP11]], ptr [[TMP10]], align 8 // CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK9-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP12]], align 8 diff --git a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp index d742b0a85af42..037aa12d57226 100644 --- a/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_distribute_parallel_for_firstprivate_codegen.cpp @@ -391,9 +391,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], 
align 8, !nonnull [[META7:![0-9]+]], !align [[META8:![0-9]+]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -435,9 +435,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -592,9 +592,9 @@ int main() { // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK1-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr 
[[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = load i64, ptr [[DOTPREVIOUS_LB__ADDR]], align 8 @@ -717,14 +717,14 @@ int main() { // CHECK1-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i64 1 // CHECK1-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef signext 2) // CHECK1-NEXT: store ptr [[TEST]], ptr [[VAR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK1-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK1-NEXT: store i64 [[TMP2]], ptr [[TMP6]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -824,7 +824,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] 
= load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -880,14 +880,14 @@ int main() { // CHECK1-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i64 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK1-NEXT: ret void // @@ -922,9 +922,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -946,7 +946,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done4: -// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -983,7 +983,7 @@ int main() { // 
CHECK1-NEXT: [[TMP16:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK1-NEXT: store i32 [[TMP16]], ptr [[T_VAR_CASTED]], align 4 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 -// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8 +// CHECK1-NEXT: [[TMP18:%.*]] = load ptr, ptr [[_TMP7]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined, i64 [[TMP13]], i64 [[TMP15]], ptr [[VEC2]], i64 [[TMP17]], ptr [[S_ARR3]], ptr [[TMP18]]) // CHECK1-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK1: omp.inner.for.inc: @@ -1060,9 +1060,9 @@ int main() { // CHECK1-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK1-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] +// CHECK1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 // CHECK1-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK1-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -1090,7 +1090,7 @@ int main() { // CHECK1-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK1-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE5]], label [[OMP_ARRAYCPY_BODY]] // CHECK1: omp.arraycpy.done5: -// CHECK1-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK1-NEXT: 
[[TMP6:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) // CHECK1-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR6]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP7]]) // CHECK1-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP7]]) #[[ATTR2]] @@ -1129,7 +1129,7 @@ int main() { // CHECK1-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP16]] to i64 // CHECK1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC3]], i64 0, i64 [[IDXPROM]] // CHECK1-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8 +// CHECK1-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP8]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK1-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP18]] to i64 // CHECK1-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR4]], i64 0, i64 [[IDXPROM10]] @@ -1211,7 +1211,7 @@ int main() { // CHECK1-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK1-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META7]], !align [[META8]] // CHECK1-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1454,9 +1454,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr 
[[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1498,9 +1498,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 @@ -1653,9 +1653,9 @@ int main() { // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// 
CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTPREVIOUS_LB__ADDR]], align 4 @@ -1774,14 +1774,14 @@ int main() { // CHECK3-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[S_ARR]], i32 1 // CHECK3-NEXT: call void @_ZN1SIiEC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYINIT_ELEMENT]], i32 noundef 2) // CHECK3-NEXT: store ptr [[TEST]], ptr [[VAR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP0]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[T_VAR]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP6:%.*]] = getelementptr 
inbounds [4 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 // CHECK3-NEXT: store i32 [[TMP2]], ptr [[TMP6]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 @@ -1881,7 +1881,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1937,14 +1937,14 @@ int main() { // CHECK3-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4 
+// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_teams(ptr @[[GLOB3]], i32 4, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined, ptr [[TMP0]], i32 [[TMP4]], ptr [[TMP1]], ptr [[TMP5]]) // CHECK3-NEXT: ret void // @@ -1979,9 +1979,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_COMB_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_COMB_UB]], align 4 @@ -2003,7 +2003,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP3]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label [[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP4]], ptr noundef [[AGG_TMP6]]) 
// CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -2038,7 +2038,7 @@ int main() { // CHECK3-NEXT: [[TMP14:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP14]], ptr [[T_VAR_CASTED]], align 4 // CHECK3-NEXT: [[TMP15:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 -// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP16:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 6, ptr @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z5tmainIiET_v_l56.omp_outlined.omp_outlined, i32 [[TMP12]], i32 [[TMP13]], ptr [[VEC2]], i32 [[TMP15]], ptr [[S_ARR3]], ptr [[TMP16]]) // CHECK3-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] // CHECK3: omp.inner.for.inc: @@ -2115,9 +2115,9 @@ int main() { // CHECK3-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK3-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK3-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 4 // CHECK3-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 // CHECK3-NEXT: store i32 1, ptr [[DOTOMP_UB]], align 4 @@ -2143,7 +2143,7 @@ int main() { // CHECK3-NEXT: [[OMP_ARRAYCPY_DONE:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT]], [[TMP5]] // CHECK3-NEXT: br i1 [[OMP_ARRAYCPY_DONE]], label 
[[OMP_ARRAYCPY_DONE4]], label [[OMP_ARRAYCPY_BODY]] // CHECK3: omp.arraycpy.done4: -// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4 +// CHECK3-NEXT: [[TMP6:%.*]] = load ptr, ptr [[TMP]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: call void @_ZN2StC1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN1SIiEC1ERKS0_2St(ptr noundef nonnull align 4 dereferenceable(4) [[VAR5]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP6]], ptr noundef [[AGG_TMP6]]) // CHECK3-NEXT: call void @_ZN2StD1Ev(ptr noundef nonnull align 4 dereferenceable(8) [[AGG_TMP6]]) #[[ATTR2]] @@ -2181,7 +2181,7 @@ int main() { // CHECK3-NEXT: [[TMP16:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VEC2]], i32 0, i32 [[TMP16]] // CHECK3-NEXT: store i32 [[TMP15]], ptr [[ARRAYIDX]], align 4 -// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4 +// CHECK3-NEXT: [[TMP17:%.*]] = load ptr, ptr [[_TMP7]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[TMP18:%.*]] = load i32, ptr [[I]], align 4 // CHECK3-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR3]], i32 0, i32 [[TMP18]] // CHECK3-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[ARRAYIDX9]], ptr align 4 [[TMP17]], i32 4, i1 false) @@ -2262,7 +2262,7 @@ int main() { // CHECK3-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK3-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK3-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK3-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK3-NEXT: [[A:%.*]] = 
getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -2430,7 +2430,7 @@ int main() { // CHECK9-NEXT: [[TMP0:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP0]], ptr [[G_CASTED]], align 4 // CHECK9-NEXT: [[TMP1:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] // CHECK9-NEXT: [[TMP3:%.*]] = load volatile i32, ptr [[TMP2]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[G1_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -2500,7 +2500,7 @@ int main() { // CHECK9-NEXT: [[TMP11:%.*]] = load i32, ptr [[G_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP11]], ptr [[G_CASTED]], align 4 // CHECK9-NEXT: [[TMP12:%.*]] = load i64, ptr [[G_CASTED]], align 8 -// CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: [[TMP14:%.*]] = load volatile i32, ptr [[TMP13]], align 4 // CHECK9-NEXT: store i32 [[TMP14]], ptr [[G1_CASTED]], align 4 // CHECK9-NEXT: [[TMP15:%.*]] = load i64, ptr [[G1_CASTED]], align 8 @@ -2587,13 +2587,13 @@ int main() { // CHECK9-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK9-NEXT: store i32 [[ADD]], ptr [[I]], align 4 // CHECK9-NEXT: store i32 1, ptr [[G_ADDR]], align 4 -// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP10:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store volatile i32 1, ptr [[TMP10]], align 4 // CHECK9-NEXT: store i32 2, ptr [[SIVAR_ADDR]], align 4 // CHECK9-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 0 // CHECK9-NEXT: store ptr [[G_ADDR]], ptr [[TMP11]], align 8 // CHECK9-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw 
[[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 1 -// CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK9-NEXT: [[TMP13:%.*]] = load ptr, ptr [[TMP]], align 8, !nonnull [[META5]], !align [[META6]] // CHECK9-NEXT: store ptr [[TMP13]], ptr [[TMP12]], align 8 // CHECK9-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[CLASS_ANON_0]], ptr [[REF_TMP]], i32 0, i32 2 // CHECK9-NEXT: store ptr [[SIVAR_ADDR]], ptr [[TMP14]], align 8 diff --git a/clang/test/OpenMP/teams_firstprivate_codegen.cpp b/clang/test/OpenMP/teams_firstprivate_codegen.cpp index 6f21c9e31bd8d..fec8fcb78f91e 100644 --- a/clang/test/OpenMP/teams_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/teams_firstprivate_codegen.cpp @@ -458,9 +458,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK9-NEXT: store i64 [[SIVAR]], ptr [[SIVAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META11:![0-9]+]], !align [[META12:![0-9]+]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -493,9 +493,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 // CHECK9-NEXT: store i64 [[SIVAR]], ptr 
[[SIVAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[VEC1]], ptr align 4 [[TMP0]], i64 8, i1 false) // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i64 2 @@ -808,7 +808,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK9-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK9-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -872,9 +872,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 8 // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load 
ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK9-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK9-NEXT: [[TMP4:%.*]] = load i64, ptr [[T_VAR_CASTED]], align 8 @@ -902,9 +902,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store i64 [[T_VAR]], ptr [[T_VAR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 8 // CHECK9-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8 -// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8 -// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] +// CHECK9-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i64 8, i1 false) // CHECK9-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK9-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i64 2 @@ -1032,7 +1032,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK9-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 8 // CHECK9-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // 
CHECK9-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8 +// CHECK9-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 8, !nonnull [[META11]], !align [[META12]] // CHECK9-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK9-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK9-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1250,9 +1250,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK11-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META12:![0-9]+]], !align [[META13:![0-9]+]] +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK11-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1285,9 +1285,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 // CHECK11-NEXT: store i32 [[SIVAR]], ptr [[SIVAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 
-// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[VEC1]], ptr align 4 [[TMP0]], i32 8, i1 false) // CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S], ptr [[S_ARR2]], i32 0, i32 0 // CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[ARRAY_BEGIN]], i32 2 @@ -1600,7 +1600,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S]], ptr [[TMP0]], i32 0, i32 0 // CHECK11-NEXT: [[TMP1:%.*]] = load float, ptr [[F2]], align 4 // CHECK11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1664,9 +1664,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store ptr [[VEC]], ptr [[VEC_ADDR]], align 4 // CHECK11-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = 
load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: [[TMP3:%.*]] = load i32, ptr [[T_VAR_ADDR]], align 4 // CHECK11-NEXT: store i32 [[TMP3]], ptr [[T_VAR_CASTED]], align 4 // CHECK11-NEXT: [[TMP4:%.*]] = load i32, ptr [[T_VAR_CASTED]], align 4 @@ -1694,9 +1694,9 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store i32 [[T_VAR]], ptr [[T_VAR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[S_ARR]], ptr [[S_ARR_ADDR]], align 4 // CHECK11-NEXT: store ptr [[VAR]], ptr [[VAR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4 -// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4 -// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VEC_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S_ARR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] +// CHECK11-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VAR_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 128 [[VEC1]], ptr align 128 [[TMP0]], i32 8, i1 false) // CHECK11-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [2 x %struct.S.0], ptr [[S_ARR2]], i32 0, i32 0 // CHECK11-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_S_0]], ptr [[ARRAY_BEGIN]], i32 2 @@ -1824,7 +1824,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK11-NEXT: store ptr [[T]], ptr [[T_INDIRECT_ADDR]], align 4 // CHECK11-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK11-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0:%.*]], ptr [[THIS1]], i32 0, i32 0 -// CHECK11-NEXT: 
[[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4 +// CHECK11-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S_ADDR]], align 4, !nonnull [[META12]], !align [[META13]] // CHECK11-NEXT: [[F2:%.*]] = getelementptr inbounds nuw [[STRUCT_S_0]], ptr [[TMP0]], i32 0, i32 0 // CHECK11-NEXT: [[TMP1:%.*]] = load i32, ptr [[F2]], align 4 // CHECK11-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_ST:%.*]], ptr [[T]], i32 0, i32 0 @@ -1997,7 +1997,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8 // CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 +// CHECK17-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] // CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[S_ADDR]], align 8 // CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[VLA1_ADDR]], align 8 // CHECK17-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 8 @@ -2031,11 +2031,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: store i64 [[VLA2]], ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: store i64 [[VLA4]], ptr [[VLA_ADDR5]], align 8 // CHECK17-NEXT: store ptr [[VLA26]], ptr [[VLA2_ADDR]], align 8 -// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8 +// CHECK17-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 8, !nonnull [[META8]], !align [[META10:![0-9]+]] // CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP3:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 +// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK17-NEXT: [[TMP5:%.*]] = call ptr 
@llvm.stacksave.p0() // CHECK17-NEXT: store ptr [[TMP5]], ptr [[SAVED_STACK]], align 8 // CHECK17-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP2]], [[TMP3]] @@ -2235,7 +2235,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: [[TMP0:%.*]] = load i64, ptr [[VLA_ADDR]], align 8 // CHECK17-NEXT: [[TMP1:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 +// CHECK17-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] // CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[VLA1_ADDR]], align 8 // CHECK17-NEXT: [[TMP6:%.*]] = load ptr, ptr [[S_ADDR]], align 8 @@ -2273,8 +2273,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK17-NEXT: [[TMP1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK17-NEXT: [[TMP2:%.*]] = load i64, ptr [[VLA_ADDR3]], align 8 // CHECK17-NEXT: [[TMP3:%.*]] = load i64, ptr [[VLA_ADDR5]], align 8 -// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8 -// CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 8 +// CHECK17-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 8, !nonnull [[META8]], !align [[META9]] +// CHECK17-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 8, !nonnull [[META8]], !align [[META10]] // CHECK17-NEXT: [[TMP6:%.*]] = call ptr @llvm.stacksave.p0() // CHECK17-NEXT: store ptr [[TMP6]], ptr [[SAVED_STACK]], align 8 // CHECK17-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP2]], [[TMP3]] @@ -2460,7 +2460,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4 // CHECK19-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR3]], align 4 // CHECK19-NEXT: [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR5]], align 4 -// CHECK19-NEXT: [[TMP3:%.*]] = load 
ptr, ptr [[VLA2_ADDR]], align 4 +// CHECK19-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4, !nonnull [[META8:![0-9]+]], !align [[META9:![0-9]+]] // CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[S_ADDR]], align 4 // CHECK19-NEXT: [[TMP5:%.*]] = load ptr, ptr [[VLA1_ADDR]], align 4 // CHECK19-NEXT: [[TMP6:%.*]] = load ptr, ptr [[A_ADDR]], align 4 @@ -2494,11 +2494,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: store i32 [[VLA2]], ptr [[VLA_ADDR3]], align 4 // CHECK19-NEXT: store i32 [[VLA4]], ptr [[VLA_ADDR5]], align 4 // CHECK19-NEXT: store ptr [[VLA26]], ptr [[VLA2_ADDR]], align 4 -// CHECK19-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4 +// CHECK19-NEXT: [[TMP0:%.*]] = load ptr, ptr [[N_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK19-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR]], align 4 // CHECK19-NEXT: [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR3]], align 4 // CHECK19-NEXT: [[TMP3:%.*]] = load i32, ptr [[VLA_ADDR5]], align 4 -// CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4 +// CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK19-NEXT: [[TMP5:%.*]] = call ptr @llvm.stacksave.p0() // CHECK19-NEXT: store ptr [[TMP5]], ptr [[SAVED_STACK]], align 4 // CHECK19-NEXT: [[TMP6:%.*]] = mul nuw i32 [[TMP2]], [[TMP3]] @@ -2696,7 +2696,7 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: [[TMP0:%.*]] = load i32, ptr [[VLA_ADDR]], align 4 // CHECK19-NEXT: [[TMP1:%.*]] = load i32, ptr [[VLA_ADDR3]], align 4 // CHECK19-NEXT: [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR5]], align 4 -// CHECK19-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4 +// CHECK19-NEXT: [[TMP3:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK19-NEXT: [[TMP5:%.*]] = load ptr, ptr [[VLA1_ADDR]], align 4 
// CHECK19-NEXT: [[TMP6:%.*]] = load ptr, ptr [[S_ADDR]], align 4 @@ -2734,8 +2734,8 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) { // CHECK19-NEXT: [[TMP1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 4 // CHECK19-NEXT: [[TMP2:%.*]] = load i32, ptr [[VLA_ADDR3]], align 4 // CHECK19-NEXT: [[TMP3:%.*]] = load i32, ptr [[VLA_ADDR5]], align 4 -// CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4 -// CHECK19-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 4 +// CHECK19-NEXT: [[TMP4:%.*]] = load ptr, ptr [[VLA2_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] +// CHECK19-NEXT: [[TMP5:%.*]] = load ptr, ptr [[N_ADDR]], align 4, !nonnull [[META8]], !align [[META9]] // CHECK19-NEXT: [[TMP6:%.*]] = call ptr @llvm.stacksave.p0() // CHECK19-NEXT: store ptr [[TMP6]], ptr [[SAVED_STACK]], align 4 // CHECK19-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP2]], [[TMP3]] From 4fbe88fc46989b5b4e3b8913a915c7a3cd188bdf Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Fri, 18 Jul 2025 18:23:46 +0800 Subject: [PATCH 323/813] [NFC] Add parentheses around arithmetic operand (#149489) --- llvm/lib/TargetParser/X86TargetParser.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp index 57fbc71fa22ee..9cd35e35d4bc9 100644 --- a/llvm/lib/TargetParser/X86TargetParser.cpp +++ b/llvm/lib/TargetParser/X86TargetParser.cpp @@ -176,10 +176,10 @@ constexpr FeatureBitset FeaturesArrowlakeS = FeaturesArrowlake | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 | FeatureSM4; constexpr FeatureBitset FeaturesPantherlake = - FeaturesArrowlakeS ^ FeatureWIDEKL | FeaturePREFETCHI; + (FeaturesArrowlakeS ^ FeatureWIDEKL) | FeaturePREFETCHI; constexpr FeatureBitset FeaturesClearwaterforest = - FeaturesSierraforest ^ FeatureWIDEKL | FeatureAVXVNNIINT16 | FeatureSHA512 | - FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR; + (FeaturesSierraforest ^ 
FeatureWIDEKL) | FeatureAVXVNNIINT16 | + FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR; // Geode Processor. constexpr FeatureBitset FeaturesGeode = From c9d8b68676dbf51996a76475313088f750697343 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Fri, 18 Jul 2025 11:31:52 +0100 Subject: [PATCH 324/813] [DebugInfo] Suppress lots of users of DbgValueInst (#149476) This is another prune of dead code -- we never generate debug intrinsics nowadays, therefore there's no need for these codepaths to run. --------- Co-authored-by: Nikita Popov --- .../llvm/Transforms/Utils/SSAUpdater.h | 4 - llvm/lib/CodeGen/CodeGenPrepare.cpp | 59 +----- llvm/lib/CodeGen/MachineDebugify.cpp | 18 +- llvm/lib/Transforms/Coroutines/SpillUtils.cpp | 6 +- llvm/lib/Transforms/IPO/MergeFunctions.cpp | 19 +- .../InstCombine/InstructionCombining.cpp | 11 +- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 39 +--- .../Transforms/Scalar/LoopStrengthReduce.cpp | 179 ++++++++---------- llvm/lib/Transforms/Utils/Debugify.cpp | 7 - llvm/lib/Transforms/Utils/Local.cpp | 30 +-- .../Transforms/Utils/LoopRotationUtils.cpp | 23 +-- llvm/lib/Transforms/Utils/SSAUpdater.cpp | 22 +-- llvm/tools/llvm-dis/llvm-dis.cpp | 14 -- llvm/unittests/CodeGen/LexicalScopesTest.cpp | 2 +- llvm/unittests/IR/DebugInfoTest.cpp | 2 + 15 files changed, 109 insertions(+), 326 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h index 4e5da81a7e885..9500b1f160ea9 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h @@ -29,7 +29,6 @@ template class SSAUpdaterTraits; class Type; class Use; class Value; -class DbgValueInst; /// Helper class for SSA formation on a set of values defined in /// multiple blocks. @@ -122,8 +121,6 @@ class SSAUpdater { /// the instruction. Anything outside of its block will have its /// value set to the new SSA value if available, and undef if not. 
void UpdateDebugValues(Instruction *I); - void UpdateDebugValues(Instruction *I, - SmallVectorImpl &DbgValues); void UpdateDebugValues(Instruction *I, SmallVectorImpl &DbgValues); @@ -136,7 +133,6 @@ class SSAUpdater { private: Value *GetValueAtEndOfBlockInternal(BasicBlock *BB); - void UpdateDebugValue(Instruction *I, DbgValueInst *DbgValue); void UpdateDebugValue(Instruction *I, DbgVariableRecord *DbgValue); }; diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index d9d41f1d72e35..dc8184394f74d 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -445,7 +445,6 @@ class CodeGenPrepare { bool optimizeSwitchInst(SwitchInst *SI); bool optimizeExtractElementInst(Instruction *Inst); bool dupRetToEnableTailCallOpts(BasicBlock *BB, ModifyDT &ModifiedDT); - bool fixupDbgValue(Instruction *I); bool fixupDbgVariableRecord(DbgVariableRecord &I); bool fixupDbgVariableRecordsOnInst(Instruction &I); bool placeDbgValues(Function &F); @@ -2762,9 +2761,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) { case Intrinsic::fshl: case Intrinsic::fshr: return optimizeFunnelShift(II); - case Intrinsic::dbg_assign: - case Intrinsic::dbg_value: - return fixupDbgValue(II); case Intrinsic::masked_gather: return optimizeGatherScatterInst(II, II->getArgOperand(0)); case Intrinsic::masked_scatter: @@ -3554,8 +3550,6 @@ class TypePromotionTransaction { /// Keep track of the original uses (pair Instruction, Index). SmallVector OriginalUses; /// Keep track of the debug users. - SmallVector DbgValues; - /// And non-instruction debug-users too. SmallVector DbgVariableRecords; /// Keep track of the new value so that we can undo it by replacing @@ -3577,7 +3571,9 @@ class TypePromotionTransaction { } // Record the debug uses separately. They are not in the instruction's // use list, but they are replaced by RAUW. 
+ SmallVector DbgValues; findDbgValues(DbgValues, Inst, &DbgVariableRecords); + assert(DbgValues.empty()); // Now, we can replace the uses. Inst->replaceAllUsesWith(New); @@ -3591,11 +3587,7 @@ class TypePromotionTransaction { // RAUW has replaced all original uses with references to the new value, // including the debug uses. Since we are undoing the replacements, // the original debug uses must also be reinstated to maintain the - // correctness and utility of debug value instructions. - for (auto *DVI : DbgValues) - DVI->replaceVariableLocationOp(New, Inst); - // Similar story with DbgVariableRecords, the non-instruction - // representation of dbg.values. + // correctness and utility of debug value records. for (DbgVariableRecord *DVR : DbgVariableRecords) DVR->replaceVariableLocationOp(New, Inst); } @@ -8933,32 +8925,6 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, ModifyDT &ModifiedDT) { return MadeChange; } -// Some CGP optimizations may move or alter what's computed in a block. Check -// whether a dbg.value intrinsic could be pointed at a more appropriate operand. -bool CodeGenPrepare::fixupDbgValue(Instruction *I) { - assert(isa(I)); - DbgValueInst &DVI = *cast(I); - - // Does this dbg.value refer to a sunk address calculation? - bool AnyChange = false; - SmallDenseSet LocationOps(DVI.location_ops().begin(), - DVI.location_ops().end()); - for (Value *Location : LocationOps) { - WeakTrackingVH SunkAddrVH = SunkAddrs[Location]; - Value *SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr; - if (SunkAddr) { - // Point dbg.value at locally computed address, which should give the best - // opportunity to be accurately lowered. This update may change the type - // of pointer being referred to; however this makes no difference to - // debugging information, and we can't generate bitcasts that may affect - // codegen. 
- DVI.replaceVariableLocationOp(Location, SunkAddr); - AnyChange = true; - } - } - return AnyChange; -} - bool CodeGenPrepare::fixupDbgVariableRecordsOnInst(Instruction &I) { bool AnyChange = false; for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) @@ -8993,14 +8959,6 @@ bool CodeGenPrepare::fixupDbgVariableRecord(DbgVariableRecord &DVR) { return AnyChange; } -static void DbgInserterHelper(DbgValueInst *DVI, BasicBlock::iterator VI) { - DVI->removeFromParent(); - if (isa(VI)) - DVI->insertBefore(VI->getParent()->getFirstInsertionPt()); - else - DVI->insertAfter(VI); -} - static void DbgInserterHelper(DbgVariableRecord *DVR, BasicBlock::iterator VI) { DVR->removeFromParent(); BasicBlock *VIBB = VI->getParent(); @@ -9065,15 +9023,8 @@ bool CodeGenPrepare::placeDbgValues(Function &F) { for (BasicBlock &BB : F) { for (Instruction &Insn : llvm::make_early_inc_range(BB)) { - // Process dbg.value intrinsics. - DbgValueInst *DVI = dyn_cast(&Insn); - if (DVI) { - DbgProcessor(DVI, DVI); - continue; - } - - // If this isn't a dbg.value, process any attached DbgVariableRecord - // records attached to this instruction. + // Process any DbgVariableRecord records attached to this + // instruction. for (DbgVariableRecord &DVR : llvm::make_early_inc_range( filterDbgVars(Insn.getDbgRecordRange()))) { if (DVR.Type != DbgVariableRecord::LocationType::Value) diff --git a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp index 9b9cebc74054d..1a20fe586e951 100644 --- a/llvm/lib/CodeGen/MachineDebugify.cpp +++ b/llvm/lib/CodeGen/MachineDebugify.cpp @@ -63,24 +63,9 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, // which cover a wide range of lines can help stress the debug info passes: // if we can't do that, fall back to using the local variable which precedes // all the others. 
- Function *DbgValF = M.getFunction("llvm.dbg.value"); - DbgValueInst *EarliestDVI = nullptr; DbgVariableRecord *EarliestDVR = nullptr; DenseMap Line2Var; DIExpression *Expr = nullptr; - if (DbgValF) { - for (const Use &U : DbgValF->uses()) { - auto *DVI = dyn_cast(U.getUser()); - if (!DVI || DVI->getFunction() != &F) - continue; - unsigned Line = DVI->getDebugLoc().getLine(); - assert(Line != 0 && "debugify should not insert line 0 locations"); - Line2Var[Line] = DVI->getVariable(); - if (!EarliestDVI || Line < EarliestDVI->getDebugLoc().getLine()) - EarliestDVI = DVI; - Expr = DVI->getExpression(); - } - } for (BasicBlock &BB : F) { for (Instruction &I : BB) { for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { @@ -125,8 +110,7 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, unsigned Line = MI.getDebugLoc().getLine(); auto It = Line2Var.find(Line); if (It == Line2Var.end()) { - Line = EarliestDVI ? EarliestDVI->getDebugLoc().getLine() - : EarliestDVR->getDebugLoc().getLine(); + Line = EarliestDVR->getDebugLoc().getLine(); It = Line2Var.find(Line); assert(It != Line2Var.end()); } diff --git a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp index 8017db1cfe146..5fd5f7d9dad09 100644 --- a/llvm/lib/Transforms/Coroutines/SpillUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/SpillUtils.cpp @@ -514,7 +514,7 @@ void collectSpillsAndAllocasFromInsts( void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F, const SuspendCrossingInfo &Checker) { // We don't want the layout of coroutine frame to be affected - // by debug information. So we only choose to salvage DbgValueInst for + // by debug information. So we only choose to salvage dbg.values for // whose value is already in the frame. 
// We would handle the dbg.values for allocas specially for (auto &Iter : Spills) { @@ -522,9 +522,7 @@ void collectSpillsFromDbgInfo(SpillInfo &Spills, Function &F, SmallVector DVIs; SmallVector DVRs; findDbgValues(DVIs, V, &DVRs); - for (DbgValueInst *DVI : DVIs) - if (Checker.isDefinitionAcrossSuspend(*V, DVI)) - Spills[V].push_back(DVI); + assert(DVIs.empty()); // Add the instructions which carry debug info that is in the frame. for (DbgVariableRecord *DVR : DVRs) if (Checker.isDefinitionAcrossSuspend(*V, DVR->Marker->MarkedInstr)) diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp index d4555e9435f1d..f5525deb0172f 100644 --- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp +++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp @@ -572,7 +572,7 @@ void MergeFunctions::filterInstsUnrelatedToPDI( // Work out whether a dbg.value intrinsic or an equivalent DbgVariableRecord // is a parameter to be preserved. - auto ExamineDbgValue = [](auto *DbgVal, auto &Container) { + auto ExamineDbgValue = [&PDVRRelated](DbgVariableRecord *DbgVal) { LLVM_DEBUG(dbgs() << " Deciding: "); LLVM_DEBUG(DbgVal->print(dbgs())); LLVM_DEBUG(dbgs() << "\n"); @@ -581,7 +581,7 @@ void MergeFunctions::filterInstsUnrelatedToPDI( LLVM_DEBUG(dbgs() << " Include (parameter): "); LLVM_DEBUG(DbgVal->print(dbgs())); LLVM_DEBUG(dbgs() << "\n"); - Container.insert(DbgVal); + PDVRRelated.insert(DbgVal); } else { LLVM_DEBUG(dbgs() << " Delete (!parameter): "); LLVM_DEBUG(DbgVal->print(dbgs())); @@ -589,7 +589,8 @@ void MergeFunctions::filterInstsUnrelatedToPDI( } }; - auto ExamineDbgDeclare = [&PDIRelated](auto *DbgDecl, auto &Container) { + auto ExamineDbgDeclare = [&PDIRelated, + &PDVRRelated](DbgVariableRecord *DbgDecl) { LLVM_DEBUG(dbgs() << " Deciding: "); LLVM_DEBUG(DbgDecl->print(dbgs())); LLVM_DEBUG(dbgs() << "\n"); @@ -616,7 +617,7 @@ void MergeFunctions::filterInstsUnrelatedToPDI( LLVM_DEBUG(dbgs() << " Include: "); LLVM_DEBUG(DbgDecl->print(dbgs())); 
LLVM_DEBUG(dbgs() << "\n"); - Container.insert(DbgDecl); + PDVRRelated.insert(DbgDecl); } else { LLVM_DEBUG(dbgs() << " Delete (!parameter): "); LLVM_DEBUG(SI->print(dbgs())); @@ -647,18 +648,14 @@ void MergeFunctions::filterInstsUnrelatedToPDI( // they connected to parameters? for (DbgVariableRecord &DVR : filterDbgVars(BI->getDbgRecordRange())) { if (DVR.isDbgValue() || DVR.isDbgAssign()) { - ExamineDbgValue(&DVR, PDVRRelated); + ExamineDbgValue(&DVR); } else { assert(DVR.isDbgDeclare()); - ExamineDbgDeclare(&DVR, PDVRRelated); + ExamineDbgDeclare(&DVR); } } - if (auto *DVI = dyn_cast(&*BI)) { - ExamineDbgValue(DVI, PDIRelated); - } else if (auto *DDI = dyn_cast(&*BI)) { - ExamineDbgDeclare(DDI, PDIRelated); - } else if (BI->isTerminator() && &*BI == GEntryBlock->getTerminator()) { + if (BI->isTerminator() && &*BI == GEntryBlock->getTerminator()) { LLVM_DEBUG(dbgs() << " Will Include Terminator: "); LLVM_DEBUG(BI->print(dbgs())); LLVM_DEBUG(dbgs() << "\n"); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index a8bfd8c072d2f..503611a4fc32c 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1420,21 +1420,16 @@ void InstCombinerImpl::freelyInvertAllUsersOf(Value *I, Value *IgnoredUser) { SmallVector DbgValues; SmallVector DbgVariableRecords; llvm::findDbgValues(DbgValues, I, &DbgVariableRecords); + assert(DbgValues.empty()); - auto InvertDbgValueUse = [&](auto *DbgVal) { + for (DbgVariableRecord *DbgVal : DbgVariableRecords) { SmallVector Ops = {dwarf::DW_OP_not}; for (unsigned Idx = 0, End = DbgVal->getNumVariableLocationOps(); Idx != End; ++Idx) if (DbgVal->getVariableLocationOp(Idx) == I) DbgVal->setExpression( DIExpression::appendOpsToArg(DbgVal->getExpression(), Ops, Idx)); - }; - - for (DbgValueInst *DVI : DbgValues) - InvertDbgValueUse(DVI); - - for (DbgVariableRecord *DVR : DbgVariableRecords) 
- InvertDbgValueUse(DVR); + } } /// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index b5dbef13289ac..4d1f44076db7e 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1979,15 +1979,13 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB, // Find debug values outside of the block findDbgValues(DbgValues, &I, &DbgVariableRecords); - llvm::erase_if(DbgValues, [&](const DbgValueInst *DbgVal) { - return DbgVal->getParent() == BB; - }); + assert(DbgValues.empty()); llvm::erase_if(DbgVariableRecords, [&](const DbgVariableRecord *DbgVarRec) { return DbgVarRec->getParent() == BB; }); // If there are no uses outside the block, we're done with this instruction. - if (UsesToRename.empty() && DbgValues.empty() && DbgVariableRecords.empty()) + if (UsesToRename.empty() && DbgVariableRecords.empty()) continue; LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n"); @@ -2000,8 +1998,7 @@ void JumpThreadingPass::updateSSA(BasicBlock *BB, BasicBlock *NewBB, while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); - if (!DbgValues.empty() || !DbgVariableRecords.empty()) { - SSAUpdate.UpdateDebugValues(&I, DbgValues); + if (!DbgVariableRecords.empty()) { SSAUpdate.UpdateDebugValues(&I, DbgVariableRecords); DbgValues.clear(); DbgVariableRecords.clear(); @@ -2032,32 +2029,7 @@ void JumpThreadingPass::cloneInstructions(ValueToValueMapTy &ValueMapping, // copy of the block 'NewBB'. If there are PHI nodes in the source basic // block, evaluate them to account for entry from PredBB. - // Retargets llvm.dbg.value to any renamed variables. 
- auto RetargetDbgValueIfPossible = [&](Instruction *NewInst) -> bool { - auto DbgInstruction = dyn_cast(NewInst); - if (!DbgInstruction) - return false; - - SmallSet, 16> OperandsToRemap; - for (auto DbgOperand : DbgInstruction->location_ops()) { - auto DbgOperandInstruction = dyn_cast(DbgOperand); - if (!DbgOperandInstruction) - continue; - - auto I = ValueMapping.find(DbgOperandInstruction); - if (I != ValueMapping.end()) { - OperandsToRemap.insert( - std::pair(DbgOperand, I->second)); - } - } - - for (auto &[OldOp, MappedOp] : OperandsToRemap) - DbgInstruction->replaceVariableLocationOp(OldOp, MappedOp); - return true; - }; - - // Duplicate implementation of the above dbg.value code, using - // DbgVariableRecords instead. + // Retargets dbg.value to any renamed variables. auto RetargetDbgVariableRecordIfPossible = [&](DbgVariableRecord *DVR) { SmallSet, 16> OperandsToRemap; for (auto *Op : DVR->location_ops()) { @@ -2116,9 +2088,6 @@ void JumpThreadingPass::cloneInstructions(ValueToValueMapTy &ValueMapping, if (const DebugLoc &DL = New->getDebugLoc()) mapAtomInstance(DL, ValueMapping); - if (RetargetDbgValueIfPossible(New)) - continue; - // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) if (Instruction *Inst = dyn_cast(New->getOperand(i))) { diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index dc8fa4379752f..636bd81ce0755 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -6630,13 +6630,10 @@ struct SCEVDbgValueBuilder { /// Holds all the required data to salvage a dbg.value using the pre-LSR SCEVs /// and DIExpression. 
struct DVIRecoveryRec { - DVIRecoveryRec(DbgValueInst *DbgValue) - : DbgRef(DbgValue), Expr(DbgValue->getExpression()), - HadLocationArgList(false) {} DVIRecoveryRec(DbgVariableRecord *DVR) : DbgRef(DVR), Expr(DVR->getExpression()), HadLocationArgList(false) {} - PointerUnion DbgRef; + DbgVariableRecord *DbgRef; DIExpression *Expr; bool HadLocationArgList; SmallVector LocationOps; @@ -6695,44 +6692,38 @@ static void updateDVIWithLocations(T &DbgVal, } /// Write the new expression and new location ops for the dbg.value. If possible -/// reduce the szie of the dbg.value intrinsic by omitting DIArglist. This +/// reduce the szie of the dbg.value by omitting DIArglist. This /// can be omitted if: /// 1. There is only a single location, refenced by a single DW_OP_llvm_arg. /// 2. The DW_OP_LLVM_arg is the first operand in the expression. -static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec, - SmallVectorImpl &NewLocationOps, - SmallVectorImpl &NewExpr) { - auto UpdateDbgValueInstImpl = [&](auto *DbgVal) { - unsigned NumLLVMArgs = numLLVMArgOps(NewExpr); - if (NumLLVMArgs == 0) { - // Location assumed to be on the stack. - updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr); - } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) { - // There is only a single DW_OP_llvm_arg at the start of the expression, - // so it can be omitted along with DIArglist. - assert(NewExpr[1] == 0 && - "Lone LLVM_arg in a DIExpression should refer to location-op 0."); - llvm::SmallVector ShortenedOps(llvm::drop_begin(NewExpr, 2)); - updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps); - } else { - // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary. 
- updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr); - } +static void UpdateDbgValue(DVIRecoveryRec &DVIRec, + SmallVectorImpl &NewLocationOps, + SmallVectorImpl &NewExpr) { + DbgVariableRecord *DbgVal = DVIRec.DbgRef; + unsigned NumLLVMArgs = numLLVMArgOps(NewExpr); + if (NumLLVMArgs == 0) { + // Location assumed to be on the stack. + updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr); + } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) { + // There is only a single DW_OP_llvm_arg at the start of the expression, + // so it can be omitted along with DIArglist. + assert(NewExpr[1] == 0 && + "Lone LLVM_arg in a DIExpression should refer to location-op 0."); + llvm::SmallVector ShortenedOps(llvm::drop_begin(NewExpr, 2)); + updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps); + } else { + // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary. + updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr); + } - // If the DIExpression was previously empty then add the stack terminator. - // Non-empty expressions have only had elements inserted into them and so - // the terminator should already be present e.g. stack_value or fragment. - DIExpression *SalvageExpr = DbgVal->getExpression(); - if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) { - SalvageExpr = - DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value}); - DbgVal->setExpression(SalvageExpr); - } - }; - if (isa(DVIRec.DbgRef)) - UpdateDbgValueInstImpl(cast(DVIRec.DbgRef)); - else - UpdateDbgValueInstImpl(cast(DVIRec.DbgRef)); + // If the DIExpression was previously empty then add the stack terminator. + // Non-empty expressions have only had elements inserted into them and so + // the terminator should already be present e.g. stack_value or fragment. 
+ DIExpression *SalvageExpr = DbgVal->getExpression(); + if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) { + SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value}); + DbgVal->setExpression(SalvageExpr); + } } /// Cached location ops may be erased during LSR, in which case a poison is @@ -6746,39 +6737,34 @@ static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) { /// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values. static void restorePreTransformState(DVIRecoveryRec &DVIRec) { - auto RestorePreTransformStateImpl = [&](auto *DbgVal) { - LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n" - << "scev-salvage: post-LSR: " << *DbgVal << '\n'); - assert(DVIRec.Expr && "Expected an expression"); - DbgVal->setExpression(DVIRec.Expr); - - // Even a single location-op may be inside a DIArgList and referenced with - // DW_OP_LLVM_arg, which is valid only with a DIArgList. - if (!DVIRec.HadLocationArgList) { - assert(DVIRec.LocationOps.size() == 1 && - "Unexpected number of location ops."); - // LSR's unsuccessful salvage attempt may have added DIArgList, which in - // this case was not present before, so force the location back to a - // single uncontained Value. 
- Value *CachedValue = - getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext()); - DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue)); - } else { - SmallVector MetadataLocs; - for (WeakVH VH : DVIRec.LocationOps) { - Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext()); - MetadataLocs.push_back(ValueAsMetadata::get(CachedValue)); - } - auto ValArrayRef = llvm::ArrayRef(MetadataLocs); - DbgVal->setRawLocation( - llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef)); + DbgVariableRecord *DbgVal = DVIRec.DbgRef; + LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n" + << "scev-salvage: post-LSR: " << *DbgVal << '\n'); + assert(DVIRec.Expr && "Expected an expression"); + DbgVal->setExpression(DVIRec.Expr); + + // Even a single location-op may be inside a DIArgList and referenced with + // DW_OP_LLVM_arg, which is valid only with a DIArgList. + if (!DVIRec.HadLocationArgList) { + assert(DVIRec.LocationOps.size() == 1 && + "Unexpected number of location ops."); + // LSR's unsuccessful salvage attempt may have added DIArgList, which in + // this case was not present before, so force the location back to a + // single uncontained Value. 
+ Value *CachedValue = + getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext()); + DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue)); + } else { + SmallVector MetadataLocs; + for (WeakVH VH : DVIRec.LocationOps) { + Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext()); + MetadataLocs.push_back(ValueAsMetadata::get(CachedValue)); } - LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n'); - }; - if (isa(DVIRec.DbgRef)) - RestorePreTransformStateImpl(cast(DVIRec.DbgRef)); - else - RestorePreTransformStateImpl(cast(DVIRec.DbgRef)); + auto ValArrayRef = llvm::ArrayRef(MetadataLocs); + DbgVal->setRawLocation( + llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef)); + } + LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n'); } static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, @@ -6786,9 +6772,7 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, const SCEV *SCEVInductionVar, SCEVDbgValueBuilder IterCountExpr) { - if (isa(DVIRec.DbgRef) - ? !cast(DVIRec.DbgRef)->isKillLocation() - : !cast(DVIRec.DbgRef)->isKillLocation()) + if (!DVIRec.DbgRef->isKillLocation()) return false; // LSR may have caused several changes to the dbg.value in the failed salvage @@ -6882,13 +6866,8 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE, DbgBuilder->appendToVectors(NewExpr, NewLocationOps); } - UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr); - if (isa(DVIRec.DbgRef)) - LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " - << *cast(DVIRec.DbgRef) << "\n"); - else - LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " - << *cast(DVIRec.DbgRef) << "\n"); + UpdateDbgValue(DVIRec, NewLocationOps, NewExpr); + LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DbgRef << "\n"); return true; } @@ -6934,21 +6913,23 @@ static void DbgRewriteSalvageableDVIs( /// cacheing and salvaging. 
static void DbgGatherSalvagableDVI( Loop *L, ScalarEvolution &SE, - SmallVector, 2> &SalvageableDVISCEVs, - SmallSet, 2> &DVIHandles) { + SmallVector, 2> &SalvageableDVISCEVs) { for (const auto &B : L->getBlocks()) { for (auto &I : *B) { - auto ProcessDbgValue = [&](auto *DbgVal) -> bool { + for (DbgVariableRecord &DbgVal : filterDbgVars(I.getDbgRecordRange())) { + if (!DbgVal.isDbgValue() && !DbgVal.isDbgAssign()) + continue; + // Ensure that if any location op is undef that the dbg.vlue is not // cached. - if (DbgVal->isKillLocation()) - return false; + if (DbgVal.isKillLocation()) + continue; // Check that the location op SCEVs are suitable for translation to // DIExpression. const auto &HasTranslatableLocationOps = - [&](const auto *DbgValToTranslate) -> bool { - for (const auto LocOp : DbgValToTranslate->location_ops()) { + [&](const DbgVariableRecord &DbgValToTranslate) -> bool { + for (const auto LocOp : DbgValToTranslate.location_ops()) { if (!LocOp) return false; @@ -6963,31 +6944,21 @@ static void DbgGatherSalvagableDVI( }; if (!HasTranslatableLocationOps(DbgVal)) - return false; + continue; std::unique_ptr NewRec = - std::make_unique(DbgVal); + std::make_unique(&DbgVal); // Each location Op may need a SCEVDbgValueBuilder in order to recover // it. Pre-allocating a vector will enable quick lookups of the builder // later during the salvage. 
- NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps()); - for (const auto LocOp : DbgVal->location_ops()) { + NewRec->RecoveryExprs.resize(DbgVal.getNumVariableLocationOps()); + for (const auto LocOp : DbgVal.location_ops()) { NewRec->SCEVs.push_back(SE.getSCEV(LocOp)); NewRec->LocationOps.push_back(LocOp); - NewRec->HadLocationArgList = DbgVal->hasArgList(); + NewRec->HadLocationArgList = DbgVal.hasArgList(); } SalvageableDVISCEVs.push_back(std::move(NewRec)); - return true; - }; - for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { - if (DVR.isDbgValue() || DVR.isDbgAssign()) - ProcessDbgValue(&DVR); } - auto DVI = dyn_cast(&I); - if (!DVI) - continue; - if (ProcessDbgValue(DVI)) - DVIHandles.insert(DVI); } } } @@ -7036,8 +7007,7 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // Debug preservation - before we start removing anything identify which DVI // meet the salvageable criteria and store their DIExpression and SCEVs. SmallVector, 2> SalvageableDVIRecords; - SmallSet, 2> DVIHandles; - DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords, DVIHandles); + DbgGatherSalvagableDVI(L, SE, SalvageableDVIRecords); bool Changed = false; std::unique_ptr MSSAU; @@ -7105,7 +7075,6 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, for (auto &Rec : SalvageableDVIRecords) Rec->clear(); SalvageableDVIRecords.clear(); - DVIHandles.clear(); return Changed; } diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index a1f030a336c15..4210ce6da1eb2 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -808,9 +808,6 @@ bool checkDebugifyMetadata(Module &M, // Find missing lines. 
for (Instruction &I : instructions(F)) { - if (isa(&I)) - continue; - auto DL = I.getDebugLoc(); if (DL && DL.getLine() != 0) { MissingLines.reset(DL.getLine() - 1); @@ -839,10 +836,6 @@ bool checkDebugifyMetadata(Module &M, for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) if (DVR.isDbgValue() || DVR.isDbgAssign()) CheckForMisSized(&DVR); - auto *DVI = dyn_cast(&I); - if (!DVI) - continue; - CheckForMisSized(DVI); } } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index ee3e56c3c6db9..d481ad9dee181 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1606,12 +1606,8 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar, SmallVector DbgValues; SmallVector DbgVariableRecords; findDbgValues(DbgValues, APN, &DbgVariableRecords); - for (auto *DVI : DbgValues) { - assert(is_contained(DVI->getValues(), APN)); - if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr)) - return true; - } - for (auto *DVR : DbgVariableRecords) { + assert(DbgValues.empty()); + for (DbgVariableRecord *DVR : DbgVariableRecords) { assert(is_contained(DVR->location_ops(), APN)); if ((DVR->getVariable() == DIVar) && (DVR->getExpression() == DIExpr)) return true; @@ -1970,7 +1966,6 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress, static void updateOneDbgValueForAlloca(const DebugLoc &Loc, DILocalVariable *DIVar, DIExpression *DIExpr, Value *NewAddress, - DbgValueInst *DVI, DbgVariableRecord *DVR, DIBuilder &Builder, int Offset) { assert(DIVar && "Missing variable"); @@ -1986,14 +1981,8 @@ static void updateOneDbgValueForAlloca(const DebugLoc &Loc, if (Offset) DIExpr = DIExpression::prepend(DIExpr, 0, Offset); - if (DVI) { - DVI->setExpression(DIExpr); - DVI->replaceVariableLocationOp(0u, NewAddress); - } else { - assert(DVR); - DVR->setExpression(DIExpr); - DVR->replaceVariableLocationOp(0u, NewAddress); - } + DVR->setExpression(DIExpr); + 
DVR->replaceVariableLocationOp(0u, NewAddress); } void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress, @@ -2001,18 +1990,13 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress, SmallVector DbgUsers; SmallVector DPUsers; findDbgValues(DbgUsers, AI, &DPUsers); - - // Attempt to replace dbg.values that use this alloca. - for (auto *DVI : DbgUsers) - updateOneDbgValueForAlloca(DVI->getDebugLoc(), DVI->getVariable(), - DVI->getExpression(), NewAllocaAddress, DVI, - nullptr, Builder, Offset); + assert(DbgUsers.empty()); // Replace any DbgVariableRecords that use this alloca. for (DbgVariableRecord *DVR : DPUsers) updateOneDbgValueForAlloca(DVR->getDebugLoc(), DVR->getVariable(), - DVR->getExpression(), NewAllocaAddress, nullptr, - DVR, Builder, Offset); + DVR->getExpression(), NewAllocaAddress, DVR, + Builder, Offset); } /// Where possible to salvage debug information for \p I do so. diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 66d0573e83f65..06115e0741ade 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -161,29 +161,8 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, SmallVector DbgValues; SmallVector DbgVariableRecords; llvm::findDbgValues(DbgValues, OrigHeaderVal, &DbgVariableRecords); - for (auto &DbgValue : DbgValues) { - // The original users in the OrigHeader are already using the original - // definitions. - BasicBlock *UserBB = DbgValue->getParent(); - if (UserBB == OrigHeader) - continue; - - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped and anything else can be handled by - // the SSAUpdater. To avoid adding PHINodes, check if the value is - // available in UserBB, if not substitute poison. 
- Value *NewVal; - if (UserBB == OrigPreheader) - NewVal = OrigPreHeaderVal; - else if (SSA.HasValueForBlock(UserBB)) - NewVal = SSA.GetValueInMiddleOfBlock(UserBB); - else - NewVal = PoisonValue::get(OrigHeaderVal->getType()); - DbgValue->replaceVariableLocationOp(OrigHeaderVal, NewVal); - } + assert(DbgValues.empty()); - // RemoveDIs: duplicate implementation for non-instruction debug-info - // storage in DbgVariableRecords. for (DbgVariableRecord *DVR : DbgVariableRecords) { // The original users in the OrigHeader are already using the original // definitions. diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 5db7fc956c497..561c898ec55d8 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -200,11 +200,7 @@ void SSAUpdater::UpdateDebugValues(Instruction *I) { SmallVector DbgValues; SmallVector DbgVariableRecords; llvm::findDbgValues(DbgValues, I, &DbgVariableRecords); - for (auto &DbgValue : DbgValues) { - if (DbgValue->getParent() == I->getParent()) - continue; - UpdateDebugValue(I, DbgValue); - } + assert(DbgValues.empty()); for (auto &DVR : DbgVariableRecords) { if (DVR->getParent() == I->getParent()) continue; @@ -212,13 +208,6 @@ void SSAUpdater::UpdateDebugValues(Instruction *I) { } } -void SSAUpdater::UpdateDebugValues(Instruction *I, - SmallVectorImpl &DbgValues) { - for (auto &DbgValue : DbgValues) { - UpdateDebugValue(I, DbgValue); - } -} - void SSAUpdater::UpdateDebugValues( Instruction *I, SmallVectorImpl &DbgVariableRecords) { for (auto &DVR : DbgVariableRecords) { @@ -226,15 +215,6 @@ void SSAUpdater::UpdateDebugValues( } } -void SSAUpdater::UpdateDebugValue(Instruction *I, DbgValueInst *DbgValue) { - BasicBlock *UserBB = DbgValue->getParent(); - if (HasValueForBlock(UserBB)) { - Value *NewVal = GetValueAtEndOfBlock(UserBB); - DbgValue->replaceVariableLocationOp(I, NewVal); - } else - DbgValue->setKillLocation(); -} - void 
SSAUpdater::UpdateDebugValue(Instruction *I, DbgVariableRecord *DVR) { BasicBlock *UserBB = DVR->getParent(); if (HasValueForBlock(UserBB)) { diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp index 422eb855ba2cf..2b43d27f292a0 100644 --- a/llvm/tools/llvm-dis/llvm-dis.cpp +++ b/llvm/tools/llvm-dis/llvm-dis.cpp @@ -131,20 +131,6 @@ class CommentWriter : public AssemblyAnnotationWriter { printDebugLoc(DL,OS); OS << "]"; } - if (const DbgDeclareInst *DDI = dyn_cast(I)) { - if (!Padded) { - OS.PadToColumn(50); - OS << ";"; - } - OS << " [debug variable = " << DDI->getVariable()->getName() << "]"; - } - else if (const DbgValueInst *DVI = dyn_cast(I)) { - if (!Padded) { - OS.PadToColumn(50); - OS << ";"; - } - OS << " [debug variable = " << DVI->getVariable()->getName() << "]"; - } } } }; diff --git a/llvm/unittests/CodeGen/LexicalScopesTest.cpp b/llvm/unittests/CodeGen/LexicalScopesTest.cpp index 3d707462fa615..563d496d1e600 100644 --- a/llvm/unittests/CodeGen/LexicalScopesTest.cpp +++ b/llvm/unittests/CodeGen/LexicalScopesTest.cpp @@ -67,7 +67,7 @@ class LexicalScopesTest : public testing::Test { BeanInst.Opcode = 1; BeanInst.Size = 1; - memset(&DbgValueInst, 0, sizeof(DbgValueInst)); + memset(&DbgValueInst, 0, sizeof(MCInstrDesc)); DbgValueInst.Opcode = TargetOpcode::DBG_VALUE; DbgValueInst.Size = 1; DbgValueInst.Flags = 1U << MCID::Meta; diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index 41bf863420304..baa13e1199eea 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -188,6 +188,7 @@ TEST(MetadataTest, DeleteInstUsedByDbgRecord) { SmallVector DVIs; SmallVector DVRs; findDbgValues(DVIs, &I, &DVRs); + assert(DVIs.empty()); // Delete %b. The dbg.value should now point to undef. 
I.eraseFromParent(); @@ -314,6 +315,7 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) { SmallVector DVIs; SmallVector DVRs; findDbgValues(DVIs, &I, &DVRs); + assert(DVIs.empty()); ASSERT_EQ(DVRs.size(), 2u); // Delete %b. The DbgVariableRecord should now point to undef. From 369f749dc434ec0339f5fb13376e1bc92e1d51d9 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 18 Jul 2025 12:46:56 +0200 Subject: [PATCH 325/813] [SLP] Remove lifetime.start on null pointer in test (NFC) --- .../SLPVectorizer/X86/split-node-reorder-node-with-ops.ll | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll index 8e09847e9264e..cfff11758a37a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-node-reorder-node-with-ops.ll @@ -58,7 +58,6 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) { ; CHECK-NEXT: br label %[[BB54:.*]] ; CHECK: [[BB54]]: ; CHECK-NEXT: [[TMP54:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP17]]) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 0, ptr null) ; CHECK-NEXT: [[TMP55:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[TMP21]]) ; CHECK-NEXT: [[TMP56:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <8 x float> [[TMP56]], <8 x float> poison, <8 x i32> @@ -198,7 +197,6 @@ define void @test(i32 %0, i8 %1, i64 %2, float %3) { %95 = or i64 %94, %91 %96 = or i64 %95, %37 store i64 %96, ptr null, align 1 - call void @llvm.lifetime.start.p0(i64 0, ptr null) store i64 %42, ptr null, align 1 %97 = bitcast float %3 to i32 %98 = icmp ult i32 %97, 1325400064 From 6c63316ee17462c97c722a960680b2b45d2fff4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= 
=?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 18 Jul 2025 04:17:15 -0700 Subject: [PATCH 326/813] [flang][cuda] Support device component in a pointer or allocatable derived-type (#149418) --- flang/lib/Lower/ConvertVariable.cpp | 26 ++++++++++++++-- flang/test/Lower/CUDA/cuda-set-allocator.cuf | 32 ++++++++++++++++++++ 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index 6cda742874ccf..cacf4e249aa28 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -813,6 +813,10 @@ initializeDeviceComponentAllocator(Fortran::lower::AbstractConverter &converter, if (auto boxTy = mlir::dyn_cast(baseTy)) baseTy = boxTy.getEleTy(); baseTy = fir::unwrapRefType(baseTy); + + if (mlir::isa(baseTy)) + TODO(loc, "array of derived-type with device component"); + auto recTy = mlir::dyn_cast(fir::unwrapSequenceType(baseTy)); assert(recTy && "expected fir::RecordType"); @@ -823,7 +827,7 @@ initializeDeviceComponentAllocator(Fortran::lower::AbstractConverter &converter, if (Fortran::semantics::IsDeviceAllocatable(sym)) { unsigned fieldIdx = recTy.getFieldIndex(sym.name().ToString()); mlir::Type fieldTy; - std::vector coordinates; + llvm::SmallVector coordinates; if (fieldIdx != std::numeric_limits::max()) { // Field found in the base record type. 
@@ -866,8 +870,24 @@ initializeDeviceComponentAllocator(Fortran::lower::AbstractConverter &converter, TODO(loc, "device resident component in complex derived-type " "hierarchy"); - mlir::Value comp = builder.create( - loc, builder.getRefType(fieldTy), fir::getBase(exv), coordinates); + mlir::Value base = fir::getBase(exv); + mlir::Value comp; + if (mlir::isa(fir::unwrapRefType(base.getType()))) { + mlir::Value box = builder.create(loc, base); + mlir::Value addr = builder.create(loc, box); + llvm::SmallVector lenParams; + assert(coordinates.size() == 1 && "expect one coordinate"); + auto field = mlir::dyn_cast( + coordinates[0].getDefiningOp()); + comp = builder.create( + loc, builder.getRefType(fieldTy), addr, + /*component=*/field.getFieldName(), + /*componentShape=*/mlir::Value{}, + hlfir::DesignateOp::Subscripts{}); + } else { + comp = builder.create( + loc, builder.getRefType(fieldTy), base, coordinates); + } cuf::DataAttributeAttr dataAttr = Fortran::lower::translateSymbolCUFDataAttribute( builder.getContext(), sym); diff --git a/flang/test/Lower/CUDA/cuda-set-allocator.cuf b/flang/test/Lower/CUDA/cuda-set-allocator.cuf index ee89ea38a3fc7..e3bb181f65398 100644 --- a/flang/test/Lower/CUDA/cuda-set-allocator.cuf +++ b/flang/test/Lower/CUDA/cuda-set-allocator.cuf @@ -21,4 +21,36 @@ contains ! CHECK: %[[Z:.*]] = fir.coordinate_of %[[DT]]#0, z : (!fir.ref>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> ! CHECK: cuf.set_allocator_idx %[[Z]] : !fir.ref>>> {data_attr = #cuf.cuda} + subroutine sub2() + type(ty_device), pointer :: d1 + end subroutine + +! CHECK-LABEL: func.func @_QMm1Psub2() +! CHECK: %[[ALLOC:.*]] = cuf.alloc !fir.box>>,y:i32,z:!fir.box>>}>>> {bindc_name = "d1", data_attr = #cuf.cuda, uniq_name = "_QMm1Fsub2Ed1"} -> !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! 
CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMm1Fsub2Ed1"} : (!fir.ref>>,y:i32,z:!fir.box>>}>>>>) -> (!fir.ref>>,y:i32,z:!fir.box>>}>>>>, !fir.ref>>,y:i32,z:!fir.box>>}>>>>) +! CHECK: %[[LOAD1:.*]] = fir.load %[[DECL]]#0 : !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! CHECK: %[[ADDR1:.*]] = fir.box_addr %[[LOAD1]] : (!fir.box>>,y:i32,z:!fir.box>>}>>>) -> !fir.ptr>>,y:i32,z:!fir.box>>}>> +! CHECK: %[[DESIGNATE1:.*]] = hlfir.designate %[[ADDR1]]{"x"} : (!fir.ptr>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> +! CHECK: cuf.set_allocator_idx %[[DESIGNATE1]] : !fir.ref>>> {data_attr = #cuf.cuda} +! CHECK: %[[LOAD2:.*]] = fir.load %[[DECL]]#0 : !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! CHECK: %[[ADDR2:.*]] = fir.box_addr %[[LOAD2]] : (!fir.box>>,y:i32,z:!fir.box>>}>>>) -> !fir.ptr>>,y:i32,z:!fir.box>>}>> +! CHECK: %[[DESIGNATE2:.*]] = hlfir.designate %[[ADDR2]]{"z"} : (!fir.ptr>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> +! CHECK: cuf.set_allocator_idx %[[DESIGNATE2]] : !fir.ref>>> {data_attr = #cuf.cuda} + + subroutine sub3() + type(ty_device), allocatable :: d1 + end subroutine + +! CHECK-LABEL: func.func @_QMm1Psub3() +! CHECK: %[[ALLOC:.*]] = cuf.alloc !fir.box>>,y:i32,z:!fir.box>>}>>> {bindc_name = "d1", data_attr = #cuf.cuda, uniq_name = "_QMm1Fsub3Ed1"} -> !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! CHECK: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]] {data_attr = #cuf.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QMm1Fsub3Ed1"} : (!fir.ref>>,y:i32,z:!fir.box>>}>>>>) -> (!fir.ref>>,y:i32,z:!fir.box>>}>>>>, !fir.ref>>,y:i32,z:!fir.box>>}>>>>) +! CHECK: %[[LOAD1:.*]] = fir.load %[[DECL]]#0 : !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! CHECK: %[[ADDR1:.*]] = fir.box_addr %[[LOAD1]] : (!fir.box>>,y:i32,z:!fir.box>>}>>>) -> !fir.heap>>,y:i32,z:!fir.box>>}>> +! CHECK: %[[DESIGNATE1:.*]] = hlfir.designate %[[ADDR1]]{"x"} : (!fir.heap>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> +! 
CHECK: cuf.set_allocator_idx %[[DESIGNATE1]] : !fir.ref>>> {data_attr = #cuf.cuda} +! CHECK: %[[LOAD2:.*]] = fir.load %[[DECL]]#0 : !fir.ref>>,y:i32,z:!fir.box>>}>>>> +! CHECK: %[[ADDR2:.*]] = fir.box_addr %[[LOAD2]] : (!fir.box>>,y:i32,z:!fir.box>>}>>>) -> !fir.heap>>,y:i32,z:!fir.box>>}>> +! CHECK: %[[DESIGNATE2:.*]] = hlfir.designate %[[ADDR2]]{"z"} : (!fir.heap>>,y:i32,z:!fir.box>>}>>) -> !fir.ref>>> +! CHECK: cuf.set_allocator_idx %[[DESIGNATE2]] : !fir.ref>>> {data_attr = #cuf.cuda} + end module From 4bbc70ed28a85036fb718e86424bb1d8a643005f Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 18 Jul 2025 08:11:55 -0400 Subject: [PATCH 327/813] [gn] port d994487db78 (llvm-ir2vec) --- llvm/utils/gn/secondary/llvm/test/BUILD.gn | 1 + llvm/utils/gn/secondary/llvm/tools/llvm-ir2vec/BUILD.gn | 9 +++++++++ 2 files changed, 10 insertions(+) create mode 100644 llvm/utils/gn/secondary/llvm/tools/llvm-ir2vec/BUILD.gn diff --git a/llvm/utils/gn/secondary/llvm/test/BUILD.gn b/llvm/utils/gn/secondary/llvm/test/BUILD.gn index 946b63f8a54fb..7ed0d3c6824e2 100644 --- a/llvm/utils/gn/secondary/llvm/test/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/test/BUILD.gn @@ -286,6 +286,7 @@ group("test") { "//llvm/tools/llvm-extract", "//llvm/tools/llvm-gsymutil:llvm-gsymutil", "//llvm/tools/llvm-ifs", + "//llvm/tools/llvm-ir2vec", "//llvm/tools/llvm-isel-fuzzer", "//llvm/tools/llvm-jitlink", "//llvm/tools/llvm-jitlink/llvm-jitlink-executor", diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-ir2vec/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-ir2vec/BUILD.gn new file mode 100644 index 0000000000000..07a795122c76c --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-ir2vec/BUILD.gn @@ -0,0 +1,9 @@ +executable("llvm-ir2vec") { + deps = [ + "//llvm/lib/Analysis", + "//llvm/lib/IR", + "//llvm/lib/IRReader", + "//llvm/lib/Support", + ] + sources = [ "llvm-ir2vec.cpp" ] +} From 8b068149547cb3043e4427899851dc70ca1eb885 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 18 
Jul 2025 12:12:24 +0000 Subject: [PATCH 328/813] [gn build] Port 8f3e78f9715c --- llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index d4adeddd9b4e4..3d11ce566207a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -176,6 +176,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUPreLegalizerCombiner.cpp", "AMDGPUPreloadKernArgProlog.cpp", "AMDGPUPreloadKernelArguments.cpp", + "AMDGPUPrepareAGPRAlloc.cpp", "AMDGPUPrintfRuntimeBinding.cpp", "AMDGPUPromoteAlloca.cpp", "AMDGPUPromoteKernelArguments.cpp", From 4c701956341ff88f580d240be072461a1ba6d7f5 Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Fri, 18 Jul 2025 20:28:43 +0800 Subject: [PATCH 329/813] [mlir][transform] Fix ch2 and additional documentation (#148407) Fixed error code in example.In addition to this, the content in the documentation has been improved by adding links to the code repository. --- mlir/docs/Tutorials/transform/Ch2.md | 30 +++++++++++++++------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/mlir/docs/Tutorials/transform/Ch2.md b/mlir/docs/Tutorials/transform/Ch2.md index 0f45f5607bab9..08c4e0f9d859c 100644 --- a/mlir/docs/Tutorials/transform/Ch2.md +++ b/mlir/docs/Tutorials/transform/Ch2.md @@ -133,6 +133,8 @@ This will generate two files, `MyExtension.h.inc` and `MyExtension.cpp.inc`, tha ```c++ // In MyExtension.cpp. +#include "MyExtension.h" + #define GET_OP_CLASSES #include "MyExtension.cpp.inc" @@ -283,7 +285,7 @@ void registerMyExtension(::mlir::DialectRegistry ®istry) { } ``` -After registering the extension, it becomes possible to use our new operation in the Transform dialect interpreter. The upstream testing pass can be used as is. 
+After registering the extension, it becomes possible to use our new operation in the Transform dialect interpreter. The upstream testing pass can be used as is. It actually exists in `mlir/test/Examples/transform/Ch2/sequence.mlir`, which contains the `microkernel` implementation. ```mlir module attributes {transform.with_named_sequence} { @@ -300,7 +302,7 @@ module attributes {transform.with_named_sequence} { // The actual tiling transformation takes tile sizes as attributes. It // produces a handle to the loop generated during tiling. - %loop, %tiled = transform.structured.tile_using_forall %max + %tiled, %loop = transform.structured.tile_using_forall %max tile_sizes [8, 32] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) @@ -311,32 +313,32 @@ module attributes {transform.with_named_sequence} { // a single handle to all operations and give it to // `fuse_into_containing_op` that would take care of the ordering in this // case. - %add_fused = transform.structured.fuse_into_containing_op %add into %loop - : (!transform.any_op, !transform.any_op) -> !transform.any_op - %matmul_fused = transform.structured.fuse_into_containing_op %arg1 - into %loop + %add_fused, %loop2 = transform.structured.fuse_into_containing_op %add into %loop + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + %matmul_fused, %loop3 = transform.structured.fuse_into_containing_op %arg1 + into %loop2 : (!transform.op<"linalg.matmul">, !transform.any_op) - -> !transform.any_op + -> (!transform.any_op, !transform.any_op) // Tile again to get the desired size. Note that this time this tiles the // "add" operation and fuses matmul into the loop, but doesn't affect the // "max" operation. This illustrates the precise targeting with the // transform dialect. Otherwise, it is difficult to differentiate "add" and // "max", both of which having the same kind. 
- %loop_2, %tiled_2 = transform.structured.tile_using_forall %add_fused + %tiled_second, %loop_second = transform.structured.tile_using_forall %add_fused tile_sizes [4, 4] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - %matmul_fused_2 = transform.structured.fuse_into_containing_op %matmul_fused - into %loop_2 - : (!transform.any_op, !transform.any_op) -> !transform.any_op + %matmul_fused_2, %loop_second_2 = transform.structured.fuse_into_containing_op %matmul_fused + into %loop_second + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) // Since outlining is currently only implemented for region-holding // operations such as loops, use tiling to size 1 to materialize the outer // loop that is going to be outlined. - %outline_target, %_ = transform.structured.tile_using_forall %tiled_2 tile_sizes [1] + %_0, %loop_third = transform.structured.tile_using_forall %tiled_second tile_sizes [1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op) - transform.structured.fuse_into_containing_op %matmul_fused_2 into %outline_target - : (!transform.any_op, !transform.any_op) -> !transform.any_op + %_1, %outline_target = transform.structured.fuse_into_containing_op %matmul_fused_2 into %loop_third + : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) %func, %call = transform.loop.outline %outline_target {func_name = "outlined"} : (!transform.any_op) -> (!transform.any_op, !transform.any_op) From a9f81430725cb3d9a776d9b743078a452cd8e3aa Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Fri, 18 Jul 2025 13:42:25 +0100 Subject: [PATCH 330/813] [LoopInterchange] Ignore the cost-model, force interchange if legal (#148858) This is and has been proven useful for testing purposes, to get more test coverage. 
--- .../lib/Transforms/Scalar/LoopInterchange.cpp | 24 +++++++++-- .../LoopInterchange/force-interchange.ll | 43 +++++++++++++++++++ 2 files changed, 63 insertions(+), 4 deletions(-) create mode 100644 llvm/test/Transforms/LoopInterchange/force-interchange.ll diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index a0f9f3c4a35a5..70e9eee5339a7 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -78,6 +78,7 @@ enum class RuleTy { PerLoopCacheAnalysis, PerInstrOrderCost, ForVectorization, + Ignore }; } // end anonymous namespace @@ -106,14 +107,20 @@ static cl::list Profitabilities( clEnumValN(RuleTy::PerInstrOrderCost, "instorder", "Prioritize the IVs order of each instruction"), clEnumValN(RuleTy::ForVectorization, "vectorize", - "Prioritize vectorization"))); + "Prioritize vectorization"), + clEnumValN(RuleTy::Ignore, "ignore", + "Ignore profitability, force interchange (does not " + "work with other options)"))); #ifndef NDEBUG -static bool noDuplicateRules(ArrayRef Rules) { +static bool noDuplicateRulesAndIgnore(ArrayRef Rules) { SmallSet Set; - for (RuleTy Rule : Rules) + for (RuleTy Rule : Rules) { if (!Set.insert(Rule).second) return false; + if (Rule == RuleTy::Ignore) + return false; + } return true; } @@ -1357,6 +1364,13 @@ std::optional LoopInterchangeProfitability::isProfitableForVectorization( bool LoopInterchangeProfitability::isProfitable( const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix, CacheCostManager &CCM) { + + // Return true if interchange is forced and the cost-model ignored. + if (Profitabilities.size() == 1 && Profitabilities[0] == RuleTy::Ignore) + return true; + assert(noDuplicateRulesAndIgnore(Profitabilities) && + "Duplicate rules and option 'ignore' are not allowed"); + // isProfitable() is structured to avoid endless loop interchange. 
If the // highest priority rule (isProfitablePerLoopCacheAnalysis by default) could // decide the profitability then, profitability check will stop and return the @@ -1365,7 +1379,6 @@ bool LoopInterchangeProfitability::isProfitable( // second highest priority rule (isProfitablePerInstrOrderCost by default). // Likewise, if it failed to analysis the profitability then only, the last // rule (isProfitableForVectorization by default) will decide. - assert(noDuplicateRules(Profitabilities) && "Detect duplicate rules"); std::optional shouldInterchange; for (RuleTy RT : Profitabilities) { switch (RT) { @@ -1382,6 +1395,9 @@ bool LoopInterchangeProfitability::isProfitable( shouldInterchange = isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); break; + case RuleTy::Ignore: + llvm_unreachable("Option 'ignore' is not supported with other options"); + break; } // If this rule could determine the profitability, don't call subsequent diff --git a/llvm/test/Transforms/LoopInterchange/force-interchange.ll b/llvm/test/Transforms/LoopInterchange/force-interchange.ll new file mode 100644 index 0000000000000..c33ecdf7d9905 --- /dev/null +++ b/llvm/test/Transforms/LoopInterchange/force-interchange.ll @@ -0,0 +1,43 @@ +; RUN: opt < %s -passes=loop-interchange -pass-remarks-output=%t -disable-output -loop-interchange-profitabilities=ignore -S +; RUN: FileCheck --input-file=%t %s + +; There should be no reason to interchange this, unless it is forced. +; +; for (int i = 0; i<1024; i++) +; for (int j = 0; j<1024; j++) +; A[i][j] = 42; +; +; CHECK: --- !Passed +; CHECK-NEXT: Pass: loop-interchange +; CHECK-NEXT: Name: Interchanged +; CHECK-NEXT: Function: f +; CHECK-NEXT: Args: +; CHECK-NEXT: - String: Loop interchanged with enclosing loop. +; CHECK-NEXT: ... 
+ +@A = dso_local local_unnamed_addr global [1024 x [1024 x i32]] zeroinitializer, align 4 + +define dso_local void @f() local_unnamed_addr #0 { +entry: + br label %outer.header + +outer.header: + %i = phi i64 [ 0, %entry ], [ %i.next, %inner.header ] + br label %inner.body + +inner.header: + %i.next = add nuw nsw i64 %i, 1 + %exitcond20.not = icmp eq i64 %i.next, 1024 + br i1 %exitcond20.not, label %exit, label %outer.header + +inner.body: + %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ] + %arrayidx6 = getelementptr inbounds nuw [1024 x [1024 x i32]], ptr @A, i64 0, i64 %i, i64 %j + store i32 42, ptr %arrayidx6, align 4 + %j.next = add nuw nsw i64 %j, 1 + %exitcond.not = icmp eq i64 %j.next, 1024 + br i1 %exitcond.not, label %inner.header, label %inner.body + +exit: + ret void +} From 602d43cfd1fe7cc47146b6327d8df6e5e0ec47ae Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 18 Jul 2025 08:43:08 -0400 Subject: [PATCH 331/813] [Clang][AMDGPU] Add the missing builtin `__builtin_amdgcn_sqrt_bf16` (#149447) Co-authored-by: Mekhanoshin, Stanislav --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 1 + clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 1 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+) diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index ed51f1d5de447..a916af7e0c2df 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -673,6 +673,7 @@ TARGET_BUILTIN(__builtin_amdgcn_tanhf, "ff", "nc", "tanh-insts") TARGET_BUILTIN(__builtin_amdgcn_tanhh, "hh", "nc", "tanh-insts") TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts") +TARGET_BUILTIN(__builtin_amdgcn_sqrt_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts") 
TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts") TARGET_BUILTIN(__builtin_amdgcn_exp2_bf16, "yy", "nc", "bf16-trans-insts") diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index a7d796ecccc61..ee736a2816218 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -416,6 +416,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_sqrt: case AMDGPU::BI__builtin_amdgcn_sqrtf: case AMDGPU::BI__builtin_amdgcn_sqrth: + case AMDGPU::BI__builtin_amdgcn_sqrt_bf16: return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_sqrt); case AMDGPU::BI__builtin_amdgcn_rsq: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 738b7ab7f2b75..a9ea17642d6ad 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -119,6 +119,25 @@ void test_rcp_bf16(global __bf16* out, __bf16 a) *out = __builtin_amdgcn_rcp_bf16(a); } +// CHECK-LABEL: @test_sqrt_bf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2 +// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.sqrt.bf16(bfloat [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], 
align 2 +// CHECK-NEXT: ret void +// +void test_sqrt_bf16(global __bf16* out, __bf16 a) +{ + *out = __builtin_amdgcn_sqrt_bf16(a); +} + // CHECK-LABEL: @test_rsq_bf16( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) From 311847be4ca911e191c67245799fafe2e4d8ba73 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Fri, 18 Jul 2025 14:27:34 +0100 Subject: [PATCH 332/813] [Offload] Allow "tagging" device info entries with offload keys (#147317) When generating the device info tree, nodes can be marked with an offload Device Info value. The nodes can also look up children based on this value. --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 11 +++++--- .../common/include/PluginInterface.h | 28 +++++++++++++++++-- offload/plugins-nextgen/cuda/src/rtl.cpp | 10 ++++--- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index b2fd950c9d500..d4400547f9568 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2610,7 +2610,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status2 = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor); if (Status == HSA_STATUS_SUCCESS && Status2 == HSA_STATUS_SUCCESS) Info.add("HSA Runtime Version", - std::to_string(Major) + "." + std::to_string(Minor)); + std::to_string(Major) + "." 
+ std::to_string(Minor), "", + DeviceInfo::DRIVER_VERSION); Info.add("HSA OpenMP Device Number", DeviceId); @@ -2620,11 +2621,11 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AGENT_INFO_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) - Info.add("Device Name", TmpChar); + Info.add("Device Name", TmpChar, "", DeviceInfo::NAME); Status = getDeviceAttrRaw(HSA_AGENT_INFO_VENDOR_NAME, TmpChar); if (Status == HSA_STATUS_SUCCESS) - Info.add("Vendor Name", TmpChar); + Info.add("Vendor Name", TmpChar, "", DeviceInfo::VENDOR); hsa_device_type_t DevType; Status = getDeviceAttrRaw(HSA_AGENT_INFO_DEVICE, DevType); @@ -2700,7 +2701,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Status = getDeviceAttrRaw(HSA_AGENT_INFO_WORKGROUP_MAX_DIM, WorkgrpMaxDim); if (Status == HSA_STATUS_SUCCESS) { - auto &MaxSize = *Info.add("Workgroup Max Size per Dimension"); + auto &MaxSize = + *Info.add("Workgroup Max Size per Dimension", std::monostate{}, "", + DeviceInfo::MAX_WORK_GROUP_SIZE); MaxSize.add("x", WorkgrpMaxDim[0]); MaxSize.add("y", WorkgrpMaxDim[1]); MaxSize.add("z", WorkgrpMaxDim[2]); diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 162b149ab483e..8c17a2ee07047 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -113,6 +113,12 @@ struct AsyncInfoWrapperTy { __tgt_async_info *AsyncInfoPtr; }; +enum class DeviceInfo { +#define OFFLOAD_DEVINFO(Name, _, Value) Name = Value, +#include "OffloadInfo.inc" +#undef OFFLOAD_DEVINFO +}; + /// Tree node for device information /// /// This information is either printed or used by liboffload to extract certain @@ -133,6 +139,8 @@ struct InfoTreeNode { // * The same key can appear multiple times std::unique_ptr> Children; + llvm::DenseMap DeviceInfoMap; + InfoTreeNode() : InfoTreeNode("", 
std::monostate{}, "") {} InfoTreeNode(std::string Key, VariantType Value, std::string Units) : Key(Key), Value(Value), Units(Units) {} @@ -140,10 +148,12 @@ struct InfoTreeNode { /// Add a new info entry as a child of this node. The entry requires at least /// a key string in \p Key. The value in \p Value is optional and can be any /// type that is representable as a string. The units in \p Units is optional - /// and must be a string. + /// and must be a string. Providing a device info key allows liboffload to + /// use that value for an appropriate olGetDeviceInfo query template InfoTreeNode *add(std::string Key, T Value = T(), - const std::string &Units = std::string()) { + const std::string &Units = std::string(), + std::optional DeviceInfoKey = std::nullopt) { assert(!Key.empty() && "Invalid info key"); if (!Children) @@ -157,7 +167,12 @@ struct InfoTreeNode { else ValueVariant = std::string{Value}; - return &Children->emplace_back(Key, ValueVariant, Units); + auto Ptr = &Children->emplace_back(Key, ValueVariant, Units); + + if (DeviceInfoKey) + DeviceInfoMap[*DeviceInfoKey] = Children->size() - 1; + + return Ptr; } std::optional get(StringRef Key) { @@ -171,6 +186,13 @@ struct InfoTreeNode { return It; } + std::optional get(DeviceInfo Info) { + auto Result = DeviceInfoMap.find(Info); + if (Result != DeviceInfoMap.end()) + return &(*Children)[Result->second]; + return std::nullopt; + } + /// Print all info entries in the tree void print() const { // Fake an additional indent so that values are offset from the keys diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b787376eb1770..728bf07c572bb 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -935,15 +935,16 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_SUCCESS) // For consistency with other drivers, store the version as a string // rather than an integer - Info.add("CUDA Driver 
Version", std::to_string(TmpInt)); + Info.add("CUDA Driver Version", std::to_string(TmpInt), "", + DeviceInfo::DRIVER_VERSION); Info.add("CUDA OpenMP Device Number", DeviceId); Res = cuDeviceGetName(TmpChar, 1000, Device); if (Res == CUDA_SUCCESS) - Info.add("Device Name", TmpChar); + Info.add("Device Name", TmpChar, "", DeviceInfo::NAME); - Info.add("Vendor Name", "NVIDIA"); + Info.add("Vendor Name", "NVIDIA", "", DeviceInfo::VENDOR); Res = cuDeviceTotalMem(&TmpSt, Device); if (Res == CUDA_SUCCESS) @@ -978,7 +979,8 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_SUCCESS) Info.add("Maximum Threads per Block", TmpInt); - auto &MaxBlock = *Info.add("Maximum Block Dimensions", ""); + auto &MaxBlock = *Info.add("Maximum Block Dimensions", std::monostate{}, "", + DeviceInfo::MAX_WORK_GROUP_SIZE); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, TmpInt); if (Res == CUDA_SUCCESS) MaxBlock.add("x", TmpInt); From 03b7766dba2f63ee7c9e67f915ea8394f6426f9a Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Fri, 18 Jul 2025 14:31:12 +0100 Subject: [PATCH 333/813] [lldb][Expression][NFC] Make LoadAddressResolver::m_target a reference (#149490) The only place that passes a target to `LoadAddressResolver` already checks for pointer validity. And inside of the resolver we have been dereferencing the target anyway without nullptr checks. So codify the non-nullness of `m_target` by making it a reference. 
--- lldb/source/Expression/IRExecutionUnit.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp index e445fa8833022..6f812b91a8b1d 100644 --- a/lldb/source/Expression/IRExecutionUnit.cpp +++ b/lldb/source/Expression/IRExecutionUnit.cpp @@ -700,7 +700,7 @@ void IRExecutionUnit::CollectCandidateCPlusPlusNames( class LoadAddressResolver { public: - LoadAddressResolver(Target *target, bool &symbol_was_missing_weak) + LoadAddressResolver(Target &target, bool &symbol_was_missing_weak) : m_target(target), m_symbol_was_missing_weak(symbol_was_missing_weak) {} std::optional Resolve(SymbolContextList &sc_list) { @@ -722,11 +722,11 @@ class LoadAddressResolver { // First try the symbol. if (candidate_sc.symbol) { - load_address = candidate_sc.symbol->ResolveCallableAddress(*m_target); + load_address = candidate_sc.symbol->ResolveCallableAddress(m_target); if (load_address == LLDB_INVALID_ADDRESS) { Address addr = candidate_sc.symbol->GetAddress(); - load_address = m_target->GetProcessSP() - ? addr.GetLoadAddress(m_target) + load_address = m_target.GetProcessSP() + ? addr.GetLoadAddress(&m_target) : addr.GetFileAddress(); } } @@ -734,8 +734,8 @@ class LoadAddressResolver { // If that didn't work, try the function. if (load_address == LLDB_INVALID_ADDRESS && candidate_sc.function) { Address addr = candidate_sc.function->GetAddress(); - load_address = m_target->GetProcessSP() ? addr.GetLoadAddress(m_target) - : addr.GetFileAddress(); + load_address = m_target.GetProcessSP() ? addr.GetLoadAddress(&m_target) + : addr.GetFileAddress(); } // We found a load address. 
@@ -766,7 +766,7 @@ class LoadAddressResolver { } private: - Target *m_target; + Target &m_target; bool &m_symbol_was_missing_weak; lldb::addr_t m_best_internal_load_address = LLDB_INVALID_ADDRESS; }; @@ -790,7 +790,7 @@ IRExecutionUnit::FindInSymbols(const std::vector &names, for (size_t i = 0; i < m_preferred_modules.GetSize(); ++i) non_local_images.Remove(m_preferred_modules.GetModuleAtIndex(i)); - LoadAddressResolver resolver(target, symbol_was_missing_weak); + LoadAddressResolver resolver(*target, symbol_was_missing_weak); ModuleFunctionSearchOptions function_options; function_options.include_symbols = true; From 6112ebde0cdd31694536d0ac20a38e5f70f6185a Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Fri, 18 Jul 2025 16:49:30 +0300 Subject: [PATCH 334/813] [RISCV] Guard CFI emission code with MF.needsFrameMoves() (#136060) Currently, AsmPrinter skips CFI instructions created by a backend if they are not needed. I'd like to change that so that it always prints/encodes CFI instructions if a backend created them. This change should slightly (perhaps negligibly) improve compile time as post-PEI passes no longer need to skip over these instructions in no-exceptions no-debug builds, and will allow to simplify convoluted logic in AsmPrinter once other targets stop emitting CFI instructions when they are not needed (that's my final goal). The changes in a test seem to be caused by slightly different post-RA scheduling in the absence of CFI instructions. 
--- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 122 +++++++++++------- .../CodeGen/RISCV/short-forward-branch-opt.ll | 32 ++--- llvm/test/CodeGen/RISCV/zdinx-spill.ll | 26 ---- 3 files changed, 92 insertions(+), 88 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 6c8e3da80b932..23b4554349003 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -95,6 +95,11 @@ static const std::pair FixedCSRFIQCIInterruptMap[] = { /* -21, -22, -23, -24 are reserved */ }; +/// Returns true if DWARF CFI instructions ("frame moves") should be emitted. +static bool needsDwarfCFI(const MachineFunction &MF) { + return MF.needsFrameMoves(); +} + // For now we use x3, a.k.a gp, as pointer to shadow call stack. // User should not use x3 in their asm. static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, @@ -141,6 +146,9 @@ static void emitSCSPrologue(MachineFunction &MF, MachineBasicBlock &MBB, .addImm(-SlotSize) .setMIFlag(MachineInstr::FrameSetup); + if (!needsDwarfCFI(MF)) + return; + // Emit a CFI instruction that causes SlotSize to be subtracted from the value // of the shadow stack pointer when unwinding past this frame. 
char DwarfSCSReg = TRI->getDwarfRegNum(SCSPReg, /*IsEH*/ true); @@ -199,8 +207,10 @@ static void emitSCSEpilogue(MachineFunction &MF, MachineBasicBlock &MBB, .addReg(SCSPReg) .addImm(-SlotSize) .setMIFlag(MachineInstr::FrameDestroy); - // Restore the SCS pointer - CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg); + if (needsDwarfCFI(MF)) { + // Restore the SCS pointer + CFIInstBuilder(MBB, MI, MachineInstr::FrameDestroy).buildRestore(SCSPReg); + } } // Insert instruction to swap mscratchsw with sp @@ -935,6 +945,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MBBI = std::prev(MBBI, getRVVCalleeSavedInfo(MF, CSI).size() + getUnmanagedCSI(MF, CSI).size()); CFIInstBuilder CFIBuilder(MBB, MBBI, MachineInstr::FrameSetup); + bool NeedsDwarfCFI = needsDwarfCFI(MF); // If libcalls are used to spill and restore callee-saved registers, the frame // has two sections; the opaque section managed by the libcalls, and the @@ -962,10 +973,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, alignTo((STI.getXLen() / 8) * LibCallRegs, getStackAlign()); RVFI->setLibCallStackSize(LibCallFrameSize); - CFIBuilder.buildDefCFAOffset(LibCallFrameSize); - for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) - CFIBuilder.buildOffset(CS.getReg(), - MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) { + CFIBuilder.buildDefCFAOffset(LibCallFrameSize); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); + } } // FIXME (note copied from Lanai): This appears to be overallocating. Needs @@ -996,14 +1009,17 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // could only be the next instruction. ++PossiblePush; - // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)` - // could be. The PUSH will also get its own CFI metadata for its own - // modifications, which should come after the PUSH. 
- CFIInstBuilder PushCFIBuilder(MBB, PossiblePush, MachineInstr::FrameSetup); - PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount); - for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI)) - PushCFIBuilder.buildOffset(CS.getReg(), - MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) { + // Insert the CFI metadata before where we think the `(QC.)CM.PUSH(FP)` + // could be. The PUSH will also get its own CFI metadata for its own + // modifications, which should come after the PUSH. + CFIInstBuilder PushCFIBuilder(MBB, PossiblePush, + MachineInstr::FrameSetup); + PushCFIBuilder.buildDefCFAOffset(QCIInterruptPushAmount); + for (const CalleeSavedInfo &CS : getQCISavedInfo(MF, CSI)) + PushCFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); + } } if (RVFI->isPushable(MF) && PossiblePush != MBB.end() && @@ -1017,10 +1033,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, PossiblePush->getOperand(1).setImm(StackAdj); StackSize -= StackAdj; - CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize); - for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) - CFIBuilder.buildOffset(CS.getReg(), - MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) { + CFIBuilder.buildDefCFAOffset(RealStackSize - StackSize); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); + } } // Allocate space on the stack if necessary. 
@@ -1031,7 +1049,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, bool DynAllocation = MF.getInfo()->hasDynamicAllocation(); if (StackSize != 0) - allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true, + allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, NeedsDwarfCFI, NeedProbe, ProbeSize, DynAllocation, MachineInstr::FrameSetup); @@ -1049,8 +1067,10 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, // Iterate over list of callee-saved registers and emit .cfi_offset // directives. - for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) - CFIBuilder.buildOffset(CS.getReg(), MFI.getObjectOffset(CS.getFrameIdx())); + if (NeedsDwarfCFI) + for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) + CFIBuilder.buildOffset(CS.getReg(), + MFI.getObjectOffset(CS.getFrameIdx())); // Generate new FP. if (hasFP(MF)) { @@ -1069,7 +1089,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup, getStackAlign()); } - CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize()); + if (NeedsDwarfCFI) + CFIBuilder.buildDefCFA(FPReg, RVFI->getVarArgsSaveSize()); } uint64_t SecondSPAdjustAmount = 0; @@ -1080,15 +1101,16 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, "SecondSPAdjustAmount should be greater than zero"); allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount, - getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe, - ProbeSize, DynAllocation, MachineInstr::FrameSetup); + getStackSizeWithRVVPadding(MF), NeedsDwarfCFI && !hasFP(MF), + NeedProbe, ProbeSize, DynAllocation, + MachineInstr::FrameSetup); } if (RVVStackSize) { if (NeedProbe) { allocateAndProbeStackForRVV(MF, MBB, MBBI, DL, RVVStackSize, - MachineInstr::FrameSetup, !hasFP(MF), - DynAllocation); + MachineInstr::FrameSetup, + NeedsDwarfCFI && !hasFP(MF), DynAllocation); } else { // We must keep the stack pointer aligned through any intermediate // updates. 
@@ -1097,14 +1119,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, MachineInstr::FrameSetup, getStackAlign()); } - if (!hasFP(MF)) { + if (NeedsDwarfCFI && !hasFP(MF)) { // Emit .cfi_def_cfa_expression "sp + StackSize + RVVStackSize * vlenb". CFIBuilder.insertCFIInst(createDefCFAExpression( *RI, SPReg, getStackSizeWithRVVPadding(MF), RVVStackSize / 8)); } std::advance(MBBI, getRVVCalleeSavedInfo(MF, CSI).size()); - emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF)); + if (NeedsDwarfCFI) + emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF)); } if (hasFP(MF)) { @@ -1171,8 +1194,9 @@ void RISCVFrameLowering::deallocateStack(MachineFunction &MF, MachineInstr::FrameDestroy, getStackAlign()); StackSize = 0; - CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy) - .buildDefCFAOffset(CFAOffset); + if (needsDwarfCFI(MF)) + CFIInstBuilder(MBB, MBBI, MachineInstr::FrameDestroy) + .buildDefCFAOffset(CFAOffset); } void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, @@ -1212,6 +1236,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, std::next(MBBI, getRVVCalleeSavedInfo(MF, CSI).size()); CFIInstBuilder CFIBuilder(MBB, FirstScalarCSRRestoreInsn, MachineInstr::FrameDestroy); + bool NeedsDwarfCFI = needsDwarfCFI(MF); uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); uint64_t RealStackSize = FirstSPAdjustAmount ? 
FirstSPAdjustAmount @@ -1232,10 +1257,11 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, StackOffset::getScalable(RVVStackSize), MachineInstr::FrameDestroy, getStackAlign()); - if (!hasFP(MF)) - CFIBuilder.buildDefCFA(SPReg, RealStackSize); - - emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn); + if (NeedsDwarfCFI) { + if (!hasFP(MF)) + CFIBuilder.buildDefCFA(SPReg, RealStackSize); + emitCalleeSavedRVVEpilogCFI(MBB, FirstScalarCSRRestoreInsn); + } } if (FirstSPAdjustAmount) { @@ -1251,7 +1277,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, StackOffset::getFixed(SecondSPAdjustAmount), MachineInstr::FrameDestroy, getStackAlign()); - if (!hasFP(MF)) + if (NeedsDwarfCFI && !hasFP(MF)) CFIBuilder.buildDefCFAOffset(FirstSPAdjustAmount); } @@ -1272,7 +1298,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, getStackAlign()); } - if (hasFP(MF)) + if (NeedsDwarfCFI && hasFP(MF)) CFIBuilder.buildDefCFA(SPReg, RealStackSize); // Skip to after the restores of scalar callee-saved registers @@ -1295,8 +1321,9 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, } // Recover callee-saved registers. 
- for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) - CFIBuilder.buildRestore(CS.getReg()); + if (NeedsDwarfCFI) + for (const CalleeSavedInfo &CS : getUnmanagedCSI(MF, CSI)) + CFIBuilder.buildRestore(CS.getReg()); if (RVFI->isPushable(MF) && MBBI != MBB.end() && isPop(MBBI->getOpcode())) { // Use available stack adjustment in pop instruction to deallocate stack @@ -1315,15 +1342,17 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF, auto NextI = next_nodbg(MBBI, MBB.end()); if (NextI == MBB.end() || NextI->getOpcode() != RISCV::PseudoRET) { ++MBBI; - CFIBuilder.setInsertPoint(MBBI); + if (NeedsDwarfCFI) { + CFIBuilder.setInsertPoint(MBBI); - for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) - CFIBuilder.buildRestore(CS.getReg()); + for (const CalleeSavedInfo &CS : getPushOrLibCallsSavedInfo(MF, CSI)) + CFIBuilder.buildRestore(CS.getReg()); - // Update CFA Offset. If this is a QCI interrupt function, there will be a - // leftover offset which is deallocated by `QC.C.MILEAVERET`, otherwise - // getQCIInterruptStackSize() will be 0. - CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize()); + // Update CFA Offset. If this is a QCI interrupt function, there will + // be a leftover offset which is deallocated by `QC.C.MILEAVERET`, + // otherwise getQCIInterruptStackSize() will be 0. + CFIBuilder.buildDefCFAOffset(RVFI->getQCIInterruptStackSize()); + } } } @@ -1812,7 +1841,8 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr( // allocateStack. 
bool DynAllocation = MF.getInfo()->hasDynamicAllocation(); - allocateStack(MBB, MI, MF, -Amount, -Amount, !hasFP(MF), + allocateStack(MBB, MI, MF, -Amount, -Amount, + needsDwarfCFI(MF) && !hasFP(MF), /*NeedProbe=*/true, ProbeSize, DynAllocation, MachineInstr::NoFlags); } else { diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll index 061435c45ad0e..59a702ab6b17f 100644 --- a/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt.ll @@ -798,12 +798,12 @@ define void @sextw_removal_ccor(i1 %c, i32 signext %arg, i32 signext %arg1, i32 ; RV64SFBSIFIVEU74-LABEL: sextw_removal_ccor: ; RV64SFBSIFIVEU74: # %bb.0: # %bb ; RV64SFBSIFIVEU74-NEXT: addi sp, sp, -32 -; RV64SFBSIFIVEU74-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64SFBSIFIVEU74-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64SFBSIFIVEU74-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64SFBSIFIVEU74-NEXT: mv s0, a3 +; RV64SFBSIFIVEU74-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64SFBSIFIVEU74-NEXT: andi a0, a0, 1 ; RV64SFBSIFIVEU74-NEXT: mv s1, a2 +; RV64SFBSIFIVEU74-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64SFBSIFIVEU74-NEXT: beqz a0, .LBB15_4 ; RV64SFBSIFIVEU74-NEXT: # %bb.3: # %bb ; RV64SFBSIFIVEU74-NEXT: or s0, a3, a1 @@ -824,11 +824,11 @@ define void @sextw_removal_ccor(i1 %c, i32 signext %arg, i32 signext %arg1, i32 ; RV64SFBANDESAX45-LABEL: sextw_removal_ccor: ; RV64SFBANDESAX45: # %bb.0: # %bb ; RV64SFBANDESAX45-NEXT: addi sp, sp, -32 -; RV64SFBANDESAX45-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64SFBANDESAX45-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64SFBANDESAX45-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64SFBANDESAX45-NEXT: mv s0, a3 +; RV64SFBANDESAX45-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64SFBANDESAX45-NEXT: mv s1, a2 +; RV64SFBANDESAX45-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64SFBANDESAX45-NEXT: nds.bbc a0, 0, .LBB15_2 ; RV64SFBANDESAX45-NEXT: # 
%bb.1: ; RV64SFBANDESAX45-NEXT: or s0, s0, a1 @@ -848,12 +848,12 @@ define void @sextw_removal_ccor(i1 %c, i32 signext %arg, i32 signext %arg1, i32 ; ZICOND-LABEL: sextw_removal_ccor: ; ZICOND: # %bb.0: # %bb ; ZICOND-NEXT: addi sp, sp, -32 -; ZICOND-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; ZICOND-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; ZICOND-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; ZICOND-NEXT: mv s0, a3 +; ZICOND-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; ZICOND-NEXT: andi a0, a0, 1 ; ZICOND-NEXT: mv s1, a2 +; ZICOND-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; ZICOND-NEXT: beqz a0, .LBB15_4 ; ZICOND-NEXT: # %bb.3: # %bb ; ZICOND-NEXT: or s0, a3, a1 @@ -874,12 +874,12 @@ define void @sextw_removal_ccor(i1 %c, i32 signext %arg, i32 signext %arg1, i32 ; RV32SFB-LABEL: sextw_removal_ccor: ; RV32SFB: # %bb.0: # %bb ; RV32SFB-NEXT: addi sp, sp, -16 -; RV32SFB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32SFB-NEXT: sw s0, 8(sp) # 4-byte Folded Spill -; RV32SFB-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32SFB-NEXT: mv s0, a3 +; RV32SFB-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32SFB-NEXT: andi a0, a0, 1 ; RV32SFB-NEXT: mv s1, a2 +; RV32SFB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32SFB-NEXT: beqz a0, .LBB15_4 ; RV32SFB-NEXT: # %bb.3: # %bb ; RV32SFB-NEXT: or s0, a3, a1 @@ -941,11 +941,11 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3 ; RV64SFBSIFIVEU74-LABEL: sextw_removal_ccaddw: ; RV64SFBSIFIVEU74: # %bb.0: # %bb ; RV64SFBSIFIVEU74-NEXT: addi sp, sp, -32 -; RV64SFBSIFIVEU74-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; RV64SFBSIFIVEU74-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64SFBSIFIVEU74-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64SFBSIFIVEU74-NEXT: mv s1, a1 ; RV64SFBSIFIVEU74-NEXT: andi a0, a0, 1 +; RV64SFBSIFIVEU74-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; RV64SFBSIFIVEU74-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64SFBSIFIVEU74-NEXT: mv s0, a2 ; RV64SFBSIFIVEU74-NEXT: beqz a0, .LBB16_4 ; 
RV64SFBSIFIVEU74-NEXT: # %bb.3: # %bb @@ -967,11 +967,11 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3 ; RV64SFBANDESAX45-LABEL: sextw_removal_ccaddw: ; RV64SFBANDESAX45: # %bb.0: # %bb ; RV64SFBANDESAX45-NEXT: addi sp, sp, -32 -; RV64SFBANDESAX45-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64SFBANDESAX45-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; RV64SFBANDESAX45-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64SFBANDESAX45-NEXT: mv s0, a2 +; RV64SFBANDESAX45-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; RV64SFBANDESAX45-NEXT: mv s1, a1 +; RV64SFBANDESAX45-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64SFBANDESAX45-NEXT: nds.bbc a0, 0, .LBB16_2 ; RV64SFBANDESAX45-NEXT: # %bb.1: ; RV64SFBANDESAX45-NEXT: addw s1, s1, a3 @@ -991,11 +991,11 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3 ; ZICOND-LABEL: sextw_removal_ccaddw: ; ZICOND: # %bb.0: # %bb ; ZICOND-NEXT: addi sp, sp, -32 -; ZICOND-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; ZICOND-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; ZICOND-NEXT: sd s1, 8(sp) # 8-byte Folded Spill ; ZICOND-NEXT: mv s1, a1 ; ZICOND-NEXT: andi a0, a0, 1 +; ZICOND-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; ZICOND-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; ZICOND-NEXT: mv s0, a2 ; ZICOND-NEXT: beqz a0, .LBB16_4 ; ZICOND-NEXT: # %bb.3: # %bb @@ -1017,11 +1017,11 @@ define void @sextw_removal_ccaddw(i1 %c, i32 signext %arg, i32 signext %arg1, i3 ; RV32SFB-LABEL: sextw_removal_ccaddw: ; RV32SFB: # %bb.0: # %bb ; RV32SFB-NEXT: addi sp, sp, -16 -; RV32SFB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill -; RV32SFB-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32SFB-NEXT: sw s1, 4(sp) # 4-byte Folded Spill ; RV32SFB-NEXT: mv s1, a1 ; RV32SFB-NEXT: andi a0, a0, 1 +; RV32SFB-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32SFB-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32SFB-NEXT: mv s0, a2 ; RV32SFB-NEXT: beqz a0, .LBB16_4 ; RV32SFB-NEXT: # %bb.3: # %bb diff --git 
a/llvm/test/CodeGen/RISCV/zdinx-spill.ll b/llvm/test/CodeGen/RISCV/zdinx-spill.ll index d7a700622bf8c..6f206fe571c17 100644 --- a/llvm/test/CodeGen/RISCV/zdinx-spill.ll +++ b/llvm/test/CodeGen/RISCV/zdinx-spill.ll @@ -9,7 +9,6 @@ define double @foo(double %x) nounwind { ; CHECK-NEXT: liveins: $x10, $x11, $x8, $x9, $x18, $x19, $x20, $x21, $x22, $x23, $x24, $x25, $x26, $x27 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $x2 = frame-setup ADDI $x2, -64 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 64 ; CHECK-NEXT: frame-setup SW killed $x8, $x2, 60 :: (store (s32) into %stack.1) ; CHECK-NEXT: frame-setup SW killed $x9, $x2, 56 :: (store (s32) into %stack.2) ; CHECK-NEXT: frame-setup SW killed $x18, $x2, 52 :: (store (s32) into %stack.3) @@ -22,18 +21,6 @@ define double @foo(double %x) nounwind { ; CHECK-NEXT: frame-setup SW killed $x25, $x2, 24 :: (store (s32) into %stack.10) ; CHECK-NEXT: frame-setup SW killed $x26, $x2, 20 :: (store (s32) into %stack.11) ; CHECK-NEXT: frame-setup SW killed $x27, $x2, 16 :: (store (s32) into %stack.12) - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x8, -4 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x9, -8 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x18, -12 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x19, -16 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x20, -20 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x21, -24 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x22, -28 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x23, -32 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x24, -36 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x25, -40 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x26, -44 - ; CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $x27, -48 ; CHECK-NEXT: renamable $x10_x11 = nofpexcept FADD_D_IN32X killed renamable $x10_x11, renamable $x10_x11, 7, implicit $frm ; CHECK-NEXT: PseudoRV32ZdinxSD killed renamable $x10_x11, $x2, 8 :: (store (s64) into %stack.0, 
align 4) ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def dead early-clobber $x6, 12 /* clobber */, implicit-def dead early-clobber $x7, 12 /* clobber */, implicit-def dead early-clobber $x8, 12 /* clobber */, implicit-def dead early-clobber $x9, 12 /* clobber */, implicit-def dead early-clobber $x10, 12 /* clobber */, implicit-def dead early-clobber $x11, 12 /* clobber */, implicit-def dead early-clobber $x12, 12 /* clobber */, implicit-def dead early-clobber $x13, 12 /* clobber */, implicit-def dead early-clobber $x14, 12 /* clobber */, implicit-def dead early-clobber $x15, 12 /* clobber */, implicit-def dead early-clobber $x16, 12 /* clobber */, implicit-def dead early-clobber $x17, 12 /* clobber */, implicit-def dead early-clobber $x18, 12 /* clobber */, implicit-def dead early-clobber $x19, 12 /* clobber */, implicit-def dead early-clobber $x20, 12 /* clobber */, implicit-def dead early-clobber $x21, 12 /* clobber */, implicit-def dead early-clobber $x22, 12 /* clobber */, implicit-def dead early-clobber $x23, 12 /* clobber */, implicit-def dead early-clobber $x24, 12 /* clobber */, implicit-def dead early-clobber $x25, 12 /* clobber */, implicit-def dead early-clobber $x26, 12 /* clobber */, implicit-def dead early-clobber $x27, 12 /* clobber */, implicit-def dead early-clobber $x28, 12 /* clobber */, implicit-def dead early-clobber $x29, 12 /* clobber */, implicit-def dead early-clobber $x31 @@ -50,20 +37,7 @@ define double @foo(double %x) nounwind { ; CHECK-NEXT: $x25 = frame-destroy LW $x2, 24 :: (load (s32) from %stack.10) ; CHECK-NEXT: $x26 = frame-destroy LW $x2, 20 :: (load (s32) from %stack.11) ; CHECK-NEXT: $x27 = frame-destroy LW $x2, 16 :: (load (s32) from %stack.12) - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x8 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x9 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x18 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x19 - ; 
CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x20 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x21 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x22 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x23 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x24 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x25 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x26 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $x27 ; CHECK-NEXT: $x2 = frame-destroy ADDI $x2, 64 - ; CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %a = fadd double %x, %x call void asm sideeffect "", "~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{xr0},~{x31}"() From 7b541c931e975840c0ef86d8ebd16856d17c0c85 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 18 Jul 2025 07:06:49 -0700 Subject: [PATCH 335/813] [Github] Build CI Containers in Stacked PRs (#149346) Currently the pull_request event on the build CI container workflows are restricted to main. This prevents building them on stacked PRs. This is a bit annoying because we do not get the CI to test that everything is working until all of the base PRs have landed and the target branch becomes main. 
--- .github/workflows/build-ci-container-windows.yml | 2 -- .github/workflows/build-ci-container.yml | 2 -- 2 files changed, 4 deletions(-) diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml index 59079f057d021..f76c69f29fb30 100644 --- a/.github/workflows/build-ci-container-windows.yml +++ b/.github/workflows/build-ci-container-windows.yml @@ -11,8 +11,6 @@ on: - .github/workflows/build-ci-container-windows.yml - '.github/workflows/containers/github-action-ci-windows/**' pull_request: - branches: - - main paths: - .github/workflows/build-ci-container-windows.yml - '.github/workflows/containers/github-action-ci-windows/**' diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml index 3159aae32ca51..7f01264af8534 100644 --- a/.github/workflows/build-ci-container.yml +++ b/.github/workflows/build-ci-container.yml @@ -11,8 +11,6 @@ on: - .github/workflows/build-ci-container.yml - '.github/workflows/containers/github-action-ci/**' pull_request: - branches: - - main paths: - .github/workflows/build-ci-container.yml - '.github/workflows/containers/github-action-ci/**' From 5f531827a4b90f6e0051056fffd8642ae1c677e6 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 18 Jul 2025 16:13:00 +0200 Subject: [PATCH 336/813] [LSR] Do not consider uses in lifetime intrinsics (#149492) We should ignore uses of pointers in lifetime intrinsics, as these are not actually materialized in the final code, so don't affect register pressure or anything else LSR needs to model. Handling these only results in peculiar rewrites where additional intermediate GEPs are introduced. 
--- .../Transforms/Scalar/LoopStrengthReduce.cpp | 5 ++ .../LoopStrengthReduce/X86/lifetime-use.ll | 59 +++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 636bd81ce0755..9e318b04c2c99 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3790,6 +3790,11 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { continue; } + // Do not consider uses inside lifetime intrinsics. These are not + // actually materialized. + if (UserInst->isLifetimeStartOrEnd()) + continue; + std::pair P = getUse(S, LSRUse::Basic, MemAccessTy()); size_t LUIdx = P.first; diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll new file mode 100644 index 0000000000000..c7a0de22b200b --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/X86/lifetime-use.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=loop-reduce -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define void @test(ptr %p, i64 %idx) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: ptr [[P:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4 x [4 x i32]], align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 64, ptr [[ALLOCA]]) +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[IDX]], 6 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 48 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr nuw i8, ptr [[ALLOCA]], i64 48 +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[LOOP]] ], [ -8, 
%[[ENTRY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[LSR_IV]], 2 +; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP2]] +; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[SCEVGEP8]], i64 32 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SCEVGEP9]], align 4 +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[P]], i64 [[LSR_IV]] +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[SCEVGEP6]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[SCEVGEP7]], align 4 +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[SCEVGEP3]], i64 [[LSR_IV]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[SCEVGEP5]], align 4 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[LSR_IV]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[SCEVGEP2]], align 4 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 4 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 64, ptr [[ALLOCA]]) +; CHECK-NEXT: ret void +; +entry: + %alloca = alloca [4 x [4 x i32]], align 16 + call void @llvm.lifetime.start.p0(i64 64, ptr %alloca) + br label %loop + +loop: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %loop ] + %gep1 = getelementptr [4 x [12 x [4 x [4 x i32]]]], ptr %p, i64 0, i64 0, i64 0, i64 %indvars.iv, i64 0 + %0 = load i32, ptr %gep1, align 4 + %gep2 = getelementptr [6 x [4 x [4 x i32]]], ptr %p, i64 0, i64 0, i64 0, i64 %indvars.iv + %1 = load i32, ptr %gep2, align 4 + %gep3 = getelementptr [4 x [4 x i32]], ptr %alloca, i64 0, i64 3, i64 %indvars.iv + %2 = load i32, ptr %gep3, align 4 + %gep4 = getelementptr [6 x [4 x [4 x i32]]], ptr %p, i64 0, i64 %idx, i64 3, i64 %indvars.iv + %3 = load i32, ptr %gep4, 
align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv, 1 + br i1 %exitcond.not, label %exit, label %loop + +exit: + call void @llvm.lifetime.end.p0(i64 64, ptr %alloca) + ret void +} From 22076644645a7731f0ec7a81fe78168cf5c2ed63 Mon Sep 17 00:00:00 2001 From: bd1976bris Date: Fri, 18 Jul 2025 15:18:20 +0100 Subject: [PATCH 337/813] [Clang][Test] Add PS5 and WI cases to clang/test/Sema/dllexport.c (#148818) Windows Itanium and PS5 are both Itanium C++ ABI variants which have the goal of semantic compatibility with Microsoft C++ code that uses dllimport/export. This patch adds Windows Itanium and PS5 triple testing to clang/test/Sema/dllexport.c. We have this testing in our downstream toolchain - for some reason it was not added upstream when the work for supporting dllimport/export was done. --- clang/test/Sema/dllexport.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/clang/test/Sema/dllexport.c b/clang/test/Sema/dllexport.c index 3f911fb095c0f..5f6ff36e290e9 100644 --- a/clang/test/Sema/dllexport.c +++ b/clang/test/Sema/dllexport.c @@ -2,6 +2,10 @@ // RUN: %clang_cc1 -triple x86_64-win32 -fsyntax-only -fms-extensions -verify -std=c11 %s // RUN: %clang_cc1 -triple i686-mingw32 -fsyntax-only -fms-extensions -verify -std=c11 %s // RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c99 %s +// RUN: %clang_cc1 -triple i686-windows-itanium -fsyntax-only -fms-extensions -verify -std=c99 %s +// RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify -std=c11 %s +// RUN: %clang_cc1 -triple x86_64-sie-ps5 -fsyntax-only -fms-extensions -verify -std=c99 %s +// RUN: %clang_cc1 -triple x86_64-sie-ps5 -fsyntax-only -fms-extensions -verify -std=c11 %s // Invalid usage. 
__declspec(dllexport) typedef int typedef1; From 44cd5027f826d1bc82e5e851d1012cc321806d12 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Fri, 18 Jul 2025 15:34:48 +0100 Subject: [PATCH 338/813] [LLVM][CodeGen][SVE] List MVTs that are desirable for extending loads. (#149153) Extend AArch64TargetLowering::isVectorLoadExtDesirable to specify the set of MVT for which load extension is desirable. Fixes https://github.com/llvm/llvm-project/issues/148939 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++-- .../Target/AArch64/AArch64ISelLowering.cpp | 4 +++- .../AArch64/sve-intrinsics-ldst-ext.ll | 24 +++++++++++++++++++ 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 40464e91f9efc..a3c6969e0daa0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -7641,7 +7641,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) { if (SDValue(GN0, 0).hasOneUse() && isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) && - TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) { + TLI.isVectorLoadExtDesirable(SDValue(N, 0))) { SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(), GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()}; @@ -15745,7 +15745,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x) if (auto *GN0 = dyn_cast(N0)) { if (SDValue(GN0, 0).hasOneUse() && ExtVT == GN0->getMemoryVT() && - TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) { + TLI.isVectorLoadExtDesirable(SDValue(N, 0))) { SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(), GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()}; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d04e6c45e2103..f026726c3f484 100644 --- 
a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6439,7 +6439,9 @@ bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { } } - return true; + EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType(); + return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 || + PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64; } unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll index 4153f0be611a1..9698f1a6768fd 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst-ext.ll @@ -231,3 +231,27 @@ define @sload_8i8_8i64(ptr %a) { %aext = sext %aval to ret %aext } + +; Ensure we don't try to promote a predicate load to a sign-extended load. +define @sload_16i1_16i8(ptr %addr) { +; CHECK-LABEL: sload_16i1_16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr p0, [x0] +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ret + %load = load , ptr %addr + %zext = sext %load to + ret %zext +} + +; Ensure we don't try to promote a predicate load to a zero-extended load. +define @zload_16i1_16i8(ptr %addr) { +; CHECK-LABEL: zload_16i1_16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr p0, [x0] +; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 +; CHECK-NEXT: ret + %load = load , ptr %addr + %zext = zext %load to + ret %zext +} From 37ea9d88a3b8224ffa3b117749a74b1f2f1cfb53 Mon Sep 17 00:00:00 2001 From: lntue Date: Fri, 18 Jul 2025 10:52:26 -0400 Subject: [PATCH 339/813] [libc] Fix tests' linking flags accidentally modified by #147931. 
(#149453) https://github.com/llvm/llvm-project/pull/147931 --- libc/cmake/modules/LLVMLibCTestRules.cmake | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index e210992c5111a..3fb62788c1168 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -571,6 +571,8 @@ function(add_integration_test test_name) target_compile_options(${fq_build_target_name} PRIVATE ${compile_options} ${INTEGRATION_TEST_COMPILE_OPTIONS}) + set(compiler_runtime "") + if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} ${INTEGRATION_TEST_COMPILE_OPTIONS} @@ -599,17 +601,19 @@ function(add_integration_test test_name) set(link_options -nolibc -nostartfiles - -static + -nostdlib ${LIBC_LINK_OPTIONS_DEFAULT} ${LIBC_TEST_LINK_OPTIONS_DEFAULT} ) target_link_options(${fq_build_target_name} PRIVATE ${link_options}) + list(APPEND compiler_runtime ${LIBGCC_S_LOCATION}) endif() target_link_libraries( ${fq_build_target_name} - ${fq_target_name}.__libc__ libc.startup.${LIBC_TARGET_OS}.crt1 libc.test.IntegrationTest.test + ${fq_target_name}.__libc__ + ${compiler_runtime} ) add_dependencies(${fq_build_target_name} libc.test.IntegrationTest.test @@ -770,6 +774,7 @@ function(add_libc_hermetic test_name) ${HERMETIC_TEST_COMPILE_OPTIONS}) set(link_libraries "") + set(compiler_runtime "") foreach(lib IN LISTS HERMETIC_TEST_LINK_LIBRARIES) if(TARGET ${lib}.hermetic) list(APPEND link_libraries ${lib}.hermetic) @@ -807,12 +812,12 @@ function(add_libc_hermetic test_name) set(link_options -nolibc -nostartfiles - -static + -nostdlib ${LIBC_LINK_OPTIONS_DEFAULT} ${LIBC_TEST_LINK_OPTIONS_DEFAULT} ) target_link_options(${fq_build_target_name} PRIVATE ${link_options}) - list(APPEND link_libraries ${LIBGCC_S_LOCATION}) + list(APPEND compiler_runtime ${LIBGCC_S_LOCATION}) endif() 
target_link_libraries( ${fq_build_target_name} @@ -820,7 +825,9 @@ function(add_libc_hermetic test_name) libc.startup.${LIBC_TARGET_OS}.crt1 ${link_libraries} LibcHermeticTestSupport.hermetic - ${fq_target_name}.__libc__) + ${fq_target_name}.__libc__ + ${compiler_runtime} + ) add_dependencies(${fq_build_target_name} LibcTest.hermetic libc.test.UnitTest.ErrnoSetterMatcher From 95b69e0e7014fd6eac98f53125857fddda022a62 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 18 Jul 2025 10:59:47 -0400 Subject: [PATCH 340/813] [AMDGPU] Add support for `v_prng_b32` on gfx1250 (#149450) Co-authored-by: Mekhanoshin, Stanislav --- .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 19 +++++++ llvm/lib/Target/AMDGPU/VOP1Instructions.td | 1 + llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll | 4 +- llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 45 ++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 45 ++++++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 52 +++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 52 +++++++++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 12 +++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 12 +++++ .../gfx1250_asm_vop3_from_vop1-fake16.s | 36 +++++++++++++ .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 36 +++++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 44 ++++++++++++++++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 44 ++++++++++++++++ .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 4 ++ .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 4 ++ .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 45 ++++++++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 39 ++++++++++++++ .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 9 ++++ .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 36 +++++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 33 ++++++++++++ .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 7 ++- 21 files changed, 576 insertions(+), 3 deletions(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index a9ea17642d6ad..d42e51d04ab9d 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -4,6 +4,7 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable +typedef unsigned int uint; typedef half __attribute__((ext_vector_type(2))) half2; // CHECK-LABEL: @test_setprio_inc_wg( @@ -42,6 +43,24 @@ void test_s_wait_tensorcnt() { __builtin_amdgcn_s_wait_tensorcnt(0); } +// CHECK-LABEL: @test_prng_b32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.prng.b32(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMP2]], align 4 +// CHECK-NEXT: ret void +// +void test_prng_b32(global uint* out, uint a) { + *out = __builtin_amdgcn_prng_b32(a); +} + // CHECK-LABEL: @test_tanh_f32( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 1bbbb610305e9..3ee90857b34b8 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1148,6 +1148,7 @@ defm V_MOV_B64 : VOP1_Real_FULL ; defm V_TANH_F32 : VOP1_Real_FULL; defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; defm V_TANH_BF16 
: VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; +defm V_PRNG_B32 : VOP1_Real_FULL; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll index 2faf375a97a86..465414c5471ee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.prng.ll @@ -1,5 +1,7 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s declare i32 @llvm.amdgcn.prng.b32(i32) #0 @@ -29,4 +31,4 @@ define amdgpu_kernel void @prng_b32_constant_100(ptr addrspace(1) %out) #1 { attributes #0 = { nounwind readnone } -attributes #1 = { nounwind } \ No newline at end of file +attributes #1 = { nounwind } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index 279bb262bff04..5f310a9954ad0 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -163,6 +163,51 @@ v_tanh_bf16 v5, src_scc v_tanh_bf16 v127, 0x8000 // GFX1250: v_tanh_bf16_e32 v127, 0x8000 ; encoding: [0xff,0x94,0xfe,0x7e,0x00,0x80,0x00,0x00] +v_prng_b32 v5, v1 +// GFX1250: v_prng_b32_e32 v5, v1 ; encoding: [0x01,0x97,0x0a,0x7e] + +v_prng_b32 v5, v255 +// GFX1250: v_prng_b32_e32 v5, v255 ; encoding: [0xff,0x97,0x0a,0x7e] + +v_prng_b32 v5, s1 +// GFX1250: v_prng_b32_e32 v5, s1 ; encoding: [0x01,0x96,0x0a,0x7e] + +v_prng_b32 v5, s105 
+// GFX1250: v_prng_b32_e32 v5, s105 ; encoding: [0x69,0x96,0x0a,0x7e] + +v_prng_b32 v5, vcc_lo +// GFX1250: v_prng_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x96,0x0a,0x7e] + +v_prng_b32 v5, vcc_hi +// GFX1250: v_prng_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x96,0x0a,0x7e] + +v_prng_b32 v5, ttmp15 +// GFX1250: v_prng_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x96,0x0a,0x7e] + +v_prng_b32 v5, m0 +// GFX1250: v_prng_b32_e32 v5, m0 ; encoding: [0x7d,0x96,0x0a,0x7e] + +v_prng_b32 v5, exec_lo +// GFX1250: v_prng_b32_e32 v5, exec_lo ; encoding: [0x7e,0x96,0x0a,0x7e] + +v_prng_b32 v5, exec_hi +// GFX1250: v_prng_b32_e32 v5, exec_hi ; encoding: [0x7f,0x96,0x0a,0x7e] + +v_prng_b32 v5, null +// GFX1250: v_prng_b32_e32 v5, null ; encoding: [0x7c,0x96,0x0a,0x7e] + +v_prng_b32 v5, -1 +// GFX1250: v_prng_b32_e32 v5, -1 ; encoding: [0xc1,0x96,0x0a,0x7e] + +v_prng_b32 v5, 0.5 +// GFX1250: v_prng_b32_e32 v5, 0.5 ; encoding: [0xf0,0x96,0x0a,0x7e] + +v_prng_b32 v5, src_scc +// GFX1250: v_prng_b32_e32 v5, src_scc ; encoding: [0xfd,0x96,0x0a,0x7e] + +v_prng_b32 v255, 0xaf123456 +// GFX1250: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x96,0xfe,0x7f,0x56,0x34,0x12,0xaf] + v_rcp_bf16 v5, v1 // GFX1250: v_rcp_bf16_e32 v5, v1 ; encoding: [0x01,0xf3,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index 76272d25d92d4..aa2e028f661e1 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -169,6 +169,51 @@ v_tanh_bf16 v127, 0x8000 v_tanh_bf16 v5.h, v1.h // GFX1250: v_tanh_bf16_e32 v5.h, v1.h ; encoding: [0x81,0x95,0x0a,0x7f] +v_prng_b32 v5, v1 +// GFX1250: v_prng_b32_e32 v5, v1 ; encoding: [0x01,0x97,0x0a,0x7e] + +v_prng_b32 v5, v255 +// GFX1250: v_prng_b32_e32 v5, v255 ; encoding: [0xff,0x97,0x0a,0x7e] + +v_prng_b32 v5, s1 +// GFX1250: v_prng_b32_e32 v5, s1 ; encoding: [0x01,0x96,0x0a,0x7e] + +v_prng_b32 v5, s105 +// GFX1250: v_prng_b32_e32 v5, s105 ; encoding: [0x69,0x96,0x0a,0x7e] + +v_prng_b32 v5, 
vcc_lo +// GFX1250: v_prng_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x96,0x0a,0x7e] + +v_prng_b32 v5, vcc_hi +// GFX1250: v_prng_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x96,0x0a,0x7e] + +v_prng_b32 v5, ttmp15 +// GFX1250: v_prng_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x96,0x0a,0x7e] + +v_prng_b32 v5, m0 +// GFX1250: v_prng_b32_e32 v5, m0 ; encoding: [0x7d,0x96,0x0a,0x7e] + +v_prng_b32 v5, exec_lo +// GFX1250: v_prng_b32_e32 v5, exec_lo ; encoding: [0x7e,0x96,0x0a,0x7e] + +v_prng_b32 v5, exec_hi +// GFX1250: v_prng_b32_e32 v5, exec_hi ; encoding: [0x7f,0x96,0x0a,0x7e] + +v_prng_b32 v5, null +// GFX1250: v_prng_b32_e32 v5, null ; encoding: [0x7c,0x96,0x0a,0x7e] + +v_prng_b32 v5, -1 +// GFX1250: v_prng_b32_e32 v5, -1 ; encoding: [0xc1,0x96,0x0a,0x7e] + +v_prng_b32 v5, 0.5 +// GFX1250: v_prng_b32_e32 v5, 0.5 ; encoding: [0xf0,0x96,0x0a,0x7e] + +v_prng_b32 v5, src_scc +// GFX1250: v_prng_b32_e32 v5, src_scc ; encoding: [0xfd,0x96,0x0a,0x7e] + +v_prng_b32 v255, 0xaf123456 +// GFX1250: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x96,0xfe,0x7f,0x56,0x34,0x12,0xaf] + v_rcp_bf16 v5, v1 // GFX1250: v_rcp_bf16_e32 v5, v1 ; encoding: [0x01,0xf3,0x0a,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s index 0a8ee84561d33..e1cd2e3043693 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s @@ -170,6 +170,58 @@ v_tanh_bf16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 f // GFX1250: v_tanh_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x94,0xfe,0x7e,0x7f,0x6f,0x35,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_prng_b32 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on 
this GPU + +v_prng_b32 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_mirror +// GFX1250: v_prng_b32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_half_mirror +// GFX1250: v_prng_b32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_shl:1 +// GFX1250: v_prng_b32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_shl:15 +// GFX1250: v_prng_b32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_shr:1 +// GFX1250: v_prng_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_shr:15 +// GFX1250: v_prng_b32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_ror:1 +// GFX1250: v_prng_b32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_ror:15 +// GFX1250: v_prng_b32_dpp v5, v1 row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_prng_b32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_prng_b32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_prng_b32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_rcp_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_rcp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s index d4afb9d9b2d9a..c1d3238b65cbd 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s @@ -178,6 +178,58 @@ v_tanh_bf16 v5.h, v1.h quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7f,0x81,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_prng_b32 v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported 
on this GPU + +v_prng_b32 v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_mirror +// GFX1250: v_prng_b32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_half_mirror +// GFX1250: v_prng_b32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_shl:1 +// GFX1250: v_prng_b32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_shl:15 +// GFX1250: v_prng_b32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_shr:1 +// GFX1250: v_prng_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_shr:15 +// GFX1250: v_prng_b32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_ror:1 +// GFX1250: v_prng_b32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_ror:15 +// GFX1250: v_prng_b32_dpp v5, v1 row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_prng_b32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX1250: v_prng_b32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x5f,0x01,0x01] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX1250: v_prng_b32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x60,0x09,0x13] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_rcp_bf16 v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_rcp_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xf2,0x0a,0x7e,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s index a7cb6bf8de69c..100e9f92ff58b 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -38,6 +38,18 @@ v_tanh_bf16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_tanh_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x94,0xfe,0x7e,0x7f,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_prng_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_prng_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_prng_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x96,0xfe,0x7f,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_rcp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s index 6acab7edc0d49..2ae103545443c 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -46,6 +46,18 @@ v_tanh_bf16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7f,0x81,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_prng_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX1250: v_prng_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x96,0xfe,0x7f,0xff,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_rcp_bf16 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: 
v_rcp_bf16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xf2,0x0a,0x7e,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 7486d849253e8..9c6a9127d82e4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -127,6 +127,42 @@ v_cvt_f32_fp8 v1, v3 byte_sel:1 clamp v_cvt_f32_fp8 v1, v3 byte_sel:2 clamp // GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00] +v_prng_b32_e64 v5, v1 +// GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] + +v_prng_b32_e64 v5, v255 +// GFX1250: v_prng_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xcb,0xd5,0xff,0x01,0x00,0x00] + +v_prng_b32_e64 v5, s1 +// GFX1250: v_prng_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x00,0x00,0x00] + +v_prng_b32_e64 v5, s105 +// GFX1250: v_prng_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xcb,0xd5,0x69,0x00,0x00,0x00] + +v_prng_b32_e64 v5, vcc_lo +// GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00] + +v_prng_b32_e64 v5, vcc_hi +// GFX1250: v_prng_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x6b,0x00,0x00,0x00] + +v_prng_b32_e64 v5, ttmp15 +// GFX1250: v_prng_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xcb,0xd5,0x7b,0x00,0x00,0x00] + +v_prng_b32_e64 v5, m0 +// GFX1250: v_prng_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xcb,0xd5,0x7d,0x00,0x00,0x00] + +v_prng_b32_e64 v5, exec_lo +// GFX1250: v_prng_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x7e,0x00,0x00,0x00] + +v_prng_b32_e64 v5, exec_hi +// GFX1250: v_prng_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x7f,0x00,0x00,0x00] + +v_prng_b32_e64 v5, null +// GFX1250: v_prng_b32_e64 v5, null ; encoding: [0x05,0x00,0xcb,0xd5,0x7c,0x00,0x00,0x00] + +v_prng_b32_e64 
v5, -1 +// GFX1250: v_prng_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00] + v_tanh_f32_e64 v5, v1 // GFX1250: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index b59b8b31e2d5f..2f57d1c331c42 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -223,6 +223,42 @@ v_tanh_f16_e64 v255, -|0x8000| clamp div:2 v_tanh_f16 v5.l, v128.h // GFX1250: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00] +v_prng_b32_e64 v5, v1 +// GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] + +v_prng_b32_e64 v5, v255 +// GFX1250: v_prng_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xcb,0xd5,0xff,0x01,0x00,0x00] + +v_prng_b32_e64 v5, s1 +// GFX1250: v_prng_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x00,0x00,0x00] + +v_prng_b32_e64 v5, s105 +// GFX1250: v_prng_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xcb,0xd5,0x69,0x00,0x00,0x00] + +v_prng_b32_e64 v5, vcc_lo +// GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00] + +v_prng_b32_e64 v5, vcc_hi +// GFX1250: v_prng_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x6b,0x00,0x00,0x00] + +v_prng_b32_e64 v5, ttmp15 +// GFX1250: v_prng_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xcb,0xd5,0x7b,0x00,0x00,0x00] + +v_prng_b32_e64 v5, m0 +// GFX1250: v_prng_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xcb,0xd5,0x7d,0x00,0x00,0x00] + +v_prng_b32_e64 v5, exec_lo +// GFX1250: v_prng_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x7e,0x00,0x00,0x00] + +v_prng_b32_e64 v5, exec_hi +// GFX1250: v_prng_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x7f,0x00,0x00,0x00] + +v_prng_b32_e64 v5, null +// GFX1250: v_prng_b32_e64 v5, null ; encoding: [0x05,0x00,0xcb,0xd5,0x7c,0x00,0x00,0x00] + +v_prng_b32_e64 v5, 
-1 +// GFX1250: v_prng_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00] + v_rcp_bf16_e64 v5, v1 // GFX1250: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index f7f20f46161ce..29bb842b529b7 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -170,6 +170,50 @@ v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mas // GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_mirror +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_prng_b32_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index e1241b01ccae1..7df92751c38d1 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -178,6 +178,50 @@ v_tanh_bf16_e64_dpp v5.h, v128.h quad_perm:[3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] +// GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3] +// GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_mirror +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_half_mirror +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_shl:1 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:1 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_shl:15 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_shr:1 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_shr:15 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_ror:1 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_ror:15 +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf +// GFX1250: v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 0106175301d20..d235aeb9f3e62 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -50,6 +50,10 @@ v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX1250: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index 93b86f3ffb841..f25e2a5882436 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -58,6 +58,10 @@ v_tanh_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index 5f37ba91e071b..aa968b2bb2bee 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -200,6 +200,51 @@ 0x81,0x95,0x0a,0x7f # GFX1250-REAL16: v_tanh_bf16_e32 v5.h, v1.h ; encoding: [0x81,0x95,0x0a,0x7f] +0xff,0x96,0xfe,0x7f,0x56,0x34,0x12,0xaf +# GFX1250: v_prng_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x96,0xfe,0x7f,0x56,0x34,0x12,0xaf] + +0xc1,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, -1 ; encoding: [0xc1,0x96,0x0a,0x7e] + +0xf0,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, 0.5 ; encoding: [0xf0,0x96,0x0a,0x7e] + +0x7f,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, exec_hi ; encoding: [0x7f,0x96,0x0a,0x7e] + +0x7e,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, exec_lo ; encoding: [0x7e,0x96,0x0a,0x7e] + +0x7d,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, m0 ; encoding: [0x7d,0x96,0x0a,0x7e] + +0x7c,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, null ; encoding: [0x7c,0x96,0x0a,0x7e] + +0x01,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, s1 ; encoding: [0x01,0x96,0x0a,0x7e] + +0x69,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, s105 ; encoding: [0x69,0x96,0x0a,0x7e] + +0xfd,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, src_scc ; encoding: [0xfd,0x96,0x0a,0x7e] + +0x7b,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, ttmp15 ; encoding: [0x7b,0x96,0x0a,0x7e] + +0x01,0x97,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, v1 ; encoding: [0x01,0x97,0x0a,0x7e] + 
+0xff,0x97,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, v255 ; encoding: [0xff,0x97,0x0a,0x7e] + +0x6b,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, vcc_hi ; encoding: [0x6b,0x96,0x0a,0x7e] + +0x6a,0x96,0x0a,0x7e +# GFX1250: v_prng_b32_e32 v5, vcc_lo ; encoding: [0x6a,0x96,0x0a,0x7e] + 0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e32 v127.l, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf2,0xfe,0x7e,0x00,0x80,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt index 57bee2766ce44..913a2a916ff62 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -162,6 +162,45 @@ 0xfa,0x94,0x0a,0x7f,0x81,0x1b,0x00,0xff # GFX1250-REAL16: v_tanh_bf16_dpp v5.h, v1.h quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7f,0x81,0x1b,0x00,0xff] +0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0xff +# GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x1b,0x00,0xff +# GFX1250: v_prng_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1b,0x00,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x41,0x01,0xff +# GFX1250: v_prng_b32_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x41,0x01,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x40,0x01,0xff +# GFX1250: v_prng_b32_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x40,0x01,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x21,0x01,0xff +# GFX1250: v_prng_b32_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x21,0x01,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x2f,0x01,0xff +# GFX1250: v_prng_b32_dpp v5, v1 row_ror:15 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x2f,0x01,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x50,0x01,0xff +# GFX1250: v_prng_b32_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x50,0x01,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX1250: v_prng_b32_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x5f,0x01,0x01] + +0xfa,0x96,0x0a,0x7e,0x01,0x01,0x01,0xff +# GFX1250: v_prng_b32_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x01,0x01,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x0f,0x01,0xff +# GFX1250: v_prng_b32_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x0f,0x01,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x11,0x01,0xff +# GFX1250: v_prng_b32_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x11,0x01,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x1f,0x01,0xff +# GFX1250: v_prng_b32_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x1f,0x01,0xff] + +0xfa,0x96,0x0a,0x7e,0x01,0x60,0x09,0x13 +# GFX1250: v_prng_b32_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x96,0x0a,0x7e,0x01,0x60,0x09,0x13] + 0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30 # GFX1250-REAL16: v_rcp_bf16_dpp v127.l, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] # GFX1250-FAKE16: v_rcp_bf16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xf2,0xfe,0x7e,0x7f,0x6f,0x35,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index 28ec6b11b4de3..4afe44e241bf3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -43,6 +43,15 @@ # GFX1250-REAL16: v_tanh_bf16_dpp v5.h, v1.h 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x94,0x0a,0x7f,0x81,0x77,0x39,0x05] # GFX1250-FAKE16: v_add_f64_e32 v[156:157], v[129:130], v[187:188] ; encoding: [0x81,0x77,0x39,0x05] +0xe9,0x96,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX1250: v_prng_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0x96,0xfe,0x7f,0xff,0x00,0x00,0x00] + +0xe9,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05] + +0xea,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05 +# GFX1250: v_prng_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x96,0x0a,0x7e,0x01,0x77,0x39,0x05] + 0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_dpp v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xf2,0xfe,0x7e,0x7f,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 5004762729701..1cf3b8807d044 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -175,6 +175,42 @@ # GFX1250-REAL16: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_tanh_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xca,0xd5,0x80,0x01,0x00,0x00] +0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, m0 ; encoding: 
[0x05,0x00,0xcb,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, null ; encoding: [0x05,0x00,0xcb,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xcb,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xcb,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xcb,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00] + 0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00 # GFX1250-REAL16: v_rcp_bf16_e64 v255.l, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] # GFX1250-FAKE16: v_rcp_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xf9,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index de908b95d94f9..83a647ad7c658 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -104,6 +104,39 @@ # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] 
# GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 
row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + 0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index cfe7173c383b3..ef5ede4d1d453 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-REAL16 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck 
-check-prefixes=GFX1250,GFX1250-FAKE16 %s 0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX1250: v_tanh_f32_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0x9e,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] @@ -34,6 +34,9 @@ # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + 0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] From de959569f7ae468736b5f98ae3ce69b9eb3825ec Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 18 Jul 2025 17:00:29 +0200 Subject: [PATCH 341/813] [AddressSanitizer] Generate test checks (NFC) --- .../AddressSanitizer/lifetime.ll | 447 +++++++++++++++--- 1 file changed, 391 insertions(+), 56 deletions(-) diff --git a/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll b/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll index 1d073cdc3bdb9..a878dbe94d11d 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/lifetime.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; Test handling of llvm.lifetime intrinsics. 
; RUN: opt < %s -passes=asan -asan-use-after-scope -asan-use-after-return=never -asan-use-stack-safety=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK-DEFAULT ; RUN: opt < %s -passes=asan -asan-use-after-scope -asan-use-after-return=never -asan-use-stack-safety=0 -asan-instrument-dynamic-allocas=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK-NO-DYNAMIC @@ -8,109 +9,389 @@ target triple = "x86_64-unknown-linux-gnu" declare void @llvm.lifetime.start.p0(i64, ptr nocapture) nounwind declare void @llvm.lifetime.end.p0(i64, ptr nocapture) nounwind -; CHECK-LABEL: define void @lifetime_no_size( define void @lifetime_no_size(i64 %i) sanitize_address { +; CHECK-LABEL: define void @lifetime_no_size( +; CHECK-SAME: i64 [[I:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32 +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP0]] to ptr +; CHECK-NEXT: store i64 1102416563, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack to i64), ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: store i64 ptrtoint (ptr @lifetime_no_size to i64), ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 2147450880 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: store i64 -868083117767659023, ptr [[TMP11]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 -1, ptr [[TMP2]]) +; CHECK-NEXT: [[AI:%.*]] = getelementptr inbounds [2 x i32], ptr [[TMP2]], i64 0, i64 [[I]] +; CHECK-NEXT: 
[[TMP12:%.*]] = ptrtoint ptr [[AI]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = lshr i64 [[TMP12]], 3 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP13]], 2147450880 +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i8 [[TMP16]], 0 +; CHECK-NEXT: br i1 [[TMP17]], label %[[BB18:.*]], label %[[BB23:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[TMP19:%.*]] = and i64 [[TMP12]], 7 +; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP19]] to i8 +; CHECK-NEXT: [[TMP21:%.*]] = icmp sge i8 [[TMP20]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP21]], label %[[BB22:.*]], label %[[BB23]] +; CHECK: [[BB22]]: +; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP12]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB23]]: +; CHECK-NEXT: store volatile i8 0, ptr [[AI]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 -1, ptr [[TMP2]]) +; CHECK-NEXT: store i64 1172321806, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr +; CHECK-NEXT: store i64 0, ptr [[TMP25]], align 1 +; CHECK-NEXT: ret void +; entry: %a = alloca [2 x i32], align 4 ; Poison memory in prologue: 0xf3f3f300f1f1f1f1 - ; CHECK: store i64 -868083117767659023, ptr %[[#]] call void @llvm.lifetime.start.p0(i64 -1, ptr %a) ; Check that lifetime with no size are ignored. - ; CHECK-NOT: store - ; CHECK: call void @llvm.lifetime.start %ai = getelementptr inbounds [2 x i32], ptr %a, i64 0, i64 %i store volatile i8 0, ptr %ai, align 4 - ; CHECK: store volatile call void @llvm.lifetime.end.p0(i64 -1, ptr %a) ; Check that lifetime with no size are ignored. - ; CHECK-NOT: store - ; CHECK: call void @llvm.lifetime.end ; Unpoison stack frame on exit. - ; CHECK: store i64 0, ptr %{{[0-9]+}} - ; CHECK: ret void ret void } ; Generic case of lifetime analysis. 
define void @lifetime() sanitize_address { - ; CHECK-LABEL: define void @lifetime() +; CHECK-DEFAULT-LABEL: define void @lifetime( +; CHECK-DEFAULT-SAME: ) #[[ATTR1]] { +; CHECK-DEFAULT-NEXT: [[TMP1:%.*]] = alloca i64, align 32 +; CHECK-DEFAULT-NEXT: store i64 0, ptr [[TMP1]], align 8 +; CHECK-DEFAULT-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32 +; CHECK-DEFAULT-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64 +; CHECK-DEFAULT-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], 32 +; CHECK-DEFAULT-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-DEFAULT-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-DEFAULT-NEXT: store i64 1102416563, ptr [[TMP5]], align 8 +; CHECK-DEFAULT-NEXT: [[TMP6:%.*]] = add i64 [[TMP2]], 8 +; CHECK-DEFAULT-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-DEFAULT-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.1 to i64), ptr [[TMP7]], align 8 +; CHECK-DEFAULT-NEXT: [[TMP8:%.*]] = add i64 [[TMP2]], 16 +; CHECK-DEFAULT-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-DEFAULT-NEXT: store i64 ptrtoint (ptr @lifetime to i64), ptr [[TMP9]], align 8 +; CHECK-DEFAULT-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP2]], 3 +; CHECK-DEFAULT-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 2147450880 +; CHECK-DEFAULT-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; CHECK-DEFAULT-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-DEFAULT-NEXT: store i64 -868082052615769615, ptr [[TMP13]], align 1 +; CHECK-DEFAULT-NEXT: [[TMP14:%.*]] = add i64 [[TMP11]], 4 +; CHECK-DEFAULT-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr +; CHECK-DEFAULT-NEXT: store i8 4, ptr [[TMP15]], align 1 +; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr [[TMP4]]) +; CHECK-DEFAULT-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; CHECK-DEFAULT-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP16]], 3 +; CHECK-DEFAULT-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 2147450880 +; CHECK-DEFAULT-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; 
CHECK-DEFAULT-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP19]], align 1 +; CHECK-DEFAULT-NEXT: [[TMP21:%.*]] = icmp ne i8 [[TMP20]], 0 +; CHECK-DEFAULT-NEXT: br i1 [[TMP21]], label %[[BB22:.*]], label %[[BB27:.*]], !prof [[PROF1]] +; CHECK-DEFAULT: [[BB22]]: +; CHECK-DEFAULT-NEXT: [[TMP23:%.*]] = and i64 [[TMP16]], 7 +; CHECK-DEFAULT-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i8 +; CHECK-DEFAULT-NEXT: [[TMP25:%.*]] = icmp sge i8 [[TMP24]], [[TMP20]] +; CHECK-DEFAULT-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[BB27]] +; CHECK-DEFAULT: [[BB26]]: +; CHECK-DEFAULT-NEXT: call void @__asan_report_store1(i64 [[TMP16]]) #[[ATTR4]] +; CHECK-DEFAULT-NEXT: unreachable +; CHECK-DEFAULT: [[BB27]]: +; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP4]], align 1 +; CHECK-DEFAULT-NEXT: [[TMP28:%.*]] = add i64 [[TMP11]], 4 +; CHECK-DEFAULT-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP29]], align 1 +; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP4]]) +; CHECK-DEFAULT-NEXT: [[TMP30:%.*]] = add i64 [[TMP11]], 4 +; CHECK-DEFAULT-NEXT: [[TMP31:%.*]] = inttoptr i64 [[TMP30]] to ptr +; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP31]], align 1 +; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[TMP4]]) +; CHECK-DEFAULT-NEXT: [[TMP32:%.*]] = alloca i8, i64 128, align 32 +; CHECK-DEFAULT-NEXT: [[TMP33:%.*]] = ptrtoint ptr [[TMP32]] to i64 +; CHECK-DEFAULT-NEXT: [[TMP34:%.*]] = add i64 [[TMP33]], 32 +; CHECK-DEFAULT-NEXT: call void @__asan_alloca_poison(i64 [[TMP34]], i64 40) +; CHECK-DEFAULT-NEXT: [[TMP35:%.*]] = ptrtoint ptr [[TMP32]] to i64 +; CHECK-DEFAULT-NEXT: store i64 [[TMP35]], ptr [[TMP1]], align 8 +; CHECK-DEFAULT-NEXT: [[TMP36:%.*]] = inttoptr i64 [[TMP34]] to ptr +; CHECK-DEFAULT-NEXT: [[TMP37:%.*]] = ptrtoint ptr [[TMP36]] to i64 +; CHECK-DEFAULT-NEXT: call void @__asan_unpoison_stack_memory(i64 [[TMP37]], i64 40) +; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr 
[[TMP36]]) +; CHECK-DEFAULT-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[TMP36]] to i64 +; CHECK-DEFAULT-NEXT: [[TMP39:%.*]] = lshr i64 [[TMP38]], 3 +; CHECK-DEFAULT-NEXT: [[TMP40:%.*]] = add i64 [[TMP39]], 2147450880 +; CHECK-DEFAULT-NEXT: [[TMP41:%.*]] = inttoptr i64 [[TMP40]] to ptr +; CHECK-DEFAULT-NEXT: [[TMP42:%.*]] = load i8, ptr [[TMP41]], align 1 +; CHECK-DEFAULT-NEXT: [[TMP43:%.*]] = icmp ne i8 [[TMP42]], 0 +; CHECK-DEFAULT-NEXT: br i1 [[TMP43]], label %[[BB44:.*]], label %[[BB49:.*]], !prof [[PROF1]] +; CHECK-DEFAULT: [[BB44]]: +; CHECK-DEFAULT-NEXT: [[TMP45:%.*]] = and i64 [[TMP38]], 7 +; CHECK-DEFAULT-NEXT: [[TMP46:%.*]] = trunc i64 [[TMP45]] to i8 +; CHECK-DEFAULT-NEXT: [[TMP47:%.*]] = icmp sge i8 [[TMP46]], [[TMP42]] +; CHECK-DEFAULT-NEXT: br i1 [[TMP47]], label %[[BB48:.*]], label %[[BB49]] +; CHECK-DEFAULT: [[BB48]]: +; CHECK-DEFAULT-NEXT: call void @__asan_report_store1(i64 [[TMP38]]) #[[ATTR4]] +; CHECK-DEFAULT-NEXT: unreachable +; CHECK-DEFAULT: [[BB49]]: +; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP36]], align 1 +; CHECK-DEFAULT-NEXT: [[TMP50:%.*]] = ptrtoint ptr [[TMP36]] to i64 +; CHECK-DEFAULT-NEXT: call void @__asan_poison_stack_memory(i64 [[TMP50]], i64 40) +; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr [[TMP36]]) +; CHECK-DEFAULT-NEXT: [[TMP51:%.*]] = add i64 [[TMP11]], 4 +; CHECK-DEFAULT-NEXT: [[TMP52:%.*]] = inttoptr i64 [[TMP51]] to ptr +; CHECK-DEFAULT-NEXT: store i8 4, ptr [[TMP52]], align 1 +; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[TMP4]]) +; CHECK-DEFAULT-NEXT: [[TMP53:%.*]] = ptrtoint ptr [[TMP4]] to i64 +; CHECK-DEFAULT-NEXT: [[TMP54:%.*]] = lshr i64 [[TMP53]], 3 +; CHECK-DEFAULT-NEXT: [[TMP55:%.*]] = add i64 [[TMP54]], 2147450880 +; CHECK-DEFAULT-NEXT: [[TMP56:%.*]] = inttoptr i64 [[TMP55]] to ptr +; CHECK-DEFAULT-NEXT: [[TMP57:%.*]] = load i8, ptr [[TMP56]], align 1 +; CHECK-DEFAULT-NEXT: [[TMP58:%.*]] = icmp ne i8 [[TMP57]], 0 +; CHECK-DEFAULT-NEXT: br i1 [[TMP58]], label 
%[[BB59:.*]], label %[[BB64:.*]], !prof [[PROF1]] +; CHECK-DEFAULT: [[BB59]]: +; CHECK-DEFAULT-NEXT: [[TMP60:%.*]] = and i64 [[TMP53]], 7 +; CHECK-DEFAULT-NEXT: [[TMP61:%.*]] = trunc i64 [[TMP60]] to i8 +; CHECK-DEFAULT-NEXT: [[TMP62:%.*]] = icmp sge i8 [[TMP61]], [[TMP57]] +; CHECK-DEFAULT-NEXT: br i1 [[TMP62]], label %[[BB63:.*]], label %[[BB64]] +; CHECK-DEFAULT: [[BB63]]: +; CHECK-DEFAULT-NEXT: call void @__asan_report_store1(i64 [[TMP53]]) #[[ATTR4]] +; CHECK-DEFAULT-NEXT: unreachable +; CHECK-DEFAULT: [[BB64]]: +; CHECK-DEFAULT-NEXT: store volatile i8 0, ptr [[TMP4]], align 1 +; CHECK-DEFAULT-NEXT: [[TMP65:%.*]] = add i64 [[TMP11]], 4 +; CHECK-DEFAULT-NEXT: [[TMP66:%.*]] = inttoptr i64 [[TMP65]] to ptr +; CHECK-DEFAULT-NEXT: store i8 -8, ptr [[TMP66]], align 1 +; CHECK-DEFAULT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP4]]) +; CHECK-DEFAULT-NEXT: [[TMP67:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-DEFAULT-NEXT: [[TMP68:%.*]] = load i64, ptr [[TMP1]], align 8 +; CHECK-DEFAULT-NEXT: call void @__asan_allocas_unpoison(i64 [[TMP68]], i64 [[TMP67]]) +; CHECK-DEFAULT-NEXT: store i64 1172321806, ptr [[TMP5]], align 8 +; CHECK-DEFAULT-NEXT: [[TMP69:%.*]] = add i64 [[TMP11]], 0 +; CHECK-DEFAULT-NEXT: [[TMP70:%.*]] = inttoptr i64 [[TMP69]] to ptr +; CHECK-DEFAULT-NEXT: store i64 0, ptr [[TMP70]], align 1 +; CHECK-DEFAULT-NEXT: ret void +; +; CHECK-NO-DYNAMIC-LABEL: define void @lifetime( +; CHECK-NO-DYNAMIC-SAME: ) #[[ATTR1]] { +; CHECK-NO-DYNAMIC-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32 +; CHECK-NO-DYNAMIC-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64 +; CHECK-NO-DYNAMIC-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 32 +; CHECK-NO-DYNAMIC-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +; CHECK-NO-DYNAMIC-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NO-DYNAMIC-NEXT: store i64 1102416563, ptr [[TMP4]], align 8 +; CHECK-NO-DYNAMIC-NEXT: [[TMP5:%.*]] = add i64 [[TMP1]], 8 +; CHECK-NO-DYNAMIC-NEXT: [[TMP6:%.*]] = inttoptr i64 
[[TMP5]] to ptr +; CHECK-NO-DYNAMIC-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.1 to i64), ptr [[TMP6]], align 8 +; CHECK-NO-DYNAMIC-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], 16 +; CHECK-NO-DYNAMIC-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NO-DYNAMIC-NEXT: store i64 ptrtoint (ptr @lifetime to i64), ptr [[TMP8]], align 8 +; CHECK-NO-DYNAMIC-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP1]], 3 +; CHECK-NO-DYNAMIC-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 2147450880 +; CHECK-NO-DYNAMIC-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 0 +; CHECK-NO-DYNAMIC-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NO-DYNAMIC-NEXT: store i64 -868082052615769615, ptr [[TMP12]], align 1 +; CHECK-NO-DYNAMIC-NEXT: [[TMP13:%.*]] = add i64 [[TMP10]], 4 +; CHECK-NO-DYNAMIC-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NO-DYNAMIC-NEXT: store i8 4, ptr [[TMP14]], align 1 +; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr [[TMP3]]) +; CHECK-NO-DYNAMIC-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[TMP3]] to i64 +; CHECK-NO-DYNAMIC-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP15]], 3 +; CHECK-NO-DYNAMIC-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 2147450880 +; CHECK-NO-DYNAMIC-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr +; CHECK-NO-DYNAMIC-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; CHECK-NO-DYNAMIC-NEXT: [[TMP20:%.*]] = icmp ne i8 [[TMP19]], 0 +; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP20]], label %[[BB21:.*]], label %[[BB26:.*]], !prof [[PROF1]] +; CHECK-NO-DYNAMIC: [[BB21]]: +; CHECK-NO-DYNAMIC-NEXT: [[TMP22:%.*]] = and i64 [[TMP15]], 7 +; CHECK-NO-DYNAMIC-NEXT: [[TMP23:%.*]] = trunc i64 [[TMP22]] to i8 +; CHECK-NO-DYNAMIC-NEXT: [[TMP24:%.*]] = icmp sge i8 [[TMP23]], [[TMP19]] +; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP24]], label %[[BB25:.*]], label %[[BB26]] +; CHECK-NO-DYNAMIC: [[BB25]]: +; CHECK-NO-DYNAMIC-NEXT: call void @__asan_report_store1(i64 [[TMP15]]) #[[ATTR4]] +; CHECK-NO-DYNAMIC-NEXT: unreachable +; CHECK-NO-DYNAMIC: [[BB26]]: +; 
CHECK-NO-DYNAMIC-NEXT: store volatile i8 0, ptr [[TMP3]], align 1 +; CHECK-NO-DYNAMIC-NEXT: [[TMP27:%.*]] = add i64 [[TMP10]], 4 +; CHECK-NO-DYNAMIC-NEXT: [[TMP28:%.*]] = inttoptr i64 [[TMP27]] to ptr +; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP28]], align 1 +; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP3]]) +; CHECK-NO-DYNAMIC-NEXT: [[TMP29:%.*]] = add i64 [[TMP10]], 4 +; CHECK-NO-DYNAMIC-NEXT: [[TMP30:%.*]] = inttoptr i64 [[TMP29]] to ptr +; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP30]], align 1 +; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[TMP3]]) +; CHECK-NO-DYNAMIC-NEXT: [[ARR:%.*]] = alloca [10 x i32], align 16 +; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 40, ptr [[ARR]]) +; CHECK-NO-DYNAMIC-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[ARR]] to i64 +; CHECK-NO-DYNAMIC-NEXT: [[TMP32:%.*]] = lshr i64 [[TMP31]], 3 +; CHECK-NO-DYNAMIC-NEXT: [[TMP33:%.*]] = add i64 [[TMP32]], 2147450880 +; CHECK-NO-DYNAMIC-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NO-DYNAMIC-NEXT: [[TMP35:%.*]] = load i8, ptr [[TMP34]], align 1 +; CHECK-NO-DYNAMIC-NEXT: [[TMP36:%.*]] = icmp ne i8 [[TMP35]], 0 +; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP36]], label %[[BB37:.*]], label %[[BB42:.*]], !prof [[PROF1]] +; CHECK-NO-DYNAMIC: [[BB37]]: +; CHECK-NO-DYNAMIC-NEXT: [[TMP38:%.*]] = and i64 [[TMP31]], 7 +; CHECK-NO-DYNAMIC-NEXT: [[TMP39:%.*]] = trunc i64 [[TMP38]] to i8 +; CHECK-NO-DYNAMIC-NEXT: [[TMP40:%.*]] = icmp sge i8 [[TMP39]], [[TMP35]] +; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP40]], label %[[BB41:.*]], label %[[BB42]] +; CHECK-NO-DYNAMIC: [[BB41]]: +; CHECK-NO-DYNAMIC-NEXT: call void @__asan_report_store1(i64 [[TMP31]]) #[[ATTR4]] +; CHECK-NO-DYNAMIC-NEXT: unreachable +; CHECK-NO-DYNAMIC: [[BB42]]: +; CHECK-NO-DYNAMIC-NEXT: store volatile i8 0, ptr [[ARR]], align 1 +; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 40, ptr [[ARR]]) +; CHECK-NO-DYNAMIC-NEXT: [[TMP43:%.*]] = add i64 [[TMP10]], 4 +; 
CHECK-NO-DYNAMIC-NEXT: [[TMP44:%.*]] = inttoptr i64 [[TMP43]] to ptr +; CHECK-NO-DYNAMIC-NEXT: store i8 4, ptr [[TMP44]], align 1 +; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[TMP3]]) +; CHECK-NO-DYNAMIC-NEXT: [[TMP45:%.*]] = ptrtoint ptr [[TMP3]] to i64 +; CHECK-NO-DYNAMIC-NEXT: [[TMP46:%.*]] = lshr i64 [[TMP45]], 3 +; CHECK-NO-DYNAMIC-NEXT: [[TMP47:%.*]] = add i64 [[TMP46]], 2147450880 +; CHECK-NO-DYNAMIC-NEXT: [[TMP48:%.*]] = inttoptr i64 [[TMP47]] to ptr +; CHECK-NO-DYNAMIC-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; CHECK-NO-DYNAMIC-NEXT: [[TMP50:%.*]] = icmp ne i8 [[TMP49]], 0 +; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP50]], label %[[BB51:.*]], label %[[BB56:.*]], !prof [[PROF1]] +; CHECK-NO-DYNAMIC: [[BB51]]: +; CHECK-NO-DYNAMIC-NEXT: [[TMP52:%.*]] = and i64 [[TMP45]], 7 +; CHECK-NO-DYNAMIC-NEXT: [[TMP53:%.*]] = trunc i64 [[TMP52]] to i8 +; CHECK-NO-DYNAMIC-NEXT: [[TMP54:%.*]] = icmp sge i8 [[TMP53]], [[TMP49]] +; CHECK-NO-DYNAMIC-NEXT: br i1 [[TMP54]], label %[[BB55:.*]], label %[[BB56]] +; CHECK-NO-DYNAMIC: [[BB55]]: +; CHECK-NO-DYNAMIC-NEXT: call void @__asan_report_store1(i64 [[TMP45]]) #[[ATTR4]] +; CHECK-NO-DYNAMIC-NEXT: unreachable +; CHECK-NO-DYNAMIC: [[BB56]]: +; CHECK-NO-DYNAMIC-NEXT: store volatile i8 0, ptr [[TMP3]], align 1 +; CHECK-NO-DYNAMIC-NEXT: [[TMP57:%.*]] = add i64 [[TMP10]], 4 +; CHECK-NO-DYNAMIC-NEXT: [[TMP58:%.*]] = inttoptr i64 [[TMP57]] to ptr +; CHECK-NO-DYNAMIC-NEXT: store i8 -8, ptr [[TMP58]], align 1 +; CHECK-NO-DYNAMIC-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TMP3]]) +; CHECK-NO-DYNAMIC-NEXT: store i64 1172321806, ptr [[TMP4]], align 8 +; CHECK-NO-DYNAMIC-NEXT: [[TMP59:%.*]] = add i64 [[TMP10]], 0 +; CHECK-NO-DYNAMIC-NEXT: [[TMP60:%.*]] = inttoptr i64 [[TMP59]] to ptr +; CHECK-NO-DYNAMIC-NEXT: store i64 0, ptr [[TMP60]], align 1 +; CHECK-NO-DYNAMIC-NEXT: ret void +; ; Regular variable lifetime intrinsics. 
%i = alloca i32, align 4 ; Poison memory in prologue: F1F1F1F1F8F3F3F3 - ; CHECK: store i64 -868082052615769615, ptr %{{[0-9]+}} ; Memory is unpoisoned at llvm.lifetime.start call void @llvm.lifetime.start.p0(i64 3, ptr %i) - ; CHECK: store i8 4, ptr %{{[0-9]+}} - ; CHECK-NEXT: llvm.lifetime.start store volatile i8 0, ptr %i - ; CHECK: store volatile call void @llvm.lifetime.end.p0(i64 4, ptr %i) - ; CHECK: store i8 -8, ptr %{{[0-9]+}} - ; CHECK-NEXT: call void @llvm.lifetime.end ; Memory is poisoned at every call to llvm.lifetime.end call void @llvm.lifetime.end.p0(i64 2, ptr %i) - ; CHECK: store i8 -8, ptr %{{[0-9]+}} - ; CHECK-NEXT: call void @llvm.lifetime.end ; Lifetime intrinsics for array. %arr = alloca [10 x i32], align 16 call void @llvm.lifetime.start.p0(i64 40, ptr %arr) - ; CHECK-DEFAULT: call void @__asan_unpoison_stack_memory(i64 %{{[^ ]+}}, i64 40) - ; CHECK-NO-DYNAMIC-NOT: call void @__asan_unpoison_stack_memory(i64 %{{[^ ]+}}, i64 40) store volatile i8 0, ptr %arr - ; CHECK: store volatile call void @llvm.lifetime.end.p0(i64 40, ptr %arr) - ; CHECK-DEFAULT: call void @__asan_poison_stack_memory(i64 %{{[^ ]+}}, i64 40) - ; CHECK-NO-DYNAMIC-NOT: call void @__asan_poison_stack_memory(i64 %{{[^ ]+}}, i64 40) ; One more lifetime start/end for the same variable %i. call void @llvm.lifetime.start.p0(i64 2, ptr %i) - ; CHECK: store i8 4, ptr %{{[0-9]+}} - ; CHECK-NEXT: llvm.lifetime.start store volatile i8 0, ptr %i - ; CHECK: store volatile call void @llvm.lifetime.end.p0(i64 4, ptr %i) - ; CHECK: store i8 -8, ptr %{{[0-9]+}} - ; CHECK-NEXT: llvm.lifetime.end ; Memory is unpoisoned at function exit (only once). - ; CHECK: store i64 0, ptr %{{[0-9]+}} - ; CHECK-NEXT: ret void ret void } ; Check that arguments of lifetime may come from phi nodes. 
define void @phi_args(i1 %x) sanitize_address { - ; CHECK-LABEL: define void @phi_args(i1 %x) +; CHECK-LABEL: define void @phi_args( +; CHECK-SAME: i1 [[X:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 64, align 32 +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP0]] to ptr +; CHECK-NEXT: store i64 1102416563, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.2 to i64), ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: store i64 ptrtoint (ptr @phi_args to i64), ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], 2147450880 +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: store i64 -868082052615769615, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP9]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: store i8 0, ptr [[TMP13]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[TMP2]]) +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP2]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 3 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP15]], 2147450880 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i8 [[TMP18]], 0 +; CHECK-NEXT: br i1 [[TMP19]], label %[[BB20:.*]], label %[[BB25:.*]], !prof [[PROF1]] +; CHECK: [[BB20]]: +; CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP14]], 7 +; CHECK-NEXT: 
[[TMP22:%.*]] = trunc i64 [[TMP21]] to i8 +; CHECK-NEXT: [[TMP23:%.*]] = icmp sge i8 [[TMP22]], [[TMP18]] +; CHECK-NEXT: br i1 [[TMP23]], label %[[BB24:.*]], label %[[BB25]] +; CHECK: [[BB24]]: +; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP14]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB25]]: +; CHECK-NEXT: store volatile i8 0, ptr [[TMP2]], align 1 +; CHECK-NEXT: br i1 [[X]], label %[[BB0:.*]], label %[[BB1:.*]] +; CHECK: [[BB0]]: +; CHECK-NEXT: br label %[[BB1]] +; CHECK: [[BB1]]: +; CHECK-NEXT: [[I_PHI:%.*]] = phi ptr [ [[TMP2]], %[[BB25]] ], [ [[TMP2]], %[[BB0]] ] +; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[TMP9]], 4 +; CHECK-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP26]] to ptr +; CHECK-NEXT: store i8 -8, ptr [[TMP27]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[I_PHI]]) +; CHECK-NEXT: store i64 1172321806, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[TMP28:%.*]] = add i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: store i64 0, ptr [[TMP29]], align 1 +; CHECK-NEXT: ret void +; entry: %i = alloca i64, align 4 ; Poison memory in prologue: F1F1F1F1F8F3F3F3 - ; CHECK: store i64 -868082052615769615, ptr %{{[0-9]+}} call void @llvm.lifetime.start.p0(i64 8, ptr %i) - ; CHECK: store i8 0, ptr %{{[0-9]+}} - ; CHECK-NEXT: llvm.lifetime.start store volatile i8 0, ptr %i - ; CHECK: store volatile br i1 %x, label %bb0, label %bb1 @@ -120,49 +401,101 @@ bb0: bb1: %i.phi = phi ptr [ %i, %entry ], [ %i, %bb0 ] call void @llvm.lifetime.end.p0(i64 8, ptr %i.phi) - ; CHECK: store i8 -8, ptr %{{[0-9]+}} - ; CHECK-NEXT: llvm.lifetime.end ret void - ; CHECK: store i64 0, ptr %{{[0-9]+}} - ; CHECK-NEXT: ret void } ; Check that arguments of lifetime may come from getelementptr nodes. 
define void @getelementptr_args(i64 %i) sanitize_address{ - ; CHECK-LABEL: define void @getelementptr_args +; CHECK-LABEL: define void @getelementptr_args( +; CHECK-SAME: i64 [[I:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[MYALLOCA:%.*]] = alloca i8, i64 1216, align 32 +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[MYALLOCA]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], 32 +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP0]], 1184 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP0]] to ptr +; CHECK-NEXT: store i64 1102416563, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP0]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: store i64 ptrtoint (ptr @___asan_gen_stack.3 to i64), ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP0]], 16 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: store i64 ptrtoint (ptr @getelementptr_args to i64), ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = lshr i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], 2147450880 +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: store i32 -235802127, ptr [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[TMP11]], 4 +; CHECK-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP14]], i64 128) +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP11]], 132 +; CHECK-NEXT: [[TMP16:%.*]] = inttoptr i64 [[TMP15]] to ptr +; CHECK-NEXT: store i64 -940422246894996750, ptr [[TMP16]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[TMP11]], 140 +; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr +; CHECK-NEXT: store i64 -940422246894996750, ptr [[TMP18]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[TMP11]], 150 +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr +; 
CHECK-NEXT: store i16 -3085, ptr [[TMP20]], align 1 +; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP11]], 4 +; CHECK-NEXT: call void @__asan_set_shadow_00(i64 [[TMP21]], i64 128) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 1024, ptr [[TMP2]]) +; CHECK-NEXT: [[AI:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP4]], i64 0, i64 [[I]] +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[AI]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 3 +; CHECK-NEXT: [[TMP24:%.*]] = add i64 [[TMP23]], 2147450880 +; CHECK-NEXT: [[TMP25:%.*]] = inttoptr i64 [[TMP24]] to ptr +; CHECK-NEXT: [[TMP26:%.*]] = load i8, ptr [[TMP25]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i8 [[TMP26]], 0 +; CHECK-NEXT: br i1 [[TMP27]], label %[[BB28:.*]], label %[[BB29:.*]] +; CHECK: [[BB28]]: +; CHECK-NEXT: call void @__asan_report_store8(i64 [[TMP22]]) #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB29]]: +; CHECK-NEXT: store ptr [[TMP2]], ptr [[AI]], align 8 +; CHECK-NEXT: [[TMP30:%.*]] = add i64 [[TMP11]], 4 +; CHECK-NEXT: call void @__asan_set_shadow_f8(i64 [[TMP30]], i64 128) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 1024, ptr [[TMP2]]) +; CHECK-NEXT: store i64 1172321806, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP31:%.*]] = add i64 [[TMP11]], 0 +; CHECK-NEXT: call void @__asan_set_shadow_00(i64 [[TMP31]], i64 148) +; CHECK-NEXT: [[TMP32:%.*]] = add i64 [[TMP11]], 150 +; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr +; CHECK-NEXT: store i16 0, ptr [[TMP33]], align 1 +; CHECK-NEXT: ret void +; entry: %x = alloca [1024 x i8], align 16 %a = alloca [2 x ptr], align 8 ; F1F1F1F1 - ; CHECK: store i32 -235802127, ptr %{{[0-9]+}} - ; CHECK: call void @__asan_set_shadow_f8(i64 %[[#]], i64 128) ; 0xf2f2f2f2f2f2f2f2 - ; CHECK: store i64 -940422246894996750, ptr %[[#]] ; 0xf2f2f2f2f2f2f2f2 - ; CHECK: store i64 -940422246894996750, ptr %[[#]] call void @llvm.lifetime.start.p0(i64 1024, ptr %x) - ; CHECK: call void @__asan_set_shadow_00(i64 %{{[0-9]+}}, i64 128) - ; 
CHECK-NEXT: call void @llvm.lifetime.start %ai = getelementptr inbounds [2 x ptr], ptr %a, i64 0, i64 %i store ptr %x, ptr %ai, align 8 - ; CHECK: store ptr call void @llvm.lifetime.end.p0(i64 1024, ptr %x) - ; CHECK: call void @__asan_set_shadow_f8(i64 %{{[0-9]+}}, i64 128) - ; CHECK-NEXT: call void @llvm.lifetime.end ret void - ; CHECK: call void @__asan_set_shadow_00(i64 %{{[0-9]+}}, i64 148) - ; CHECK: store i16 0, ptr %[[#]], align 1 - ; CHECK-NEXT: ret void } define void @zero_sized(i64 %a) #0 { -; CHECK-LABEL: define void @zero_sized(i64 %a) +; CHECK-LABEL: define void @zero_sized( +; CHECK-SAME: i64 [[A:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i64, align 8 +; CHECK-NEXT: [[B:%.*]] = alloca [0 x i8], align 1 +; CHECK-NEXT: store i64 [[A]], ptr [[A_ADDR]], align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 0, ptr [[B]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 0, ptr [[B]]) +; CHECK-NEXT: ret void +; entry: %a.addr = alloca i64, align 8 @@ -170,11 +503,13 @@ entry: store i64 %a, ptr %a.addr, align 8 call void @llvm.lifetime.start.p0(i64 0, ptr %b) #2 - ; CHECK: call void @llvm.lifetime.start call void @llvm.lifetime.end.p0(i64 0, ptr %b) #2 - ; CHECK: call void @llvm.lifetime.end ret void - ; CHECK-NEXT: ret void } +;. +; CHECK-DEFAULT: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. +; CHECK-NO-DYNAMIC: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. 
From 6c705d11365b74b8207dc92f5c94ee7eb682a11b Mon Sep 17 00:00:00 2001 From: Pengying Xu Date: Fri, 18 Jul 2025 23:01:16 +0800 Subject: [PATCH 342/813] [lld][elf] Skip BP ordering input sections with null data (#149265) --- lld/ELF/BPSectionOrderer.cpp | 6 +++--- lld/test/ELF/bp-section-orderer.s | 21 +++++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/lld/ELF/BPSectionOrderer.cpp b/lld/ELF/BPSectionOrderer.cpp index f464b1d4518a4..06152046d13d4 100644 --- a/lld/ELF/BPSectionOrderer.cpp +++ b/lld/ELF/BPSectionOrderer.cpp @@ -76,10 +76,10 @@ DenseMap elf::runBalancedPartitioning( if (!d) return; auto *sec = dyn_cast_or_null(d->section); - // Skip empty, discarded, ICF folded sections. Skipping ICF folded sections - // reduces duplicate detection work in BPSectionOrderer. + // Skip empty, discarded, ICF folded sections, .bss. Skipping ICF folded + // sections reduces duplicate detection work in BPSectionOrderer. if (!sec || sec->size == 0 || !sec->isLive() || sec->repl != sec || - !orderer.secToSym.try_emplace(sec, d).second) + !sec->content().data() || !orderer.secToSym.try_emplace(sec, d).second) return; rootSymbolToSectionIdxs[CachedHashStringRef( lld::utils::getRootSymbol(sym.getName()))] diff --git a/lld/test/ELF/bp-section-orderer.s b/lld/test/ELF/bp-section-orderer.s index 4df2e8d43022e..438d7c2da0f76 100644 --- a/lld/test/ELF/bp-section-orderer.s +++ b/lld/test/ELF/bp-section-orderer.s @@ -26,28 +26,28 @@ # RUN: ld.lld -o out.s a.o --irpgo-profile=a.profdata --bp-startup-sort=function # RUN: llvm-nm -jn out.s | tr '\n' , | FileCheck %s --check-prefix=STARTUP -# STARTUP: s5,s4,s3,s2,s1,A,B,C,F,E,D,merged1,merged2,_start,d4,d3,d2,d1,{{$}} +# STARTUP: s5,s4,s3,s2,s1,A,B,C,F,E,D,merged1,merged2,_start,d4,d3,d2,d1,g1,{{$}} # RUN: ld.lld -o out.os a.o --irpgo-profile=a.profdata --bp-startup-sort=function --symbol-ordering-file a.txt # RUN: llvm-nm -jn out.os | tr '\n' , | FileCheck %s --check-prefix=ORDER-STARTUP -# ORDER-STARTUP: 
s2,s1,s5,s4,s3,A,F,E,D,B,C,merged1,merged2,_start,d3,d2,d4,d1,{{$}} +# ORDER-STARTUP: s2,s1,s5,s4,s3,A,F,E,D,B,C,merged1,merged2,_start,d3,d2,d4,d1,g1,{{$}} # RUN: ld.lld -o out.cf a.o --verbose-bp-section-orderer --bp-compression-sort=function 2>&1 | FileCheck %s --check-prefix=BP-COMPRESSION-FUNC # RUN: ld.lld -o out.cf.icf a.o --verbose-bp-section-orderer --bp-compression-sort=function --icf=all --gc-sections 2>&1 | FileCheck %s --check-prefix=BP-COMPRESSION-ICF-FUNC # RUN: llvm-nm -jn out.cf | tr '\n' , | FileCheck %s --check-prefix=CFUNC -# CFUNC: s5,s4,s3,s2,s1,A,F,merged1,merged2,C,E,D,B,_start,d4,d3,d2,d1,{{$}} +# CFUNC: s5,s4,s3,s2,s1,A,F,merged1,merged2,C,E,D,B,_start,d4,d3,d2,d1,g1,{{$}} # RUN: ld.lld -o out.cd a.o --verbose-bp-section-orderer --bp-compression-sort=data 2>&1 | FileCheck %s --check-prefix=BP-COMPRESSION-DATA # RUN: llvm-nm -jn out.cd | tr '\n' , | FileCheck %s --check-prefix=CDATA -# CDATA: s5,s3,s4,s2,s1,F,C,E,D,B,A,merged1,merged2,_start,d4,d1,d3,d2,{{$}} +# CDATA: s5,s3,s4,s2,s1,F,C,E,D,B,A,merged1,merged2,_start,d4,d1,d3,d2,g1,{{$}} # RUN: ld.lld -o out.cb a.o --verbose-bp-section-orderer --bp-compression-sort=both 2>&1 | FileCheck %s --check-prefix=BP-COMPRESSION-BOTH # RUN: llvm-nm -jn out.cb | tr '\n' , | FileCheck %s --check-prefix=CBOTH -# CBOTH: s5,s3,s4,s2,s1,A,F,merged1,merged2,C,E,D,B,_start,d4,d1,d3,d2,{{$}} +# CBOTH: s5,s3,s4,s2,s1,A,F,merged1,merged2,C,E,D,B,_start,d4,d1,d3,d2,g1,{{$}} # RUN: ld.lld -o out.cbs a.o --verbose-bp-section-orderer --bp-compression-sort=both --irpgo-profile=a.profdata --bp-startup-sort=function 2>&1 | FileCheck %s --check-prefix=BP-COMPRESSION-BOTH # RUN: llvm-nm -jn out.cbs | tr '\n' , | FileCheck %s --check-prefix=CBOTH-STARTUP -# CBOTH-STARTUP: s5,s3,s4,s2,s1,A,B,C,F,E,D,merged1,merged2,_start,d4,d1,d3,d2,{{$}} +# CBOTH-STARTUP: s5,s3,s4,s2,s1,A,B,C,F,E,D,merged1,merged2,_start,d4,d1,d3,d2,g1,{{$}} # BP-COMPRESSION-FUNC: Ordered 9 sections ([[#]] bytes) using balanced partitioning # 
BP-COMPRESSION-ICF-FUNC: Ordered 8 sections ([[#]] bytes) using balanced partitioning @@ -108,6 +108,7 @@ d3 d2 #--- a.c +int g1; const char s5[] = "engineering"; const char s4[] = "computer program"; const char s3[] = "hardware engineer"; @@ -377,6 +378,14 @@ d1: .word 6 // 0x6 .size d1, 16 + .type g1,@object // @g1 + .section .bss.g1,"aw",@nobits + .globl g1 + .p2align 2, 0x0 +g1: + .word 0 // 0x0 + .size g1, 4 + .section ".note.GNU-stack","",@progbits .addrsig .addrsig_sym F From fd12e9aed889e4b546a2d5c4d3c0c10582fe9148 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Fri, 18 Jul 2025 08:03:19 -0700 Subject: [PATCH 343/813] [libc++][tests] Update XFAIL annotations for some tests on Windows (#149124) These tests still fail on Windows with clang-22, as reported in #70225. This started failing due to the version bump to Clang 22. --- libcxx/test/libcxx/fuzzing/random.pass.cpp | 7 ++++--- libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp | 7 ++++--- libcxx/test/std/numerics/c.math/cmath.pass.cpp | 7 ++++--- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/libcxx/test/libcxx/fuzzing/random.pass.cpp b/libcxx/test/libcxx/fuzzing/random.pass.cpp index cb074bd60fdc8..f0256a01f29ae 100644 --- a/libcxx/test/libcxx/fuzzing/random.pass.cpp +++ b/libcxx/test/libcxx/fuzzing/random.pass.cpp @@ -6,9 +6,10 @@ // //===----------------------------------------------------------------------===// -// This test fails because Clang no longer enables -fdelayed-template-parsing -// by default on Windows with C++20 (#69431). -// XFAIL: msvc && (clang-18 || clang-19 || clang-20 || clang-21) +// This doesn't work on Windows because in the MSVC UCRT headers the math.h is +// actually intended to implement the full C++ spec requirements. 
For details +// see https://github.com/llvm/llvm-project/issues/70225#issuecomment-1992528828 +// XFAIL: msvc // UNSUPPORTED: c++03, c++11 diff --git a/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp b/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp index 1ba0063c1dada..f9f81d22ff80e 100644 --- a/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp +++ b/libcxx/test/std/depr/depr.c.headers/math_h.pass.cpp @@ -6,9 +6,10 @@ // //===----------------------------------------------------------------------===// -// This test fails because Clang no longer enables -fdelayed-template-parsing -// by default on Windows with C++20 (#69431). -// XFAIL: msvc && (clang-18 || clang-19 || clang-20 || clang-21) +// This doesn't work on Windows because in the MSVC UCRT headers the math.h is +// actually intended to implement the full C++ spec requirements. For details +// see https://github.com/llvm/llvm-project/issues/70225#issuecomment-1992528828 +// XFAIL: msvc // diff --git a/libcxx/test/std/numerics/c.math/cmath.pass.cpp b/libcxx/test/std/numerics/c.math/cmath.pass.cpp index 48c2918802fc3..8d261e9fcbdb2 100644 --- a/libcxx/test/std/numerics/c.math/cmath.pass.cpp +++ b/libcxx/test/std/numerics/c.math/cmath.pass.cpp @@ -6,9 +6,10 @@ // //===----------------------------------------------------------------------===// -// This test fails because Clang no longer enables -fdelayed-template-parsing -// by default on Windows with C++20 (#69431). -// XFAIL: msvc && (clang-18 || clang-19 || clang-20 || clang-21) +// This doesn't work on Windows because in the MSVC UCRT headers the math.h is +// actually intended to implement the full C++ spec requirements. 
For details +// see https://github.com/llvm/llvm-project/issues/70225#issuecomment-1992528828 +// XFAIL: msvc // From fdce69a462101e1dce225014ee545858e363e4e2 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Jul 2025 08:03:53 -0700 Subject: [PATCH 344/813] [llvm] Improve grammar and punctuation of LLVM Coding Standards (#149463) --- llvm/docs/CodingStandards.rst | 46 +++++++++++++++++------------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst index c614a6d7ace9e..732227b98ab9e 100644 --- a/llvm/docs/CodingStandards.rst +++ b/llvm/docs/CodingStandards.rst @@ -30,7 +30,7 @@ because the naming and other conventions are dictated by the C++ standard. There are some conventions that are not uniformly followed in the code base (e.g. the naming convention). This is because they are relatively new, and a -lot of code was written before they were put in place. Our long term goal is +lot of code was written before they were put in place. Our long-term goal is for the entire codebase to follow the convention, but we explicitly *do not* want patches that do large-scale reformatting of existing code. On the other hand, it is reasonable to rename the methods of a class if you're about to @@ -50,7 +50,7 @@ code imported into the tree. Generally, our preference is for standards conforming, modern, and portable C++ code as the implementation language of choice. -For automation, build-systems and utility scripts Python is preferred and +For automation, build-systems, and utility scripts, Python is preferred and is widely used in the LLVM repository already. C++ Standard Versions @@ -92,7 +92,7 @@ LLVM support libraries (for example, `ADT `_) implement specialized data structures or functionality missing in the standard library. Such libraries are usually implemented in the ``llvm`` namespace and -follow the expected standard interface, when there is one. 
+follow the expected standard interface when there is one. When both C++ and the LLVM support libraries provide similar functionality, and there isn't a specific reason to favor the C++ implementation, it is generally @@ -325,8 +325,8 @@ implementation file. In any case, implementation files can include additional comments (not necessarily in Doxygen markup) to explain implementation details as needed. -Don't duplicate function or class name at the beginning of the comment. -For humans it is obvious which function or class is being documented; +Don't duplicate the function or class name at the beginning of the comment. +For humans, it is obvious which function or class is being documented; automatic documentation processing tools are smart enough to bind the comment to the correct declaration. @@ -369,7 +369,7 @@ lower-case letter, and finish the last sentence without a period, if it would end in one otherwise. Sentences which end with different punctuation, such as "did you forget ';'?", should still do so. -For example this is a good error message: +For example, this is a good error message: .. code-block:: none @@ -443,7 +443,7 @@ Write your code to fit within 80 columns. There must be some limit to the width of the code in order to allow developers to have multiple files side-by-side in windows on a modest display. If you are going to pick a width limit, it is -somewhat arbitrary but you might as well pick something standard. Going with 90 +somewhat arbitrary, but you might as well pick something standard. Going with 90 columns (for example) instead of 80 columns wouldn't add any significant value and would be detrimental to printing out code. 
Also many other projects have standardized on 80 columns, so some people have already configured their editors @@ -520,7 +520,7 @@ within each other and within function calls in order to build up aggregates The historically common formatting of braced initialization of aggregate variables does not mix cleanly with deep nesting, general expression contexts, function arguments, and lambdas. We suggest new code use a simple rule for -formatting braced initialization lists: act as-if the braces were parentheses +formatting braced initialization lists: act as if the braces were parentheses in a function call. The formatting rules exactly match those already well understood for formatting nested function calls. Examples: @@ -607,11 +607,11 @@ Static constructors and destructors (e.g., global variables whose types have a constructor or destructor) should not be added to the code base, and should be removed wherever possible. -Globals in different source files are initialized in `arbitrary order +Globals in different source files are initialized in an `arbitrary order `_, making the code more difficult to reason about. -Static constructors have negative impact on launch time of programs that use +Static constructors have a negative impact on the launch time of programs that use LLVM as a library. We would really like for there to be zero cost for linking in an additional LLVM target or other library into an application, but static constructors undermine this goal. @@ -698,7 +698,7 @@ If you use a braced initializer list when initializing a variable, use an equals Use ``auto`` Type Deduction to Make Code More Readable ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Some are advocating a policy of "almost always ``auto``" in C++11, however LLVM +Some are advocating a policy of "almost always ``auto``" in C++11; however, LLVM uses a more moderate stance. Use ``auto`` if and only if it makes the code more readable or easier to maintain. 
Don't "almost always" use ``auto``, but do use ``auto`` with initializers like ``cast(...)`` or other places where the @@ -783,14 +783,14 @@ guards, and might not include their prerequisites. Name such files with the In general, a header should be implemented by one or more ``.cpp`` files. Each of these ``.cpp`` files should include the header that defines their interface -first. This ensures that all of the dependences of the header have been +first. This ensures that all of the dependencies of the header have been properly added to the header itself, and are not implicit. System headers should be included after user headers for a translation unit. Library Layering ^^^^^^^^^^^^^^^^ -A directory of header files (for example ``include/llvm/Foo``) defines a +A directory of header files (for example, ``include/llvm/Foo``) defines a library (``Foo``). One library (both its headers and implementation) should only use things from the libraries listed in its dependencies. @@ -822,7 +822,7 @@ especially in header files. But wait! Sometimes you need to have the definition of a class to use it, or to inherit from it. In these cases go ahead and ``#include`` that header file. Be -aware however that there are many cases where you don't need to have the full +aware, however, that there are many cases where you don't need to have the full definition of a class. If you are using a pointer or reference to a class, you don't need the header file. If you are simply returning a class instance from a prototyped function or method, you don't need it. In fact, for most cases, you @@ -970,7 +970,7 @@ loops. A silly example is something like this: When you have very, very small loops, this sort of structure is fine. But if it exceeds more than 10-15 lines, it becomes difficult for people to read and understand at a glance. The problem with this sort of code is that it gets very -nested very quickly. Meaning that the reader of the code has to keep a lot of +nested very quickly. 
This means that the reader of the code has to keep a lot of context in their brain to remember what is going immediately on in the loop, because they don't know if/when the ``if`` conditions will have ``else``\s etc. It is strongly preferred to structure the loop like this: @@ -988,7 +988,7 @@ It is strongly preferred to structure the loop like this: ... } -This has all the benefits of using early exits for functions: it reduces nesting +This has all the benefits of using early exits for functions: it reduces the nesting of the loop, it makes it easier to describe why the conditions are true, and it makes it obvious to the reader that there is no ``else`` coming up that they have to push context into their brain for. If a loop is large, this can be a @@ -1149,12 +1149,12 @@ In general, names should be in camel case (e.g. ``TextFileReader`` and nouns and start with an upper-case letter (e.g. ``TextFileReader``). * **Variable names** should be nouns (as they represent state). The name should - be camel case, and start with an upper case letter (e.g. ``Leader`` or + be camel case, and start with an upper-case letter (e.g. ``Leader`` or ``Boats``). * **Function names** should be verb phrases (as they represent actions), and command-like function should be imperative. The name should be camel case, - and start with a lower case letter (e.g. ``openFile()`` or ``isFoo()``). + and start with a lower-case letter (e.g. ``openFile()`` or ``isFoo()``). * **Enum declarations** (e.g. ``enum Foo {...}``) are types, so they should follow the naming conventions for types. A common use for enums is as a @@ -1207,7 +1207,7 @@ Assert Liberally ^^^^^^^^^^^^^^^^ Use the "``assert``" macro to its fullest. Check all of your preconditions and -assumptions, you never know when a bug (not necessarily even yours) might be +assumptions. You never know when a bug (not necessarily even yours) might be caught early by an assertion, which reduces debugging time dramatically. 
The "````" header file is probably already included by the header files you are using, so it doesn't cost anything to use it. @@ -1302,7 +1302,7 @@ preferred to write the code like this: assert(NewToSet && "The value shouldn't be in the set yet"); In C code where ``[[maybe_unused]]`` is not supported, use ``void`` cast to -suppress unused variable warning as follows: +suppress an unused variable warning as follows: .. code-block:: c @@ -1546,7 +1546,7 @@ whenever possible. The semantics of postincrement include making a copy of the value being incremented, returning it, and then preincrementing the "work value". For primitive types, this isn't a big deal. But for iterators, it can be a huge -issue (for example, some iterators contains stack and set objects in them... +issue (for example, some iterators contain stack and set objects in them... copying an iterator could invoke the copy ctor's of these as well). In general, get in the habit of always using preincrement, and you won't have a problem. @@ -1663,7 +1663,7 @@ Don't Use Braces on Simple Single-Statement Bodies of if/else/loop Statements When writing the body of an ``if``, ``else``, or for/while loop statement, we prefer to omit the braces to avoid unnecessary line noise. However, braces -should be used in cases where the omission of braces harm the readability and +should be used in cases where the omission of braces harms the readability and maintainability of the code. We consider that readability is harmed when omitting the brace in the presence @@ -1763,7 +1763,7 @@ would help to avoid running into a "dangling else" situation. 
handleAttrOnDecl(D, A, i); } - // Use braces on the outer block because of a nested `if`; otherwise the + // Use braces on the outer block because of a nested `if`; otherwise, the // compiler would warn: `add explicit braces to avoid dangling else` if (auto *D = dyn_cast(D)) { if (shouldProcess(D)) From 151fffccf1340d8a2800664cbcaaa579ba772a4c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Jul 2025 08:05:12 -0700 Subject: [PATCH 345/813] [flang] Migrate away from ArrayRef(std::nullopt_t) (#149454) ArrayRef(std::nullopt_t) has been deprecated. This patch replaces std::nullopt with mlir::TypeRange{} or mlir::ValueRange{} as appropriate. --- flang/lib/Lower/Bridge.cpp | 15 ++++++----- flang/lib/Lower/ConvertConstant.cpp | 2 +- flang/lib/Lower/ConvertExpr.cpp | 26 ++++++++++--------- flang/lib/Lower/Runtime.cpp | 4 +-- flang/lib/Lower/VectorSubscripts.cpp | 2 +- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 2 +- .../Optimizer/Builder/Runtime/Intrinsics.cpp | 6 +++-- flang/lib/Optimizer/Builder/Runtime/Stop.cpp | 2 +- .../LowerHLFIROrderedAssignments.cpp | 4 +-- .../lib/Optimizer/Transforms/MemoryUtils.cpp | 2 +- .../Transforms/PolymorphicOpConversion.cpp | 2 +- .../Transforms/SimplifyFIROperations.cpp | 2 +- .../Optimizer/FortranVariableTest.cpp | 6 ++--- 13 files changed, 40 insertions(+), 35 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 4241d12601242..5f0783f869bf6 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -1466,8 +1466,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { assert(falseTarget && "missing conditional branch false block"); mlir::Location loc = toLocation(); mlir::Value bcc = builder->createConvert(loc, builder->getI1Type(), cond); - builder->create(loc, bcc, trueTarget, std::nullopt, - falseTarget, std::nullopt); + builder->create(loc, bcc, trueTarget, + mlir::ValueRange{}, falseTarget, + mlir::ValueRange{}); } void genConditionalBranch(mlir::Value cond, 
Fortran::lower::pft::Evaluation *trueTarget, @@ -2556,8 +2557,8 @@ class FirConverter : public Fortran::lower::AbstractConverter { builder->setInsertionPointToEnd(loopWrapperOp.getBody()); auto loopOp = builder->create( loc, nestLBs, nestUBs, nestSts, /*loopAnnotation=*/nullptr, - /*local_vars=*/std::nullopt, - /*local_syms=*/nullptr, /*reduce_vars=*/std::nullopt, + /*local_vars=*/mlir::ValueRange{}, + /*local_syms=*/nullptr, /*reduce_vars=*/mlir::ValueRange{}, /*reduce_byref=*/nullptr, /*reduce_syms=*/nullptr, /*reduce_attrs=*/nullptr); @@ -3810,9 +3811,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Block *selectCaseBlock = insertBlock(blockList[0]); mlir::Block *assumedSizeBlock = rankStarBlock ? rankStarBlock : defaultBlock; - builder->create(loc, isAssumedSize, - assumedSizeBlock, std::nullopt, - selectCaseBlock, std::nullopt); + builder->create( + loc, isAssumedSize, assumedSizeBlock, mlir::ValueRange{}, + selectCaseBlock, mlir::ValueRange{}); startBlock(selectCaseBlock); } // Create fir.select_case for the other rank cases. 
diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp index 1850b67898126..b8ab5d09e3e08 100644 --- a/flang/lib/Lower/ConvertConstant.cpp +++ b/flang/lib/Lower/ConvertConstant.cpp @@ -303,7 +303,7 @@ createStringLitOp(fir::FirOpBuilder &builder, mlir::Location loc, mlir::NamedAttribute sizeAttr(sizeTag, builder.getI64IntegerAttr(len)); llvm::SmallVector attrs = {dataAttr, sizeAttr}; return builder.create( - loc, llvm::ArrayRef{type}, std::nullopt, attrs); + loc, llvm::ArrayRef{type}, mlir::ValueRange{}, attrs); } } diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp index 0a1cd67789772..281ab229d1b6a 100644 --- a/flang/lib/Lower/ConvertExpr.cpp +++ b/flang/lib/Lower/ConvertExpr.cpp @@ -1003,9 +1003,9 @@ class ScalarExprLowering { }, [&](const fir::MutableBoxValue &toBox) { if (toBox.isPointer()) { - Fortran::lower::associateMutableBox(converter, loc, toBox, expr, - /*lbounds=*/std::nullopt, - stmtCtx); + Fortran::lower::associateMutableBox( + converter, loc, toBox, expr, + /*lbounds=*/mlir::ValueRange{}, stmtCtx); return; } // For allocatable components, a deep copy is needed. @@ -3604,8 +3604,9 @@ class ArrayExprLowering { mlir::Value castTo = builder.createConvert(loc, fir::HeapType::get(seqTy), load); mlir::Value shapeOp = builder.genShape(loc, shape); - return builder.create( - loc, seqTy, castTo, shapeOp, /*slice=*/mlir::Value{}, std::nullopt); + return builder.create(loc, seqTy, castTo, shapeOp, + /*slice=*/mlir::Value{}, + mlir::ValueRange{}); }; // Custom lowering of the element store to deal with the extra indirection // to the lazy allocated buffer. 
@@ -4207,7 +4208,7 @@ class ArrayExprLowering { auto addr = builder->create(loc, eleRefTy, tmp, shape, /*slice=*/mlir::Value{}, indices, - /*typeParams=*/std::nullopt); + /*typeParams=*/mlir::ValueRange{}); auto load = builder->create(loc, addr); return builder->createConvert(loc, i1Ty, load); }; @@ -4522,17 +4523,18 @@ class ArrayExprLowering { fir::isRecordWithAllocatableMember(eleTy)) TODO(loc, "creating an array temp where the element type has " "allocatable members"); - mlir::Value temp = !seqTy.hasDynamicExtents() - ? builder.create(loc, type) - : builder.create( - loc, type, ".array.expr", std::nullopt, shape); + mlir::Value temp = + !seqTy.hasDynamicExtents() + ? builder.create(loc, type) + : builder.create(loc, type, ".array.expr", + mlir::ValueRange{}, shape); fir::FirOpBuilder *bldr = &converter.getFirOpBuilder(); stmtCtx.attachCleanup( [bldr, loc, temp]() { bldr->create(loc, temp); }); mlir::Value shapeOp = genShapeOp(shape); return builder.create(loc, seqTy, temp, shapeOp, /*slice=*/mlir::Value{}, - std::nullopt); + mlir::ValueRange{}); } static fir::ShapeOp genShapeOp(mlir::Location loc, fir::FirOpBuilder &builder, @@ -6483,7 +6485,7 @@ class ArrayExprLowering { mlir::Value initBuffSz = builder.createIntegerConstant(loc, idxTy, clInitialBufferSize); mem = builder.create( - loc, eleTy, /*typeparams=*/std::nullopt, initBuffSz); + loc, eleTy, /*typeparams=*/mlir::ValueRange{}, initBuffSz); builder.create(loc, initBuffSz, buffSize); } } else { diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp index 2be5ef76e46b8..5f73335242336 100644 --- a/flang/lib/Lower/Runtime.cpp +++ b/flang/lib/Lower/Runtime.cpp @@ -134,7 +134,7 @@ void Fortran::lower::genFailImageStatement( mlir::Location loc = converter.getCurrentLocation(); mlir::func::FuncOp callee = fir::runtime::getRuntimeFunc(loc, builder); - builder.create(loc, callee, std::nullopt); + builder.create(loc, callee, mlir::ValueRange{}); genUnreachable(builder, loc); } @@ -199,7 +199,7 @@ 
void Fortran::lower::genPauseStatement( mlir::Location loc = converter.getCurrentLocation(); mlir::func::FuncOp callee = fir::runtime::getRuntimeFunc(loc, builder); - builder.create(loc, callee, std::nullopt); + builder.create(loc, callee, mlir::ValueRange{}); } void Fortran::lower::genPointerAssociate(fir::FirOpBuilder &builder, diff --git a/flang/lib/Lower/VectorSubscripts.cpp b/flang/lib/Lower/VectorSubscripts.cpp index 389a89ddcf102..c7b3e11728cea 100644 --- a/flang/lib/Lower/VectorSubscripts.cpp +++ b/flang/lib/Lower/VectorSubscripts.cpp @@ -122,7 +122,7 @@ class VectorSubscriptBoxBuilder { TODO(loc, "threading length parameters in field index op"); fir::FirOpBuilder &builder = converter.getFirOpBuilder(); componentPath.emplace_back(builder.create( - loc, fldTy, componentName, recTy, /*typeParams*/ std::nullopt)); + loc, fldTy, componentName, recTy, /*typeParams=*/mlir::ValueRange{})); return fir::unwrapSequenceType(recTy.getType(componentName)); } diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index acd5a88a2582d..5b1dbc4435d6c 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -620,7 +620,7 @@ fir::StringLitOp fir::FirOpBuilder::createStringLitOp(mlir::Location loc, mlir::NamedAttribute sizeAttr(sizeTag, getI64IntegerAttr(data.size())); llvm::SmallVector attrs{dataAttr, sizeAttr}; return create(loc, llvm::ArrayRef{type}, - std::nullopt, attrs); + mlir::ValueRange{}, attrs); } mlir::Value fir::FirOpBuilder::genShape(mlir::Location loc, diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp index 773d6408079cc..04703f7911176 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp @@ -59,7 +59,8 @@ mlir::Value fir::runtime::genCpuTime(fir::FirOpBuilder &builder, mlir::Location loc) { mlir::func::FuncOp func = 
fir::runtime::getRuntimeFunc(loc, builder); - return builder.create(loc, func, std::nullopt).getResult(0); + return builder.create(loc, func, mlir::ValueRange{}) + .getResult(0); } void fir::runtime::genDateAndTime(fir::FirOpBuilder &builder, @@ -280,7 +281,8 @@ void fir::runtime::genRename(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value fir::runtime::genTime(fir::FirOpBuilder &builder, mlir::Location loc) { auto func = fir::runtime::getRuntimeFunc(loc, builder); - return builder.create(loc, func, std::nullopt).getResult(0); + return builder.create(loc, func, mlir::ValueRange{}) + .getResult(0); } /// generate runtime call to transfer intrinsic with no size argument diff --git a/flang/lib/Optimizer/Builder/Runtime/Stop.cpp b/flang/lib/Optimizer/Builder/Runtime/Stop.cpp index 411181cc6dd1c..9b5e43b80b1f6 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Stop.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Stop.cpp @@ -25,7 +25,7 @@ void fir::runtime::genExit(fir::FirOpBuilder &builder, mlir::Location loc, void fir::runtime::genAbort(fir::FirOpBuilder &builder, mlir::Location loc) { mlir::func::FuncOp abortFunc = fir::runtime::getRuntimeFunc(loc, builder); - builder.create(loc, abortFunc, std::nullopt); + builder.create(loc, abortFunc, mlir::ValueRange{}); } void fir::runtime::genReportFatalUserError(fir::FirOpBuilder &builder, diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp index 03cc92e975b19..c5cf01ed98357 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIROrderedAssignments.cpp @@ -405,7 +405,7 @@ void OrderedAssignmentRewriter::pre(hlfir::ForallMaskOp forallMaskOp) { mlir::Location loc = forallMaskOp.getLoc(); mlir::Value mask = generateYieldedScalarValue(forallMaskOp.getMaskRegion(), builder.getI1Type()); - auto ifOp = builder.create(loc, std::nullopt, mask, 
false); + auto ifOp = builder.create(loc, mlir::TypeRange{}, mask, false); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); constructStack.push_back(ifOp); } @@ -530,7 +530,7 @@ void OrderedAssignmentRewriter::generateMaskIfOp(mlir::Value cdt) { mlir::Location loc = cdt.getLoc(); cdt = hlfir::loadTrivialScalar(loc, builder, hlfir::Entity{cdt}); cdt = builder.createConvert(loc, builder.getI1Type(), cdt); - auto ifOp = builder.create(cdt.getLoc(), std::nullopt, cdt, + auto ifOp = builder.create(cdt.getLoc(), mlir::TypeRange{}, cdt, /*withElseRegion=*/false); constructStack.push_back(ifOp.getOperation()); builder.setInsertionPointToStart(&ifOp.getThenRegion().front()); diff --git a/flang/lib/Optimizer/Transforms/MemoryUtils.cpp b/flang/lib/Optimizer/Transforms/MemoryUtils.cpp index 1f8edf851de9b..bc4fcd8b0112e 100644 --- a/flang/lib/Optimizer/Transforms/MemoryUtils.cpp +++ b/flang/lib/Optimizer/Transforms/MemoryUtils.cpp @@ -222,7 +222,7 @@ void AllocaReplaceImpl::genIndirectDeallocation( rewriter.create(loc, intPtrTy, ptrVal); mlir::Value isAllocated = rewriter.create( loc, mlir::arith::CmpIPredicate::ne, ptrToInt, c0); - auto ifOp = rewriter.create(loc, std::nullopt, isAllocated, + auto ifOp = rewriter.create(loc, mlir::TypeRange{}, isAllocated, /*withElseRegion=*/false); rewriter.setInsertionPointToStart(&ifOp.getThenRegion().front()); mlir::Value cast = fir::factory::createConvert( diff --git a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp index 57eae1ff052a2..6e45aae4246d0 100644 --- a/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/PolymorphicOpConversion.cpp @@ -456,7 +456,7 @@ llvm::LogicalResult SelectTypeConv::genTypeLadderStep( rewriter.setInsertionPointToEnd(thisBlock); if (destOps.has_value()) rewriter.create(loc, cmp, dest, destOps.value(), - newBlock, std::nullopt); + newBlock, mlir::ValueRange{}); else 
rewriter.create(loc, cmp, dest, newBlock); rewriter.setInsertionPointToEnd(newBlock); diff --git a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp index 506c8e66dbdfa..ad8464b495888 100644 --- a/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp +++ b/flang/lib/Optimizer/Transforms/SimplifyFIROperations.cpp @@ -261,7 +261,7 @@ class DoConcurrentConversion innermostUnorderdLoop = rewriter.create( doConcurentOp.getLoc(), lb, ub, st, /*unordred=*/true, /*finalCountValue=*/false, - /*iterArgs=*/std::nullopt, loop.getReduceVars(), + /*iterArgs=*/mlir::ValueRange{}, loop.getReduceVars(), loop.getReduceAttrsAttr()); ivArgs.push_back(innermostUnorderdLoop.getInductionVar()); rewriter.setInsertionPointToStart(innermostUnorderdLoop.getBody()); diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp index 98270adaa7c73..59808779aa6ef 100644 --- a/flang/unittests/Optimizer/FortranVariableTest.cpp +++ b/flang/unittests/Optimizer/FortranVariableTest.cpp @@ -48,7 +48,7 @@ TEST_F(FortranVariableTest, SimpleScalar) { mlir::Value addr = builder->create(loc, eleType); auto name = mlir::StringAttr::get(&context, "x"); auto declare = builder->create(loc, addr.getType(), addr, - /*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt, + /*shape=*/mlir::Value{}, /*typeParams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, /*data_attr=*/cuf::DataAttributeAttr{}); @@ -102,11 +102,11 @@ TEST_F(FortranVariableTest, SimpleArray) { extents.size(), fir::SequenceType::getUnknownExtent()); mlir::Type seqTy = fir::SequenceType::get(typeShape, eleType); mlir::Value addr = builder->create( - loc, seqTy, /*pinned=*/false, /*typeParams=*/std::nullopt, extents); + loc, seqTy, /*pinned=*/false, /*typeParams=*/mlir::ValueRange{}, extents); mlir::Value shape = createShape(extents); auto name = 
mlir::StringAttr::get(&context, "x"); auto declare = builder->create(loc, addr.getType(), addr, - shape, /*typeParams*/ std::nullopt, /*dummy_scope=*/nullptr, name, + shape, /*typeParams=*/mlir::ValueRange{}, /*dummy_scope=*/nullptr, name, /*fortran_attrs=*/fir::FortranVariableFlagsAttr{}, /*data_attr=*/cuf::DataAttributeAttr{}); From 724cfce5801829340b240ba62e82a7e7199e971d Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Fri, 18 Jul 2025 18:05:18 +0300 Subject: [PATCH 346/813] [Clang] Do not assume a perfect match is a better match than a non-template non-perfect match (#149504) This fixes a regression introduced by the "perfect match" overload resolution mechanism introduced in 8c5a307. [This does regress the performance noticeably (-0.7% for a stage 2 build)](https://llvm-compile-time-tracker.com/compare.php?from=42d2ae1034b287eb60563c370dbf52c59b66db20&to=82303bbc3e003c937ded498ac9f94f49a3fc3d90&stat=instructions:u), however, the original patch had a +4% performance impact, so we are only losing some of the gain, and this has the benefit of being correct and more robust. 
Fixes #147374 --- clang/include/clang/Sema/Overload.h | 2 - clang/lib/Sema/SemaOverload.cpp | 49 +++---------------- ...overload-resolution-deferred-templates.cpp | 28 +++++++++++ 3 files changed, 34 insertions(+), 45 deletions(-) diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index a70335bef9dd4..d34a4146ddbd6 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -1491,8 +1491,6 @@ class Sema; OverloadingResult BestViableFunctionImpl(Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best); - void PerfectViableFunction(Sema &S, SourceLocation Loc, - OverloadCandidateSet::iterator &Best); }; bool isBetterOverloadCandidate(Sema &S, const OverloadCandidate &Cand1, diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 1b54628c5e564..5dd5b495480d9 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -11354,55 +11354,18 @@ OverloadingResult OverloadCandidateSet::BestViableFunction(Sema &S, DeferredCandidatesCount != 0 && !ResolutionByPerfectCandidateIsDisabled; if (TwoPhaseResolution) { - - PerfectViableFunction(S, Loc, Best); - if (Best != end()) - return ResultForBestCandidate(Best); + OverloadingResult Res = BestViableFunctionImpl(S, Loc, Best); + if (Best != end() && Best->isPerfectMatch(S.Context)) { + if (!(HasDeferredTemplateConstructors && + isa_and_nonnull(Best->Function))) + return Res; + } } InjectNonDeducedTemplateCandidates(S); return BestViableFunctionImpl(S, Loc, Best); } -void OverloadCandidateSet::PerfectViableFunction( - Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best) { - - Best = end(); - for (auto It = Candidates.begin(); It != Candidates.end(); ++It) { - - if (!It->isPerfectMatch(S.getASTContext())) - continue; - - // We found a suitable conversion function - // but if there is a template constructor in the target class - // we might prefer that instead. 
- if (HasDeferredTemplateConstructors && - isa_and_nonnull(It->Function)) { - Best = end(); - break; - } - - if (Best == end()) { - Best = It; - continue; - } - if (Best->Function && It->Function) { - FunctionDecl *D = - S.getMoreConstrainedFunction(Best->Function, It->Function); - if (D == nullptr) { - Best = end(); - break; - } - if (D == It->Function) - Best = It; - continue; - } - // ambiguous - Best = end(); - break; - } -} - OverloadingResult OverloadCandidateSet::BestViableFunctionImpl( Sema &S, SourceLocation Loc, OverloadCandidateSet::iterator &Best) { diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp index 46c3670848529..135865c8450f5 100644 --- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp +++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp @@ -283,3 +283,31 @@ void f() { } #endif + +namespace GH147374 { + +struct String {}; +template void operator+(T, String &&) = delete; + +struct Bar { + void operator+(String) const; // expected-note {{candidate function}} + friend void operator+(Bar, String) {}; // expected-note {{candidate function}} +}; + +struct Baz { + void operator+(String); // expected-note {{candidate function}} + friend void operator+(Baz, String) {}; // expected-note {{candidate function}} +}; + +void test() { + Bar a; + String b; + a + b; + //expected-error@-1 {{use of overloaded operator '+' is ambiguous (with operand types 'Bar' and 'String')}} + + Baz z; + z + b; + //expected-error@-1 {{use of overloaded operator '+' is ambiguous (with operand types 'Baz' and 'String')}} +} + +} From 55305db90a3f329bdf7917d1c8bf36b318e33c72 Mon Sep 17 00:00:00 2001 From: RolandF77 Date: Fri, 18 Jul 2025 11:06:05 -0400 Subject: [PATCH 347/813] [PowerPC] Update maintainers (#149171) Update PowerPC BE maintainers. 
--- llvm/Maintainers.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/llvm/Maintainers.md b/llvm/Maintainers.md index de5f66ce1584c..87d2a9ac3bf88 100644 --- a/llvm/Maintainers.md +++ b/llvm/Maintainers.md @@ -263,6 +263,20 @@ amaclean@nvidia.com (email), [AlexMaclean](https://github.com/AlexMaclean) (GitH #### PowerPC backend +Amy Kwan (esp. release issues) \ +Amy.Kwan1@ibm.com (email), [amy-kwan](https://github.com/amy-kwan) (GitHub) \ +Lei Huang \ +lei@ca.ibm.com (email), [lei137](https://github.com/lei137) (GitHub) \ +Sean Fertile (esp. ABI/ELF/XCOFF) \ +sfertile@ca.ibm.com (email), [mandlebug](https://github.com/mandlebug) (GitHub) \ +Zhijian Lin \ +zhijian@ca.ibm.com (email), [diggerlin](https://github.com/diggerlin) (GitHub) \ +Maryam Moghadas \ +maryammo@ca.ibm.com (email), [maryammo](https://github.com/maryammo) (GitHub) \ +Roland Froese \ +froese@ca.ibm.com (email), [RolandF77](https://github.com/RolandF77) (GitHub) \ +llvmonpower \ +powerllvm@ca.ibm.com (email), [llvmonpower](https://github.com/llvmonpower) (GitHub) #### RISCV backend From 0e4069580413f3869e94ec1f0f84a085b639226e Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 18 Jul 2025 11:06:27 -0400 Subject: [PATCH 348/813] [libc++] Remove unused _LIBCPP_HAS_NO_STD_MODULES macro from __config_site (#148902) Since 1d6b6132f, that macro isn't used anywhere anymore. 
--- libcxx/include/__config_site.in | 1 - llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 - 2 files changed, 2 deletions(-) diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in index fc01aaf2d8746..b68c0c8258366 100644 --- a/libcxx/include/__config_site.in +++ b/libcxx/include/__config_site.in @@ -30,7 +30,6 @@ #cmakedefine01 _LIBCPP_HAS_LOCALIZATION #cmakedefine01 _LIBCPP_HAS_UNICODE #cmakedefine01 _LIBCPP_HAS_WIDE_CHARACTERS -#cmakedefine _LIBCPP_HAS_NO_STD_MODULES #cmakedefine01 _LIBCPP_HAS_TIME_ZONE_DATABASE #cmakedefine01 _LIBCPP_INSTRUMENTED_WITH_ASAN diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 82ec8121548c9..5309b5d095134 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -35,7 +35,6 @@ if (current_toolchain == default_toolchain) { "_LIBCPP_HAS_LOCALIZATION=1", "_LIBCPP_HAS_UNICODE=1", "_LIBCPP_HAS_WIDE_CHARACTERS=1", - "_LIBCPP_HAS_NO_STD_MODULES=", "_LIBCPP_HAS_TERMINAL=1", "_LIBCPP_INSTRUMENTED_WITH_ASAN=", "_LIBCPP_ABI_DEFINES=", From f73e163278fd6e50fc7855e52625ddf2e537c912 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Fri, 18 Jul 2025 08:10:20 -0700 Subject: [PATCH 349/813] [DAGCombiner] Fold [us]itofp of truncate (#149391) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 12 +++ llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 47 +++++++++++ llvm/test/CodeGen/NVPTX/trunc-tofp.ll | 81 +++++++++++++++++++ 3 files changed, 140 insertions(+) create mode 100644 llvm/test/CodeGen/NVPTX/trunc-tofp.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a3c6969e0daa0..b7e41427f2c27 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18727,6 +18727,12 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI)) return 
FTrunc; + // fold (sint_to_fp (trunc nsw x)) -> (sint_to_fp x) + if (N0.getOpcode() == ISD::TRUNCATE && N0->getFlags().hasNoSignedWrap() && + TLI.isTypeDesirableForOp(ISD::SINT_TO_FP, + N0.getOperand(0).getValueType())) + return DAG.getNode(ISD::SINT_TO_FP, DL, VT, N0.getOperand(0)); + return SDValue(); } @@ -18764,6 +18770,12 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI)) return FTrunc; + // fold (uint_to_fp (trunc nuw x)) -> (uint_to_fp x) + if (N0.getOpcode() == ISD::TRUNCATE && N0->getFlags().hasNoUnsignedWrap() && + TLI.isTypeDesirableForOp(ISD::UINT_TO_FP, + N0.getOperand(0).getValueType())) + return DAG.getNode(ISD::UINT_TO_FP, DL, VT, N0.getOperand(0)); + return SDValue(); } diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index cbc9f700b1f01..aba20e6b0f27f 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -2311,4 +2311,51 @@ entry: ret void } +define <4 x float> @test_uitofp_v4i8(<4 x i8> %a) { +; CHECK-LABEL: test_uitofp_v4i8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [test_uitofp_v4i8_param_0]; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x7773U; +; CHECK-NEXT: cvt.rn.f32.u32 %r3, %r2; +; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0x7772U; +; CHECK-NEXT: cvt.rn.f32.u32 %r5, %r4; +; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x7771U; +; CHECK-NEXT: cvt.rn.f32.u32 %r7, %r6; +; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x7770U; +; CHECK-NEXT: cvt.rn.f32.u32 %r9, %r8; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r9, %r7, %r5, %r3}; +; CHECK-NEXT: ret; + %r = uitofp <4 x i8> %a to <4 x float> + ret <4 x float> %r +} + +define <4 x float> @test_sitofp_v4i8(<4 x i8> %a) { +; CHECK-LABEL: test_sitofp_v4i8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<10>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.b32 %r1, [test_sitofp_v4i8_param_0]; +; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0xbbb3U; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: cvt.rn.f32.s16 %r3, %rs1; +; CHECK-NEXT: prmt.b32 %r4, %r1, 0, 0xaaa2U; +; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; +; CHECK-NEXT: cvt.rn.f32.s16 %r5, %rs2; +; CHECK-NEXT: prmt.b32 %r6, %r1, 0, 0x9991U; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r6; +; CHECK-NEXT: cvt.rn.f32.s16 %r7, %rs3; +; CHECK-NEXT: prmt.b32 %r8, %r1, 0, 0x8880U; +; CHECK-NEXT: cvt.u16.u32 %rs4, %r8; +; CHECK-NEXT: cvt.rn.f32.s16 %r9, %rs4; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r9, %r7, %r5, %r3}; +; CHECK-NEXT: ret; + %r = sitofp <4 x i8> %a to <4 x float> + ret <4 x float> %r +} + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/NVPTX/trunc-tofp.ll b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll new file mode 100644 index 0000000000000..404c423cc026a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll @@ -0,0 +1,81 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_80 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_80 | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +define float @uitofp_trunc_nuw(i32 %x, i32 %y) { +; CHECK-LABEL: uitofp_trunc_nuw( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [uitofp_trunc_nuw_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [uitofp_trunc_nuw_param_1]; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.rn.f32.u32 %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %v = add i32 %x, %y + %t = trunc nuw i32 %v to i16 + %f = uitofp i16 %t to float + ret float %f +} + +define float @sitofp_trunc_nsw(i32 %x, i32 %y) { +; CHECK-LABEL: sitofp_trunc_nsw( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [sitofp_trunc_nsw_param_0]; +; CHECK-NEXT: 
ld.param.b32 %r2, [sitofp_trunc_nsw_param_1]; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.rn.f32.s32 %r4, %r3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %v = add i32 %x, %y + %t = trunc nsw i32 %v to i16 + %f = sitofp i16 %t to float + ret float %f +} + +define float @uitofp_trunc_nsw(i32 %x, i32 %y) { +; CHECK-LABEL: uitofp_trunc_nsw( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [uitofp_trunc_nsw_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [uitofp_trunc_nsw_param_1]; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; +; CHECK-NEXT: cvt.rn.f32.u16 %r4, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %v = add i32 %x, %y + %t = trunc nsw i32 %v to i16 + %f = uitofp i16 %t to float + ret float %f +} + +define float @sitofp_trunc_nuw(i32 %x, i32 %y) { +; CHECK-LABEL: sitofp_trunc_nuw( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [sitofp_trunc_nuw_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [sitofp_trunc_nuw_param_1]; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; +; CHECK-NEXT: cvt.rn.f32.s16 %r4, %rs1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %v = add i32 %x, %y + %t = trunc nuw i32 %v to i16 + %f = sitofp i16 %t to float + ret float %f +} From e73cb43b44ddd7aeae7217aa1c9e7f8364a5e6df Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Fri, 18 Jul 2025 17:16:46 +0200 Subject: [PATCH 350/813] [mlir][xegpu] Remove unused custom pass declaration (#149278) Removes unused declaration for pass creation. Only the create function auto-generated from tablegen should be used. 
--- mlir/include/mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/mlir/include/mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h b/mlir/include/mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h index ac4915901fdec..ff99d7ce96daf 100644 --- a/mlir/include/mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h +++ b/mlir/include/mlir/Conversion/VectorToXeGPU/VectorToXeGPU.h @@ -21,9 +21,6 @@ class RewritePatternSet; /// Collect a set of patterns to convert from the vector to XeGPU ops. void populateVectorToXeGPUConversionPatterns(RewritePatternSet &patterns); -/// Create a pass to convert ops from vector to XeGPU. -std::unique_ptr createConvertVectorToXeGPUPass(); - } // namespace mlir #endif // MLIR_CONVERSION_VECTORTOXEGPU_VECTORTOXEGPU_H From ff5f3ae02aeac848dbb80ad9c652eae3ec107201 Mon Sep 17 00:00:00 2001 From: Kelvin Li Date: Fri, 18 Jul 2025 11:20:00 -0400 Subject: [PATCH 351/813] [flang] convert program name to upper case (NFC) (#149508) --- flang/test/Semantics/PowerPC/ppc-vector-types01.f90 | 2 +- flang/test/Semantics/PowerPC/ppc-vector-types02.f90 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/test/Semantics/PowerPC/ppc-vector-types01.f90 b/flang/test/Semantics/PowerPC/ppc-vector-types01.f90 index ad69b69a47f76..ea54a00fa4392 100644 --- a/flang/test/Semantics/PowerPC/ppc-vector-types01.f90 +++ b/flang/test/Semantics/PowerPC/ppc-vector-types01.f90 @@ -1,7 +1,7 @@ ! RUN: %flang_fc1 -fdebug-unparse %s | FileCheck %s ! REQUIRES: target=powerpc{{.*}} - ! CHECK-LABEL: PROGRAM ppc_vec_unit + ! CHECK-LABEL: PROGRAM PPC_VEC_UNIT program ppc_vec_unit implicit none ! CHECK: VECTOR(INTEGER(KIND=4_4)) :: vi1, vi2 diff --git a/flang/test/Semantics/PowerPC/ppc-vector-types02.f90 b/flang/test/Semantics/PowerPC/ppc-vector-types02.f90 index 8c96684c50eb7..175b58680a209 100644 --- a/flang/test/Semantics/PowerPC/ppc-vector-types02.f90 +++ b/flang/test/Semantics/PowerPC/ppc-vector-types02.f90 @@ -2,7 +2,7 @@ ! 
REQUIRES: target=powerpc{{.*}} ! C: MainProgram scope: ppc_vec_types -! CHECK-LABEL: MainProgram scope: ppc_vec_types size={{[0-9]*}} alignment={{[0-9]*}} +! CHECK-LABEL: MainProgram scope: PPC_VEC_TYPES size={{[0-9]*}} alignment={{[0-9]*}} program ppc_vec_types implicit none vector(integer(4)) :: vi From 5f001294b1d42a0b4146e0b08ccae72667de6a5d Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Fri, 18 Jul 2025 11:26:36 -0400 Subject: [PATCH 352/813] Remove last few bits for Native Client support (#148983) --- .../macros/properties/architectures.h | 2 +- libcxx/include/__config | 9 ------- libcxx/include/limits | 2 +- libcxx/src/random.cpp | 26 ------------------- .../numeric.limits.members/traps.pass.cpp | 3 +-- llvm/tools/llvm-readobj/ELFDumper.cpp | 2 +- 6 files changed, 4 insertions(+), 40 deletions(-) diff --git a/libc/src/__support/macros/properties/architectures.h b/libc/src/__support/macros/properties/architectures.h index c88956ff41148..ecc93196be286 100644 --- a/libc/src/__support/macros/properties/architectures.h +++ b/libc/src/__support/macros/properties/architectures.h @@ -21,7 +21,7 @@ #define LIBC_TARGET_ARCH_IS_GPU #endif -#if defined(__pnacl__) || defined(__CLR_VER) || defined(LIBC_TARGET_ARCH_IS_GPU) +#if defined(__CLR_VER) || defined(LIBC_TARGET_ARCH_IS_GPU) #define LIBC_TARGET_ARCH_IS_VM #endif diff --git a/libcxx/include/__config b/libcxx/include/__config index ee06abfba7a08..e4422298bf971 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -265,13 +265,6 @@ _LIBCPP_HARDENING_MODE_DEBUG // When this option is used, the token passed to `std::random_device`'s // constructor *must* be "/dev/urandom" -- anything else is an error. // -// _LIBCPP_USING_NACL_RANDOM -// NaCl's sandbox (which PNaCl also runs in) doesn't allow filesystem access, -// including accesses to the special files under `/dev`. This implementation -// uses the NaCL syscall `nacl_secure_random_init()` to get entropy. 
-// When this option is used, the token passed to `std::random_device`'s -// constructor *must* be "/dev/urandom" -- anything else is an error. -// // _LIBCPP_USING_WIN32_RANDOM // Use rand_s(), for use on Windows. // When this option is used, the token passed to `std::random_device`'s @@ -283,8 +276,6 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_USING_GETENTROPY # elif defined(__Fuchsia__) # define _LIBCPP_USING_FUCHSIA_CPRNG -# elif defined(__native_client__) -# define _LIBCPP_USING_NACL_RANDOM # elif defined(_LIBCPP_WIN32API) # define _LIBCPP_USING_WIN32_RANDOM # else diff --git a/libcxx/include/limits b/libcxx/include/limits index 1205e6a0c2781..e8581cf9c321d 100644 --- a/libcxx/include/limits +++ b/libcxx/include/limits @@ -219,7 +219,7 @@ protected: static _LIBCPP_CONSTEXPR const bool is_bounded = true; static _LIBCPP_CONSTEXPR const bool is_modulo = !std::is_signed<_Tp>::value; -# if defined(__i386__) || defined(__x86_64__) || defined(__pnacl__) || defined(__wasm__) +# if defined(__i386__) || defined(__x86_64__) || defined(__wasm__) static _LIBCPP_CONSTEXPR const bool traps = true; # else static _LIBCPP_CONSTEXPR const bool traps = false; diff --git a/libcxx/src/random.cpp b/libcxx/src/random.cpp index 5c6644811bfee..79815aadc7323 100644 --- a/libcxx/src/random.cpp +++ b/libcxx/src/random.cpp @@ -31,8 +31,6 @@ # include # include # endif -#elif defined(_LIBCPP_USING_NACL_RANDOM) -# include #elif defined(_LIBCPP_USING_FUCHSIA_CPRNG) # include #endif @@ -93,30 +91,6 @@ unsigned random_device::operator()() { return r; } -#elif defined(_LIBCPP_USING_NACL_RANDOM) - -random_device::random_device(const string& __token) { - if (__token != "/dev/urandom") - std::__throw_system_error(ENOENT, ("random device not supported " + __token).c_str()); - int error = nacl_secure_random_init(); - if (error) - std::__throw_system_error(error, ("random device failed to open " + __token).c_str()); -} - -random_device::~random_device() {} - -unsigned random_device::operator()() 
{ - unsigned r; - size_t n = sizeof(r); - size_t bytes_written; - int error = nacl_secure_random(&r, n, &bytes_written); - if (error != 0) - std::__throw_system_error(error, "random_device failed getting bytes"); - else if (bytes_written != n) - std::__throw_runtime_error("random_device failed to obtain enough bytes"); - return r; -} - #elif defined(_LIBCPP_USING_WIN32_RANDOM) random_device::random_device(const string& __token) { diff --git a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp index a9b1e44602bd2..66e149bf58d1b 100644 --- a/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/limits/numeric.limits.members/traps.pass.cpp @@ -14,8 +14,7 @@ #include "test_macros.h" -#if defined(__i386__) || defined(__x86_64__) || defined(__pnacl__) || \ - defined(__wasm__) +#if defined(__i386__) || defined(__x86_64__) || defined(__wasm__) static const bool integral_types_trap = true; #else static const bool integral_types_trap = false; diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 465c189680cae..ccc64fec12958 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -5511,7 +5511,7 @@ template static GNUAbiTag getGNUAbiTag(ArrayRef Desc) { return {"", "", /*IsValid=*/false}; static const char *OSNames[] = { - "Linux", "Hurd", "Solaris", "FreeBSD", "NetBSD", "Syllable", "NaCl", + "Linux", "Hurd", "Solaris", "FreeBSD", "NetBSD", "Syllable", }; StringRef OSName = "Unknown"; if (Words[0] < std::size(OSNames)) From 32f0fc597f92f98f1be81abbd07f5164377668ef Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 18 Jul 2025 16:27:46 +0100 Subject: [PATCH 353/813] [lldb] Correct spacing of = {...} when depth limit is hit (#149480) In some places it was 
printing "= {...}" and some "={...}" with no space. I think the space looks nicer so do that in both cases. --- lldb/source/DataFormatters/ValueObjectPrinter.cpp | 2 +- .../TestFrameVarDepthAndElemCount.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lldb/source/DataFormatters/ValueObjectPrinter.cpp b/lldb/source/DataFormatters/ValueObjectPrinter.cpp index 40493df8aec37..05fcc4db3b125 100644 --- a/lldb/source/DataFormatters/ValueObjectPrinter.cpp +++ b/lldb/source/DataFormatters/ValueObjectPrinter.cpp @@ -854,7 +854,7 @@ llvm::Error ValueObjectPrinter::PrintChildrenIfNeeded(bool value_printed, PrintChildren(value_printed, summary_printed, curr_ptr_depth); } else if (HasReachedMaximumDepth() && IsAggregate() && ShouldPrintValueObject()) { - m_stream->PutCString("{...}\n"); + m_stream->PutCString(" {...}\n"); // The maximum child depth has been reached. If `m_max_depth` is the default // (i.e. the user has _not_ customized it), then lldb presents a warning to // the user. 
The warning tells the user that the limit has been reached, but diff --git a/lldb/test/API/lang/cpp/frame-var-depth-and-elem-count/TestFrameVarDepthAndElemCount.py b/lldb/test/API/lang/cpp/frame-var-depth-and-elem-count/TestFrameVarDepthAndElemCount.py index 1dfd7df9ff1be..33648678da797 100644 --- a/lldb/test/API/lang/cpp/frame-var-depth-and-elem-count/TestFrameVarDepthAndElemCount.py +++ b/lldb/test/API/lang/cpp/frame-var-depth-and-elem-count/TestFrameVarDepthAndElemCount.py @@ -19,10 +19,10 @@ def test(self): self.expect( "frame var --depth 2 --element-count 5 -- c", substrs=[ - "[0] = {\n b ={...}\n }", - "[1] = {\n b ={...}\n }", - "[2] = {\n b ={...}\n }", - "[3] = {\n b ={...}\n }", - "[4] = {\n b ={...}\n }", + "[0] = {\n b = {...}\n }", + "[1] = {\n b = {...}\n }", + "[2] = {\n b = {...}\n }", + "[3] = {\n b = {...}\n }", + "[4] = {\n b = {...}\n }", ], ) From ac7ceb3dabfac548caa993e7b77bbadc78af4464 Mon Sep 17 00:00:00 2001 From: quic-areg Date: Fri, 18 Jul 2025 10:27:59 -0500 Subject: [PATCH 354/813] [Hexagon][llvm-objdump] Improve disassembly of Hexagon bundles (#145807) Hexagon instructions are VLIW "bundles" of up to four instruction words encoded as a single MCInst with operands for each sub-instruction. Previously, the disassembler's getInstruction() returned the full bundle, which made it difficult to work with llvm-objdump. For example, since all instructions are bundles, and bundles do not branch, branch targets could not be printed. This patch modifies the Hexagon disassembler to return individual sub-instructions instead of entire bundles, enabling correct printing of branch targets and relocations. It also introduces `MCDisassembler::getInstructionBundle` for cases where the full bundle is still needed. By default, llvm-objdump separates instructions with newlines. 
However, this does not work well for Hexagon syntax: { inst1 inst2 inst3 inst4 } :endloop0 Instructions may be followed by a closing brace, a closing brace with `:endloop`, or a newline. Branches must appear within the braces. To address this, `PrettyPrinter::getInstructionSeparator()` is added and overridden for Hexagon. --- lld/test/ELF/hexagon-plt.s | 18 +- lld/test/ELF/hexagon-shared.s | 2 +- lld/test/ELF/hexagon-tls-gd-xform.s | 4 +- .../llvm/MC/MCDisassembler/MCDisassembler.h | 12 ++ .../Disassembler/HexagonDisassembler.cpp | 109 ++++++++--- .../MCTargetDesc/HexagonInstPrinter.cpp | 34 ++-- .../MCTargetDesc/HexagonMCTargetDesc.cpp | 19 +- llvm/test/MC/Hexagon/two_ext.s | 4 +- .../ELF/Hexagon/hexagon-bundles.s | 47 +++++ llvm/tools/llvm-mc/Disassembler.cpp | 6 +- llvm/tools/llvm-objdump/llvm-objdump.cpp | 180 +++++++++++------- 11 files changed, 297 insertions(+), 138 deletions(-) create mode 100644 llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s diff --git a/lld/test/ELF/hexagon-plt.s b/lld/test/ELF/hexagon-plt.s index 679de82923a72..780dc434a6698 100644 --- a/lld/test/ELF/hexagon-plt.s +++ b/lld/test/ELF/hexagon-plt.s @@ -30,31 +30,31 @@ # DIS: <_start>: ## Direct call ## Call foo directly -# DIS-NEXT: { call 0x2003c } +# DIS-NEXT: { call 0x2003c } ## Call bar via plt -# DIS-NEXT: { call 0x20060 } +# DIS-NEXT: { call 0x20060 } ## Call weak via plt -# DIS-NEXT: { call 0x20070 } +# DIS-NEXT: { call 0x20070 } # DIS-NEXT: { immext(#0) ## Call foo directly -# DIS-NEXT: if (p0) jump:nt 0x2003c } +# DIS-NEXT: if (p0) jump:nt 0x2003c } # DIS-NEXT: { immext(#64) ## Call bar via plt -# DIS-NEXT: if (p0) jump:nt 0x20060 } +# DIS-NEXT: if (p0) jump:nt 0x20060 } # DIS-NEXT: { immext(#64) ## Call weak via plt -# DIS-NEXT: if (p0) jump:nt 0x20070 } +# DIS-NEXT: if (p0) jump:nt 0x20070 } # DIS-NEXT: { immext(#0) ## Call foo directly -# DIS-NEXT: r0 = #0 ; jump 0x2003c } +# DIS-NEXT: r0 = #0 ; jump 0x2003c } # DIS-NEXT: { immext(#0) ## Call bar via plt -# 
DIS-NEXT: r0 = #0 ; jump 0x20060 } +# DIS-NEXT: r0 = #0 ; jump 0x20060 } # DIS-NEXT: { immext(#0) ## Call weak via plt -# DIS-NEXT: r0 = #0 ; jump 0x20070 } +# DIS-NEXT: r0 = #0 ; jump 0x20070 } # DIS: : # DIS-NEXT: 2003c: diff --git a/lld/test/ELF/hexagon-shared.s b/lld/test/ELF/hexagon-shared.s index cc62662d278e2..7f7390f1fa8d8 100644 --- a/lld/test/ELF/hexagon-shared.s +++ b/lld/test/ELF/hexagon-shared.s @@ -88,7 +88,7 @@ pvar: # PLT-NEXT: jumpr r28 } # TEXT: bc 00 01 00 000100bc -# TEXT: { call 0x10300 } +# TEXT: { call 0x10300 } # TEXT: if (p0) jump:nt 0x10300 # TEXT: r0 = #0 ; jump 0x10300 # TEXT: r0 = add(r1,##-65548) diff --git a/lld/test/ELF/hexagon-tls-gd-xform.s b/lld/test/ELF/hexagon-tls-gd-xform.s index 65aeb118fcb33..ade54e8a16fad 100644 --- a/lld/test/ELF/hexagon-tls-gd-xform.s +++ b/lld/test/ELF/hexagon-tls-gd-xform.s @@ -18,10 +18,10 @@ _start: .ifdef GDPLT call x@gdplt -# CHECK_GDPLT: 101ec: { call 0x10220 } +# CHECK_GDPLT: 101ec: { call 0x10220 <__tls_get_addr@plt> } .else call x -# CHECK: 101b8: { call 0x101e0 } +# CHECK: 101b8: { call 0x101e0 } .endif # CHECK_GDPLT: 10220: { immext(#0x20040) diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h index 3a7ca1a69ab85..cae2fbcac1fef 100644 --- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h +++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h @@ -136,6 +136,18 @@ class LLVM_ABI MCDisassembler { ArrayRef Bytes, uint64_t Address, raw_ostream &CStream) const = 0; + /// Returns the disassembly of an instruction bundle for VLIW architectures + /// like Hexagon. + /// + /// \param Instr - An MCInst to populate with the contents of + /// the Bundle with sub-instructions encoded as Inst operands. 
+ virtual DecodeStatus getInstructionBundle(MCInst &Instr, uint64_t &Size, + ArrayRef Bytes, + uint64_t Address, + raw_ostream &CStream) const { + return Fail; + } + /// Used to perform separate target specific disassembly for a particular /// symbol. May parse any prelude that precedes instructions after the /// start of a symbol, or the entire symbol. diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 5bd31707acb6f..22cff7c80fa01 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -43,12 +43,12 @@ namespace { class HexagonDisassembler : public MCDisassembler { public: std::unique_ptr const MCII; - std::unique_ptr CurrentBundle; + mutable std::unique_ptr CurrentBundle; mutable MCInst const *CurrentExtender; HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, MCInstrInfo const *MCII) - : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *), + : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(nullptr), CurrentExtender(nullptr) {} DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB, @@ -57,7 +57,23 @@ class HexagonDisassembler : public MCDisassembler { DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size, ArrayRef Bytes, uint64_t Address, raw_ostream &CStream) const override; + + DecodeStatus getInstructionBundle(MCInst &Instr, uint64_t &Size, + ArrayRef Bytes, uint64_t Address, + raw_ostream &CStream) const override; + void remapInstruction(MCInst &Instr) const; + +private: + bool makeBundle(ArrayRef Bytes, uint64_t Address, + uint64_t &BytesToSkip, raw_ostream &CS) const; + + void resetBundle() const { + CurrentBundle.reset(); + CurrentInstruction = nullptr; + } + + mutable MCOperand *CurrentInstruction = nullptr; }; static uint64_t fullValue(HexagonDisassembler const &Disassembler, MCInst &MI, @@ -171,43 +187,88 @@ 
LLVMInitializeHexagonDisassembler() { createHexagonDisassembler); } -DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, - ArrayRef Bytes, - uint64_t Address, - raw_ostream &CS) const { - CommentStream = &CS; - - DecodeStatus Result = DecodeStatus::Success; +bool HexagonDisassembler::makeBundle(ArrayRef Bytes, uint64_t Address, + uint64_t &BytesToSkip, + raw_ostream &CS) const { bool Complete = false; - Size = 0; + DecodeStatus Result = DecodeStatus::Success; - *CurrentBundle = &MI; - MI.setOpcode(Hexagon::BUNDLE); - MI.addOperand(MCOperand::createImm(0)); + CurrentBundle.reset(new MCInst); + CurrentBundle->setOpcode(Hexagon::BUNDLE); + CurrentBundle->addOperand(MCOperand::createImm(0)); while (Result == Success && !Complete) { if (Bytes.size() < HEXAGON_INSTR_SIZE) - return MCDisassembler::Fail; + return false; MCInst *Inst = getContext().createMCInst(); - Result = getSingleInstruction(*Inst, MI, Bytes, Address, CS, Complete); - MI.addOperand(MCOperand::createInst(Inst)); - Size += HEXAGON_INSTR_SIZE; + Result = getSingleInstruction(*Inst, *CurrentBundle, Bytes, Address, CS, + Complete); + CurrentBundle->addOperand(MCOperand::createInst(Inst)); + BytesToSkip += HEXAGON_INSTR_SIZE; Bytes = Bytes.slice(HEXAGON_INSTR_SIZE); } if (Result == MCDisassembler::Fail) - return Result; - if (Size > HEXAGON_MAX_PACKET_SIZE) - return MCDisassembler::Fail; + return false; + if (BytesToSkip > HEXAGON_MAX_PACKET_SIZE) + return false; const auto ArchSTI = Hexagon_MC::getArchSubtarget(&STI); const auto STI_ = (ArchSTI != nullptr) ? 
*ArchSTI : STI; - HexagonMCChecker Checker(getContext(), *MCII, STI_, MI, + HexagonMCChecker Checker(getContext(), *MCII, STI_, *CurrentBundle, *getContext().getRegisterInfo(), false); if (!Checker.check()) - return MCDisassembler::Fail; - remapInstruction(MI); + return false; + remapInstruction(*CurrentBundle); + return true; +} + +DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size, + ArrayRef Bytes, + uint64_t Address, + raw_ostream &CS) const { + CommentStream = &CS; + + Size = 0; + uint64_t BytesToSkip = 0; + + if (!CurrentBundle) { + if (!makeBundle(Bytes, Address, BytesToSkip, CS)) { + Size = BytesToSkip; + resetBundle(); + return MCDisassembler::Fail; + } + CurrentInstruction = (CurrentBundle->begin() + 1); + } + + MI = *(CurrentInstruction->getInst()); + Size = HEXAGON_INSTR_SIZE; + if (++CurrentInstruction == CurrentBundle->end()) + resetBundle(); return MCDisassembler::Success; } +DecodeStatus HexagonDisassembler::getInstructionBundle(MCInst &MI, + uint64_t &Size, + ArrayRef Bytes, + uint64_t Address, + raw_ostream &CS) const { + CommentStream = &CS; + Size = 0; + uint64_t BytesToSkip = 0; + assert(!CurrentBundle); + + if (!makeBundle(Bytes, Address, BytesToSkip, CS)) { + Size = BytesToSkip; + resetBundle(); + return MCDisassembler::Fail; + } + + MI = *CurrentBundle; + Size = HEXAGON_INSTR_SIZE * HexagonMCInstrInfo::bundleSize(MI); + resetBundle(); + + return Success; +} + void HexagonDisassembler::remapInstruction(MCInst &Instr) const { for (auto I: HexagonMCInstrInfo::bundleInstructions(Instr)) { auto &MI = const_cast(*I.getInst()); @@ -482,7 +543,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, unsigned Offset = 1; bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI); bool PrevVector = false; - auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle); + auto Instructions = HexagonMCInstrInfo::bundleInstructions(*CurrentBundle); auto i = Instructions.end() - 1; for (auto n 
= Instructions.begin() - 1;; --i, ++Offset) { if (i == n) diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp index 9030e43b7149f..f83e06cd3d930 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp @@ -33,30 +33,18 @@ void HexagonInstPrinter::printRegName(raw_ostream &O, MCRegister Reg) { void HexagonInstPrinter::printInst(const MCInst *MI, uint64_t Address, StringRef Annot, const MCSubtargetInfo &STI, raw_ostream &OS) { - assert(HexagonMCInstrInfo::isBundle(*MI)); - assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE); - assert(HexagonMCInstrInfo::bundleSize(*MI) > 0); - HasExtender = false; - for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) { - MCInst const &MCI = *I.getInst(); - if (HexagonMCInstrInfo::isDuplex(MII, MCI)) { - printInstruction(MCI.getOperand(1).getInst(), Address, OS); - OS << '\v'; - HasExtender = false; - printInstruction(MCI.getOperand(0).getInst(), Address, OS); - } else - printInstruction(&MCI, Address, OS); - HasExtender = HexagonMCInstrInfo::isImmext(MCI); - OS << "\n"; - } - - bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(*MI); - bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(*MI); - if (IsLoop0) { - OS << (IsLoop1 ? 
" :endloop01" : " :endloop0"); - } else if (IsLoop1) { - OS << " :endloop1"; + if (HexagonMCInstrInfo::isDuplex(MII, *MI)) { + printInstruction(MI->getOperand(1).getInst(), Address, OS); + OS << '\v'; + HasExtender = false; + printInstruction(MI->getOperand(0).getInst(), Address, OS); + } else { + printInstruction(MI, Address, OS); } + HasExtender = HexagonMCInstrInfo::isImmext(*MI); + if ((MI->getOpcode() & HexagonII::INST_PARSE_MASK) == + HexagonII::INST_PARSE_PACKET_END) + HasExtender = false; } void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo, diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp index 980df819b2c26..bfea50e2d6dc0 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp @@ -252,8 +252,21 @@ class HexagonTargetAsmStreamer : public HexagonTargetStreamer { std::string Buffer; { raw_string_ostream TempStream(Buffer); - InstPrinter.printInst(&Inst, Address, "", STI, TempStream); + for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) { + InstPrinter.printInst(I.getInst(), Address, "", STI, TempStream); + TempStream << "\n"; + } + } + + std::string LoopString = ""; + bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(Inst); + bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(Inst); + if (IsLoop0) { + LoopString += (IsLoop1 ? 
" :endloop01" : " :endloop0"); + } else if (IsLoop1) { + LoopString += " :endloop1"; } + StringRef Contents(Buffer); auto PacketBundle = Contents.rsplit('\n'); auto HeadTail = PacketBundle.first.split('\n'); @@ -275,9 +288,9 @@ class HexagonTargetAsmStreamer : public HexagonTargetStreamer { } if (HexagonMCInstrInfo::isMemReorderDisabled(Inst)) - OS << "\n\t} :mem_noshuf" << PacketBundle.second; + OS << "\n\t} :mem_noshuf" << LoopString; else - OS << "\t}" << PacketBundle.second; + OS << "\t}" << LoopString; } void finish() override { finishAttributeSection(); } diff --git a/llvm/test/MC/Hexagon/two_ext.s b/llvm/test/MC/Hexagon/two_ext.s index 28b2aa3f1ecae..09b51c5f029a7 100644 --- a/llvm/test/MC/Hexagon/two_ext.s +++ b/llvm/test/MC/Hexagon/two_ext.s @@ -6,7 +6,7 @@ if (!p1) call foo_b } # CHECK: 00004000 { immext(#0) -# CHECK: 5d004100 if (p1) call 0x0 +# CHECK: 5d004100 if (p1) call 0x0 <.text> # CHECK: 00004000 immext(#0) -# CHECK: 5d20c100 if (!p1) call 0x0 } +# CHECK: 5d20c100 if (!p1) call 0x0 <.text> } diff --git a/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s b/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s new file mode 100644 index 0000000000000..6a4927e4af2a4 --- /dev/null +++ b/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s @@ -0,0 +1,47 @@ +/// Checks that various hexagon scenarios are handled correctly: +/// - branch targets +/// - endloops +/// - inline-relocs +/// - multi-insn bundles + +{ + r6 = sub(r1, r0) + r7 = and(r4, #0x0) + if (p1) jump:t target1 + if (p2) jump:nt target2 +} + +{ + r8 = r7 + r9 = add(r8, #0) + r10 = memw(r9) +} :endloop0 + +{ jump ##sym } + +target1: + nop + +target2: + nop + +// RUN: llvm-mc %s --triple=hexagon -filetype=obj | llvm-objdump -d -r - | FileCheck %s + +// CHECK: 00000000 <.text>: +// CHECK-NEXT: 0: 12 51 00 5c 5c005112 { if (p1) jump:t 0x24 +// CHECK-NEXT: 4: 14 42 00 5c 5c004214 if (p2) jump:nt 0x28 +// CHECK-NEXT: 8: 06 41 20 f3 f3204106 r6 = sub(r1,r0) +// CHECK-NEXT: c: 
07 c0 04 76 7604c007 r7 = and(r4,#0x0) } +// CHECK-NEXT: 10: 08 80 67 70 70678008 { r8 = r7 +// CHECK-NEXT: 14: 09 40 08 b0 b0084009 r9 = add(r8,#0x0) +// CHECK-NEXT: 18: 0a c0 89 91 9189c00a r10 = memw(r9+#0x0) } :endloop0 +// CHECK-NEXT: 1c: 00 40 00 00 00004000 { immext(#0x0) +// CHECK-NEXT: 0000001c: R_HEX_B32_PCREL_X sym +// CHECK-NEXT: 20: 00 c0 00 58 5800c000 jump 0x1c <.text+0x1c> } +// CHECK-NEXT: 00000020: R_HEX_B22_PCREL_X sym+0x4 +// CHECK-EMPTY: +// CHECK-NEXT: 00000024 : +// CHECK-NEXT: 24: 00 c0 00 7f 7f00c000 { nop } +// CHECK-EMPTY: +// CHECK-NEXT: 00000028 : +// CHECK-NEXT: 28: 00 c0 00 7f 7f00c000 { nop } diff --git a/llvm/tools/llvm-mc/Disassembler.cpp b/llvm/tools/llvm-mc/Disassembler.cpp index 607184e3b7247..86727931067a5 100644 --- a/llvm/tools/llvm-mc/Disassembler.cpp +++ b/llvm/tools/llvm-mc/Disassembler.cpp @@ -45,7 +45,11 @@ static bool PrintInsts(const MCDisassembler &DisAsm, const ByteArrayTy &Bytes, MCInst Inst; MCDisassembler::DecodeStatus S; - S = DisAsm.getInstruction(Inst, Size, Data.slice(Index), Index, nulls()); + if (STI.getTargetTriple().getArch() == Triple::hexagon) + S = DisAsm.getInstructionBundle(Inst, Size, Data.slice(Index), Index, + nulls()); + else + S = DisAsm.getInstruction(Inst, Size, Data.slice(Index), Index, nulls()); switch (S) { case MCDisassembler::Fail: SM.PrintMessage(SMLoc::getFromPointer(Bytes.second[Index]), diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index c5967cd090eec..74eb9033c8e2c 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -693,6 +693,30 @@ class PrettyPrinter { } else OS << "\t"; } + + virtual void emitPostInstructionInfo(formatted_raw_ostream &FOS, + const MCAsmInfo &MAI, + const MCSubtargetInfo &STI, + StringRef Comments, + LiveVariablePrinter &LVP) { + do { + if (!Comments.empty()) { + // Emit a line of comments. 
+ StringRef Comment; + std::tie(Comment, Comments) = Comments.split('\n'); + // MAI.getCommentColumn() assumes that instructions are printed at the + // position of 8, while getInstStartColumn() returns the actual + // position. + unsigned CommentColumn = + MAI.getCommentColumn() - 8 + getInstStartColumn(STI); + FOS.PadToColumn(CommentColumn); + FOS << MAI.getCommentString() << ' ' << Comment; + } + LVP.printAfterInst(FOS); + FOS << "\n"; + } while (!Comments.empty()); + FOS.flush(); + } }; PrettyPrinter PrettyPrinterInst; @@ -714,6 +738,35 @@ class HexagonPrettyPrinter : public PrettyPrinter { } } } + + std::string getInstructionSeparator() const { + SmallString<40> Separator; + raw_svector_ostream OS(Separator); + if (ShouldClosePacket) { + OS << " }"; + if (IsLoop0 || IsLoop1) + OS << " "; + if (IsLoop0) + OS << (IsLoop1 ? ":endloop01" : ":endloop0"); + else if (IsLoop1) + OS << ":endloop1"; + } + OS << '\n'; + return OS.str().str(); + } + + void emitPostInstructionInfo(formatted_raw_ostream &FOS, const MCAsmInfo &MAI, + const MCSubtargetInfo &STI, StringRef Comments, + LiveVariablePrinter &LVP) override { + // Hexagon does not write anything to the comment stream, so we can just + // print the separator. + LVP.printAfterInst(FOS); + FOS << getInstructionSeparator(); + FOS.flush(); + if (ShouldClosePacket) + reset(); + } + void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef Bytes, object::SectionedAddress Address, formatted_raw_ostream &OS, StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP, @@ -724,60 +777,64 @@ class HexagonPrettyPrinter : public PrettyPrinter { if (!MI) { printLead(Bytes, Address.Address, OS); OS << " "; + reset(); return; } - std::string Buffer; + + StringRef Preamble = IsStartOfBundle ? 
" { " : " "; + + if (SP && (PrintSource || PrintLines)) + SP->printSourceLine(OS, Address, ObjectFilename, LVP, ""); + printLead(Bytes, Address.Address, OS); + OS << Preamble; + std::string Buf; { - raw_string_ostream TempStream(Buffer); + raw_string_ostream TempStream(Buf); IP.printInst(MI, Address.Address, "", STI, TempStream); } - StringRef Contents(Buffer); - // Split off bundle attributes - auto PacketBundle = Contents.rsplit('\n'); - // Split off first instruction from the rest - auto HeadTail = PacketBundle.first.split('\n'); - auto Preamble = " { "; - auto Separator = ""; - - // Hexagon's packets require relocations to be inline rather than - // clustered at the end of the packet. - std::vector::const_iterator RelCur = Rels->begin(); - std::vector::const_iterator RelEnd = Rels->end(); - auto PrintReloc = [&]() -> void { - while ((RelCur != RelEnd) && (RelCur->getOffset() <= Address.Address)) { - if (RelCur->getOffset() == Address.Address) { - printRelocation(OS, ObjectFilename, *RelCur, Address.Address, false); - return; - } - ++RelCur; - } - }; + StringRef Contents(Buf); + + auto Duplex = Contents.split('\v'); + bool HasDuplex = !Duplex.second.empty(); + if (HasDuplex) { + OS << Duplex.first; + OS << "; "; + OS << Duplex.second; + } else { + OS << Duplex.first; + } - while (!HeadTail.first.empty()) { - OS << Separator; - Separator = "\n"; - if (SP && (PrintSource || PrintLines)) - SP->printSourceLine(OS, Address, ObjectFilename, LVP, ""); - printLead(Bytes, Address.Address, OS); - OS << Preamble; - Preamble = " "; - StringRef Inst; - auto Duplex = HeadTail.first.split('\v'); - if (!Duplex.second.empty()) { - OS << Duplex.first; - OS << "; "; - Inst = Duplex.second; - } + uint32_t Instruction = support::endian::read32le(Bytes.data()); + + uint32_t ParseMask = 0x0000c000; + uint32_t PacketEndMask = 0x0000c000; + uint32_t LoopEndMask = 0x00008000; + uint32_t ParseBits = Instruction & ParseMask; + + if (ParseBits == LoopEndMask) { + if (IsStartOfBundle) + 
IsLoop0 = true; else - Inst = HeadTail.first; - OS << Inst; - HeadTail = HeadTail.second.split('\n'); - if (HeadTail.first.empty()) - OS << " } " << PacketBundle.second; - PrintReloc(); - Bytes = Bytes.slice(4); - Address.Address += 4; + IsLoop1 = true; } + + IsStartOfBundle = false; + + if (ParseBits == PacketEndMask || HasDuplex) + ShouldClosePacket = true; + } + +private: + bool IsStartOfBundle = true; + bool IsLoop0 = false; + bool IsLoop1 = false; + bool ShouldClosePacket = false; + + void reset() { + IsStartOfBundle = true; + IsLoop0 = false; + IsLoop1 = false; + ShouldClosePacket = false; } }; HexagonPrettyPrinter HexagonPrettyPrinterInst; @@ -1610,29 +1667,6 @@ static StringRef getSegmentName(const MachOObjectFile *MachO, return ""; } -static void emitPostInstructionInfo(formatted_raw_ostream &FOS, - const MCAsmInfo &MAI, - const MCSubtargetInfo &STI, - StringRef Comments, - LiveVariablePrinter &LVP) { - do { - if (!Comments.empty()) { - // Emit a line of comments. - StringRef Comment; - std::tie(Comment, Comments) = Comments.split('\n'); - // MAI.getCommentColumn() assumes that instructions are printed at the - // position of 8, while getInstStartColumn() returns the actual position. 
- unsigned CommentColumn = - MAI.getCommentColumn() - 8 + getInstStartColumn(STI); - FOS.PadToColumn(CommentColumn); - FOS << MAI.getCommentString() << ' ' << Comment; - } - LVP.printAfterInst(FOS); - FOS << '\n'; - } while (!Comments.empty()); - FOS.flush(); -} - static void createFakeELFSections(ObjectFile &Obj) { assert(Obj.isELF()); if (auto *Elf32LEObj = dyn_cast(&Obj)) @@ -2526,15 +2560,15 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, } assert(DT->Context->getAsmInfo()); - emitPostInstructionInfo(FOS, *DT->Context->getAsmInfo(), - *DT->SubtargetInfo, CommentStream.str(), LVP); + DT->Printer->emitPostInstructionInfo(FOS, *DT->Context->getAsmInfo(), + *DT->SubtargetInfo, + CommentStream.str(), LVP); Comments.clear(); if (BTF) printBTFRelocation(FOS, *BTF, {Index, Section.getIndex()}, LVP); - // Hexagon handles relocs in pretty printer - if (InlineRelocs && Obj.getArch() != Triple::hexagon) { + if (InlineRelocs) { while (findRel()) { // When --adjust-vma is used, update the address printed. printRelocation(FOS, Obj.getFileName(), *RelCur, From a676ecd83fad9b04d315c4e667742d25679cbc9f Mon Sep 17 00:00:00 2001 From: lntue Date: Fri, 18 Jul 2025 11:28:22 -0400 Subject: [PATCH 355/813] [libc][math] Add POSIX math constants to math.h header. 
(#149150) https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/math.h.html --- libc/include/llvm-libc-macros/math-macros.h | 101 ++++++++++++++++++++ libc/test/include/CMakeLists.txt | 15 +++ libc/test/include/math_constants_test.c | 23 +++++ 3 files changed, 139 insertions(+) create mode 100644 libc/test/include/math_constants_test.c diff --git a/libc/include/llvm-libc-macros/math-macros.h b/libc/include/llvm-libc-macros/math-macros.h index 2f05d7544666e..6697ce5b03851 100644 --- a/libc/include/llvm-libc-macros/math-macros.h +++ b/libc/include/llvm-libc-macros/math-macros.h @@ -50,4 +50,105 @@ #define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT) #endif +// POSIX math constants +// https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/math.h.html +#define M_E (__extension__ 0x1.5bf0a8b145769p1) +#define M_EGAMMA (__extension__ 0x1.2788cfc6fb619p-1) +#define M_LOG2E (__extension__ 0x1.71547652b82fep0) +#define M_LOG10E (__extension__ 0x1.bcb7b1526e50ep-2) +#define M_LN2 (__extension__ 0x1.62e42fefa39efp-1) +#define M_LN10 (__extension__ 0x1.26bb1bbb55516p1) +#define M_PHI (__extension__ 0x1.9e3779b97f4a8p0) +#define M_PI (__extension__ 0x1.921fb54442d18p1) +#define M_PI_2 (__extension__ 0x1.921fb54442d18p0) +#define M_PI_4 (__extension__ 0x1.921fb54442d18p-1) +#define M_1_PI (__extension__ 0x1.45f306dc9c883p-2) +#define M_1_SQRTPI (__extension__ 0x1.20dd750429b6dp-1) +#define M_2_PI (__extension__ 0x1.45f306dc9c883p-1) +#define M_2_SQRTPI (__extension__ 0x1.20dd750429b6dp0) +#define M_SQRT2 (__extension__ 0x1.6a09e667f3bcdp0) +#define M_SQRT3 (__extension__ 0x1.bb67ae8584caap0) +#define M_SQRT1_2 (__extension__ 0x1.6a09e667f3bcdp-1) +#define M_SQRT1_3 (__extension__ 0x1.279a74590331cp-1) + +#define M_Ef (__extension__ 0x1.5bf0a8p1f) +#define M_EGAMMAf (__extension__ 0x1.2788dp-1f) +#define M_LOG2Ef (__extension__ 0x1.715476p0f) +#define M_LOG10Ef (__extension__ 0x1.bcb7b2p-2f) +#define M_LN2f (__extension__ 0x1.62e43p-1f) +#define M_LN10f (__extension__ 
0x1.26bb1cp1f) +#define M_PHIf (__extension__ 0x1.9e377ap0f) +#define M_PIf (__extension__ 0x1.921fb6p1f) +#define M_PI_2f (__extension__ 0x1.921fb6p0f) +#define M_PI_4f (__extension__ 0x1.921fb6p-1f) +#define M_1_PIf (__extension__ 0x1.45f306p-2f) +#define M_1_SQRTPIf (__extension__ 0x1.20dd76p-1f) +#define M_2_PIf (__extension__ 0x1.45f306p-1f) +#define M_2_SQRTPIf (__extension__ 0x1.20dd76p0f) +#define M_SQRT2f (__extension__ 0x1.6a09e6p0f) +#define M_SQRT3f (__extension__ 0x1.bb67aep0f) +#define M_SQRT1_2f (__extension__ 0x1.6a09e6p-1f) +#define M_SQRT1_3f (__extension__ 0x1.279a74p-1f) + +#define M_El (__extension__ 0x1.5bf0a8b1457695355fb8ac404e7ap1L) +#define M_EGAMMAl (__extension__ 0x1.2788cfc6fb618f49a37c7f0202a6p-1L) +#define M_LOG2El (__extension__ 0x1.71547652b82fe1777d0ffda0d23ap0L) +#define M_LOG10El (__extension__ 0x1.bcb7b1526e50e32a6ab7555f5a68p-2L) +#define M_LN2l (__extension__ 0x1.62e42fefa39ef35793c7673007e6p-1L) +#define M_LN10l (__extension__ 0x1.26bb1bbb5551582dd4adac5705a6p1L) +#define M_PHIl (__extension__ 0x1.9e3779b97f4a7c15f39cc0605ceep0L) +#define M_PIl (__extension__ 0x1.921fb54442d18469898cc51701b8p1L) +#define M_PI_2l (__extension__ 0x1.921fb54442d18469898cc51701b8p0L) +#define M_PI_4l (__extension__ 0x1.921fb54442d18469898cc51701b8p-1L) +#define M_1_PIl (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-2L) +#define M_1_SQRTPIl (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep-1L) +#define M_2_PIl (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-1L) +#define M_2_SQRTPIl (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep0L) +#define M_SQRT2l (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p0L) +#define M_SQRT3l (__extension__ 0x1.bb67ae8584caa73b25742d7078b8p0L) +#define M_SQRT1_2l (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p-1L) +#define M_SQRT1_3l (__extension__ 0x1.279a74590331c4d218f81e4afb25p-1L) + +#ifdef __FLT16_MANT_DIG__ +#define M_Ef16 (__extension__ 0x1.5cp1f16) +#define M_EGAMMAf16 (__extension__ 0x1.278p-1f16) +#define 
M_LOG2Ef16 (__extension__ 0x1.714f16) +#define M_LOG10Ef16 (__extension__ 0x1.bccp-2f16) +#define M_LN2f16 (__extension__ 0x1.63p-1f16) +#define M_LN10f16 (__extension__ 0x1.26cp1f16) +#define M_PHIf16 (__extension__ 0x1.9e4p0f16) +#define M_PIf16 (__extension__ 0x1.92p1f16) +#define M_PI_2f16 (__extension__ 0x1.92p0f16) +#define M_PI_4f16 (__extension__ 0x1.92p-1f16) +#define M_1_PIf16 (__extension__ 0x1.46p-2f16) +#define M_1_SQRTPIf16 (__extension__ 0x1.20cp-1f16) +#define M_2_PIf16 (__extension__ 0x1.46p-1f16) +#define M_2_SQRTPIf16 (__extension__ 0x1.20cp0f16) +#define M_SQRT2f16 (__extension__ 0x1.6ap0f16) +#define M_SQRT3f16 (__extension__ 0x1.bb8p0f16) +#define M_SQRT1_2f16 (__extension__ 0x1.6ap-1f16) +#define M_SQRT1_3f16 (__extension__ 0x1.278p-1f16) +#endif // __FLT16_MANT_DIG__ + +#ifdef __SIZEOF_FLOAT128__ +#define M_Ef128 (__extension__ 0x1.5bf0a8b1457695355fb8ac404e7ap1q) +#define M_EGAMMAf128 (__extension__ 0x1.2788cfc6fb618f49a37c7f0202a6p-1q) +#define M_LOG2Ef128 (__extension__ 0x1.71547652b82fe1777d0ffda0d23ap0q) +#define M_LOG10Ef128 (__extension__ 0x1.bcb7b1526e50e32a6ab7555f5a68p-2q) +#define M_LN2f128 (__extension__ 0x1.62e42fefa39ef35793c7673007e6p-1q) +#define M_LN10f128 (__extension__ 0x1.26bb1bbb5551582dd4adac5705a6p1q) +#define M_PHIf128 (__extension__ 0x1.9e3779b97f4a7c15f39cc0605ceep0q) +#define M_PIf128 (__extension__ 0x1.921fb54442d18469898cc51701b8p1q) +#define M_PI_2f128 (__extension__ 0x1.921fb54442d18469898cc51701b8p0q) +#define M_PI_4f128 (__extension__ 0x1.921fb54442d18469898cc51701b8p-1q) +#define M_1_PIf128 (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-2q) +#define M_1_SQRTPIf128 (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep-1q) +#define M_2_PIf128 (__extension__ 0x1.45f306dc9c882a53f84eafa3ea6ap-1q) +#define M_2_SQRTPIf128 (__extension__ 0x1.20dd750429b6d11ae3a914fed7fep0q) +#define M_SQRT2f128 (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p0q) +#define M_SQRT3f128 (__extension__ 0x1.bb67ae8584caa73b25742d7078b8p0q) 
+#define M_SQRT1_2f128 (__extension__ 0x1.6a09e667f3bcc908b2fb1366ea95p-1q) +#define M_SQRT1_3f128 (__extension__ 0x1.279a74590331c4d218f81e4afb25p-1q) +#endif // __SIZEOF_FLOAT128__ + #endif // LLVM_LIBC_MACROS_MATH_MACROS_H diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt index 24935cec048ba..11e4c3a84157f 100644 --- a/libc/test/include/CMakeLists.txt +++ b/libc/test/include/CMakeLists.txt @@ -484,6 +484,21 @@ add_libc_test( libc.include.llvm-libc-macros.math_function_macros ) +add_libc_test( + math_constants_c_test + C_TEST + UNIT_TEST_ONLY + SUITE + libc_include_tests + SRCS + math_constants_test.c + COMPILE_OPTIONS + -Wall + -Werror + DEPENDS + libc.include.llvm-libc-macros.math_macros +) + # Test `#include <...>` of each header in each available language mode. # This is gated on -DLLVM_LIBC_BUILD_HEADER_TESTS=ON until all the bugs # in headers are fixed so the tests all compile. diff --git a/libc/test/include/math_constants_test.c b/libc/test/include/math_constants_test.c new file mode 100644 index 0000000000000..eb497a9d8a50a --- /dev/null +++ b/libc/test/include/math_constants_test.c @@ -0,0 +1,23 @@ +//===-- Unittests for math constants --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "include/llvm-libc-macros/math-macros.h" + +#define IS_DOUBLE(X) _Generic((X), double: 1, default: 0) + +#define IS_FLOAT(X) _Generic((X), float: 1, default: 0) + +// check if macro is defined +#ifndef M_PI +#error "M_PI macro is not defined" +#else +int main(void) { + _Static_assert(IS_DOUBLE(M_PI), "M_PI is not of double type."); + _Static_assert(IS_FLOAT(M_PIf), "M_PIf is not of float type."); + return 0; +} +#endif From d737fe2c91391a41a5b5ee8e3062d78a01936c61 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Fri, 18 Jul 2025 11:54:27 -0400 Subject: [PATCH 356/813] [libc++][NFC] Fix typos in the libc++ 21 release notes (#149536) --- libcxx/docs/ReleaseNotes/21.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst index 6f18b61284f49..d31ca0130cb80 100644 --- a/libcxx/docs/ReleaseNotes/21.rst +++ b/libcxx/docs/ReleaseNotes/21.rst @@ -10,7 +10,7 @@ Written by the `Libc++ Team `_ .. warning:: - These are in-progress notes for the upcoming libc++ 20.0.0 release. + These are in-progress notes for the upcoming libc++ 21.0.0 release. Release notes for previous releases can be found on `the Download Page `_. @@ -18,7 +18,7 @@ Introduction ============ This document contains the release notes for the libc++ C++ Standard Library, -part of the LLVM Compiler Infrastructure, release 20.0.0. Here we describe the +part of the LLVM Compiler Infrastructure, release 21.0.0. Here we describe the status of libc++ in some detail, including major improvements from the previous release and new feature work. For the general LLVM release notes, see `the LLVM documentation `_. 
All LLVM releases may From 148fd6ed0a21aaa540ad443b8108456b191dd485 Mon Sep 17 00:00:00 2001 From: Annu Singh Date: Fri, 18 Jul 2025 21:30:44 +0530 Subject: [PATCH 357/813] [DAG] Adding abdu/abds to canCreateUndefOrPoison (#149017) Fixes #147695 - [Alive2 test - freeze abdu](https://alive2.llvm.org/ce/z/aafeJs) - [Alive 2 test - freeze abds](https://alive2.llvm.org/ce/z/XrSmP4) --------- Co-authored-by: Simon Pilgrim --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 ++ llvm/test/CodeGen/AArch64/freeze.ll | 34 +++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 5453828177c72..245811587e3b4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5544,6 +5544,8 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::USUBSAT: case ISD::MULHU: case ISD::MULHS: + case ISD::ABDU: + case ISD::ABDS: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll index 0c56e1b66e81f..d428b6aa483a7 100644 --- a/llvm/test/CodeGen/AArch64/freeze.ll +++ b/llvm/test/CodeGen/AArch64/freeze.ll @@ -395,3 +395,37 @@ define i64 @freeze_array() { %t1 = add i64 %v1, %v2 ret i64 %t1 } + +define <8 x i16> @freeze_abdu(<8 x i16> %a, <8 x i16> %b) { +; CHECK-SD-LABEL: freeze_abdu: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uaba v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: freeze_abdu: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: uabd v1.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ret + %d = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> %a, <8 x i16> %b) + %f = freeze <8 x i16> %d + %r = add <8 x i16> %a, %f + ret <8 x i16> %r +} + +define <8 x i16> @freeze_abds(<8 x i16> %a, <8 x i16> %b) { +; CHECK-SD-LABEL: freeze_abds: +; CHECK-SD: // %bb.0: +; 
CHECK-SD-NEXT: saba v0.8h, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: freeze_abds: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sabd v1.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: ret + %d = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %a, <8 x i16> %b) + %f = freeze <8 x i16> %d + %r = add <8 x i16> %a, %f + ret <8 x i16> %r +} From c244c3b2d95a1605337b1156fad412ee2c9cd8c9 Mon Sep 17 00:00:00 2001 From: Jonathan Peyton Date: Fri, 18 Jul 2025 11:03:12 -0500 Subject: [PATCH 358/813] [OpenMP] [NFC] Remove dead code: building task stack (#143589) This code hasn't been enabled since the first code changes were introduced. Remove the dead code. --- openmp/runtime/src/kmp.h | 30 ---- openmp/runtime/src/kmp_tasking.cpp | 252 +---------------------------- 2 files changed, 1 insertion(+), 281 deletions(-) diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index f62cabee6ea84..307dc625a67e9 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -34,15 +34,6 @@ #define TASK_CURRENT_NOT_QUEUED 0 #define TASK_CURRENT_QUEUED 1 -#ifdef BUILD_TIED_TASK_STACK -#define TASK_STACK_EMPTY 0 // entries when the stack is empty -#define TASK_STACK_BLOCK_BITS 5 // Used in TASK_STACK_SIZE and TASK_STACK_MASK -// Number of entries in each task stack array -#define TASK_STACK_BLOCK_SIZE (1 << TASK_STACK_BLOCK_BITS) -// Mask for determining index into stack block -#define TASK_STACK_INDEX_MASK (TASK_STACK_BLOCK_SIZE - 1) -#endif // BUILD_TIED_TASK_STACK - #define TASK_NOT_PUSHED 1 #define TASK_SUCCESSFULLY_PUSHED 0 #define TASK_TIED 1 @@ -2704,23 +2695,6 @@ extern std::atomic __kmp_tdg_task_id; extern kmp_int32 __kmp_num_tdg; #endif -#ifdef BUILD_TIED_TASK_STACK - -/* Tied Task stack definitions */ -typedef struct kmp_stack_block { - kmp_taskdata_t *sb_block[TASK_STACK_BLOCK_SIZE]; - struct kmp_stack_block *sb_next; - struct kmp_stack_block *sb_prev; -} kmp_stack_block_t; - -typedef struct kmp_task_stack { - 
kmp_stack_block_t ts_first_block; // first block of stack entries - kmp_taskdata_t **ts_top; // pointer to the top of stack - kmp_int32 ts_entries; // number of entries on the stack -} kmp_task_stack_t; - -#endif // BUILD_TIED_TASK_STACK - typedef struct kmp_tasking_flags { /* Total struct must be exactly 32 bits */ #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) /* Same fields as in the #else branch, but in reverse order */ @@ -2863,10 +2837,6 @@ typedef struct kmp_base_thread_data { kmp_int32 td_deque_ntasks; // Number of tasks in deque // GEH: shouldn't this be volatile since used in while-spin? kmp_int32 td_deque_last_stolen; // Thread number of last successful steal -#ifdef BUILD_TIED_TASK_STACK - kmp_task_stack_t td_susp_tied_tasks; // Stack of suspended tied tasks for task -// scheduling constraint -#endif // BUILD_TIED_TASK_STACK } kmp_base_thread_data_t; #define TASK_DEQUE_BITS 8 // Used solely to define INITIAL_TASK_DEQUE_SIZE diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index d7bc4922d54f7..e4d92a78fd6b9 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -42,221 +42,6 @@ static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id); int __kmp_taskloop_task(int gtid, void *ptask); #endif -#ifdef BUILD_TIED_TASK_STACK - -// __kmp_trace_task_stack: print the tied tasks from the task stack in order -// from top do bottom -// -// gtid: global thread identifier for thread containing stack -// thread_data: thread data for task team thread containing stack -// threshold: value above which the trace statement triggers -// location: string identifying call site of this function (for trace) -static void __kmp_trace_task_stack(kmp_int32 gtid, - kmp_thread_data_t *thread_data, - int threshold, char *location) { - kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; - kmp_taskdata_t **stack_top = task_stack->ts_top; - kmp_int32 entries = 
task_stack->ts_entries; - kmp_taskdata_t *tied_task; - - KA_TRACE( - threshold, - ("__kmp_trace_task_stack(start): location = %s, gtid = %d, entries = %d, " - "first_block = %p, stack_top = %p \n", - location, gtid, entries, task_stack->ts_first_block, stack_top)); - - KMP_DEBUG_ASSERT(stack_top != NULL); - KMP_DEBUG_ASSERT(entries > 0); - - while (entries != 0) { - KMP_DEBUG_ASSERT(stack_top != &task_stack->ts_first_block.sb_block[0]); - // fix up ts_top if we need to pop from previous block - if (entries & TASK_STACK_INDEX_MASK == 0) { - kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(stack_top); - - stack_block = stack_block->sb_prev; - stack_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; - } - - // finish bookkeeping - stack_top--; - entries--; - - tied_task = *stack_top; - - KMP_DEBUG_ASSERT(tied_task != NULL); - KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); - - KA_TRACE(threshold, - ("__kmp_trace_task_stack(%s): gtid=%d, entry=%d, " - "stack_top=%p, tied_task=%p\n", - location, gtid, entries, stack_top, tied_task)); - } - KMP_DEBUG_ASSERT(stack_top == &task_stack->ts_first_block.sb_block[0]); - - KA_TRACE(threshold, - ("__kmp_trace_task_stack(exit): location = %s, gtid = %d\n", - location, gtid)); -} - -// __kmp_init_task_stack: initialize the task stack for the first time -// after a thread_data structure is created. -// It should not be necessary to do this again (assuming the stack works). 
-// -// gtid: global thread identifier of calling thread -// thread_data: thread data for task team thread containing stack -static void __kmp_init_task_stack(kmp_int32 gtid, - kmp_thread_data_t *thread_data) { - kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; - kmp_stack_block_t *first_block; - - // set up the first block of the stack - first_block = &task_stack->ts_first_block; - task_stack->ts_top = (kmp_taskdata_t **)first_block; - memset((void *)first_block, '\0', - TASK_STACK_BLOCK_SIZE * sizeof(kmp_taskdata_t *)); - - // initialize the stack to be empty - task_stack->ts_entries = TASK_STACK_EMPTY; - first_block->sb_next = NULL; - first_block->sb_prev = NULL; -} - -// __kmp_free_task_stack: free the task stack when thread_data is destroyed. -// -// gtid: global thread identifier for calling thread -// thread_data: thread info for thread containing stack -static void __kmp_free_task_stack(kmp_int32 gtid, - kmp_thread_data_t *thread_data) { - kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; - kmp_stack_block_t *stack_block = &task_stack->ts_first_block; - - KMP_DEBUG_ASSERT(task_stack->ts_entries == TASK_STACK_EMPTY); - // free from the second block of the stack - while (stack_block != NULL) { - kmp_stack_block_t *next_block = (stack_block) ? stack_block->sb_next : NULL; - - stack_block->sb_next = NULL; - stack_block->sb_prev = NULL; - if (stack_block != &task_stack->ts_first_block) { - __kmp_thread_free(thread, - stack_block); // free the block, if not the first - } - stack_block = next_block; - } - // initialize the stack to be empty - task_stack->ts_entries = 0; - task_stack->ts_top = NULL; -} - -// __kmp_push_task_stack: Push the tied task onto the task stack. -// Grow the stack if necessary by allocating another block. 
-// -// gtid: global thread identifier for calling thread -// thread: thread info for thread containing stack -// tied_task: the task to push on the stack -static void __kmp_push_task_stack(kmp_int32 gtid, kmp_info_t *thread, - kmp_taskdata_t *tied_task) { - // GEH - need to consider what to do if tt_threads_data not allocated yet - kmp_thread_data_t *thread_data = - &thread->th.th_task_team->tt.tt_threads_data[__kmp_tid_from_gtid(gtid)]; - kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; - - if (tied_task->td_flags.team_serial || tied_task->td_flags.tasking_ser) { - return; // Don't push anything on stack if team or team tasks are serialized - } - - KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); - KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); - - KA_TRACE(20, - ("__kmp_push_task_stack(enter): GTID: %d; THREAD: %p; TASK: %p\n", - gtid, thread, tied_task)); - // Store entry - *(task_stack->ts_top) = tied_task; - - // Do bookkeeping for next push - task_stack->ts_top++; - task_stack->ts_entries++; - - if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { - // Find beginning of this task block - kmp_stack_block_t *stack_block = - (kmp_stack_block_t *)(task_stack->ts_top - TASK_STACK_BLOCK_SIZE); - - // Check if we already have a block - if (stack_block->sb_next != - NULL) { // reset ts_top to beginning of next block - task_stack->ts_top = &stack_block->sb_next->sb_block[0]; - } else { // Alloc new block and link it up - kmp_stack_block_t *new_block = (kmp_stack_block_t *)__kmp_thread_calloc( - thread, sizeof(kmp_stack_block_t)); - - task_stack->ts_top = &new_block->sb_block[0]; - stack_block->sb_next = new_block; - new_block->sb_prev = stack_block; - new_block->sb_next = NULL; - - KA_TRACE( - 30, - ("__kmp_push_task_stack(): GTID: %d; TASK: %p; Alloc new block: %p\n", - gtid, tied_task, new_block)); - } - } - KA_TRACE(20, ("__kmp_push_task_stack(exit): GTID: %d; TASK: %p\n", gtid, - tied_task)); -} - -// __kmp_pop_task_stack: Pop 
the tied task from the task stack. Don't return -// the task, just check to make sure it matches the ending task passed in. -// -// gtid: global thread identifier for the calling thread -// thread: thread info structure containing stack -// tied_task: the task popped off the stack -// ending_task: the task that is ending (should match popped task) -static void __kmp_pop_task_stack(kmp_int32 gtid, kmp_info_t *thread, - kmp_taskdata_t *ending_task) { - // GEH - need to consider what to do if tt_threads_data not allocated yet - kmp_thread_data_t *thread_data = - &thread->th.th_task_team->tt_threads_data[__kmp_tid_from_gtid(gtid)]; - kmp_task_stack_t *task_stack = &thread_data->td.td_susp_tied_tasks; - kmp_taskdata_t *tied_task; - - if (ending_task->td_flags.team_serial || ending_task->td_flags.tasking_ser) { - // Don't pop anything from stack if team or team tasks are serialized - return; - } - - KMP_DEBUG_ASSERT(task_stack->ts_top != NULL); - KMP_DEBUG_ASSERT(task_stack->ts_entries > 0); - - KA_TRACE(20, ("__kmp_pop_task_stack(enter): GTID: %d; THREAD: %p\n", gtid, - thread)); - - // fix up ts_top if we need to pop from previous block - if (task_stack->ts_entries & TASK_STACK_INDEX_MASK == 0) { - kmp_stack_block_t *stack_block = (kmp_stack_block_t *)(task_stack->ts_top); - - stack_block = stack_block->sb_prev; - task_stack->ts_top = &stack_block->sb_block[TASK_STACK_BLOCK_SIZE]; - } - - // finish bookkeeping - task_stack->ts_top--; - task_stack->ts_entries--; - - tied_task = *(task_stack->ts_top); - - KMP_DEBUG_ASSERT(tied_task != NULL); - KMP_DEBUG_ASSERT(tied_task->td_flags.tasktype == TASK_TIED); - KMP_DEBUG_ASSERT(tied_task == ending_task); // If we built the stack correctly - - KA_TRACE(20, ("__kmp_pop_task_stack(exit): GTID: %d; TASK: %p\n", gtid, - tied_task)); - return; -} -#endif /* BUILD_TIED_TASK_STACK */ - // returns 1 if new task is allowed to execute, 0 otherwise // checks Task Scheduling constraint (if requested) and // mutexinoutset dependencies if 
any @@ -683,13 +468,6 @@ static void __kmp_task_start(kmp_int32 gtid, kmp_task_t *task, // KMP_DEBUG_ASSERT( current_task -> td_flags.executing == 1 ); current_task->td_flags.executing = 0; -// Add task to stack if tied -#ifdef BUILD_TIED_TASK_STACK - if (taskdata->td_flags.tiedness == TASK_TIED) { - __kmp_push_task_stack(gtid, thread, taskdata); - } -#endif /* BUILD_TIED_TASK_STACK */ - // mark starting task as executing and as current task thread->th.th_current_task = taskdata; @@ -1041,13 +819,6 @@ static void __kmp_task_finish(kmp_int32 gtid, kmp_task_t *task, is_taskgraph = taskdata->is_taskgraph; #endif -// Pop task from stack if tied -#ifdef BUILD_TIED_TASK_STACK - if (taskdata->td_flags.tiedness == TASK_TIED) { - __kmp_pop_task_stack(gtid, thread, taskdata); - } -#endif /* BUILD_TIED_TASK_STACK */ - if (UNLIKELY(taskdata->td_flags.tiedness == TASK_UNTIED)) { // untied task needs to check the counter so that the task structure is not // freed prematurely @@ -3786,13 +3557,6 @@ static void __kmp_free_task_deque(kmp_thread_data_t *thread_data) { thread_data->td.td_deque = NULL; __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock); } - -#ifdef BUILD_TIED_TASK_STACK - // GEH: Figure out what to do here for td_susp_tied_tasks - if (thread_data->td.td_susp_tied_tasks.ts_entries != TASK_STACK_EMPTY) { - __kmp_free_task_stack(__kmp_thread_from_gtid(gtid), thread_data); - } -#endif // BUILD_TIED_TASK_STACK } // __kmp_realloc_task_threads_data: @@ -3849,14 +3613,7 @@ static int __kmp_realloc_task_threads_data(kmp_info_t *thread, KMP_MEMCPY_S((void *)new_data, nthreads * sizeof(kmp_thread_data_t), (void *)old_data, maxthreads * sizeof(kmp_thread_data_t)); -#ifdef BUILD_TIED_TASK_STACK - // GEH: Figure out if this is the right thing to do - for (i = maxthreads; i < nthreads; i++) { - kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; - __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); - } -#endif // BUILD_TIED_TASK_STACK - // Install the 
new data and free the old data + // Install the new data and free the old data (*threads_data_p) = new_data; __kmp_free(old_data); } else { @@ -3868,13 +3625,6 @@ static int __kmp_realloc_task_threads_data(kmp_info_t *thread, // kmp_reap_task_team( ). *threads_data_p = (kmp_thread_data_t *)__kmp_allocate( nthreads * sizeof(kmp_thread_data_t)); -#ifdef BUILD_TIED_TASK_STACK - // GEH: Figure out if this is the right thing to do - for (i = 0; i < nthreads; i++) { - kmp_thread_data_t *thread_data = &(*threads_data_p)[i]; - __kmp_init_task_stack(__kmp_gtid_from_thread(thread), thread_data); - } -#endif // BUILD_TIED_TASK_STACK } task_team->tt.tt_max_threads = nthreads; } else { From a9147e64aa751caaa106953fded2d0f7223bb167 Mon Sep 17 00:00:00 2001 From: Peter Rong Date: Fri, 18 Jul 2025 09:11:59 -0700 Subject: [PATCH 359/813] =?UTF-8?q?Revert=20"[DWARFLinker]=20Use=20differe?= =?UTF-8?q?nt=20addresses=20to=20distinguish=20invalid=20=E2=80=A6=20(#149?= =?UTF-8?q?422)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …DW_AT_LLVM_stmt_sequence offset (#149376)" This reverts commit b0c6148584854af3d7ed2425034c3b5252f6b769. --- llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index 559d808a72f98..222dc88098102 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -43,12 +43,6 @@ namespace llvm { using namespace dwarf_linker; using namespace dwarf_linker::classic; -enum InvalidStmtSeqOffset { - MaxStmtSeqOffset = UINT64_MAX, - OrigOffsetMissing = MaxStmtSeqOffset - 1, - NewOffsetMissing = MaxStmtSeqOffset - 2, -}; - /// Hold the input and output of the debug info size in bytes. 
struct DebugInfoSize { uint64_t Input; @@ -2321,7 +2315,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { // Some sequences are discarded by the DWARFLinker if they are invalid // (empty). if (OrigRowIter == SeqOffToOrigRow.end()) { - StmtSeq.set(OrigOffsetMissing); + StmtSeq.set(UINT64_MAX); continue; } size_t OrigRowIndex = OrigRowIter->second; @@ -2331,7 +2325,7 @@ void DWARFLinker::DIECloner::generateLineTableForUnit(CompileUnit &Unit) { if (NewRowIter == OrigRowToNewRow.end()) { // If the original row index is not found in the map, update the // stmt_sequence attribute to the 'invalid offset' magic value. - StmtSeq.set(NewOffsetMissing); + StmtSeq.set(UINT64_MAX); continue; } From 92e2d4e9e1ad7a8d66d481b4df3f971450f829f5 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 18 Jul 2025 17:38:11 +0100 Subject: [PATCH 360/813] [DAG] visitFREEZE - remove unused HadMaybePoisonOperands check. NFC. (#149517) Redundant since #145939 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b7e41427f2c27..fed5e7238433e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16772,12 +16772,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { if (DAG.isGuaranteedNotToBeUndefOrPoison(Op, /*PoisonOnly*/ false, /*Depth*/ 1)) continue; - bool HadMaybePoisonOperands = !MaybePoisonOperands.empty(); - bool IsNewMaybePoisonOperand = MaybePoisonOperands.insert(Op).second; - if (IsNewMaybePoisonOperand) + if (MaybePoisonOperands.insert(Op).second) MaybePoisonOperandNumbers.push_back(OpNo); - if (!HadMaybePoisonOperands) - continue; } // NOTE: the whole op may be not guaranteed to not be undef or poison because // it could create undef or poison due to it's poison-generating flags. 
From 7d040d4675baf6881cf50c4dba78cc18af85f9ef Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Fri, 18 Jul 2025 09:42:40 -0700 Subject: [PATCH 361/813] [mlir][linalg] Handle outer_dims_perm in linalg.pack consumer fusion. (#149426) Signed-off-by: hanhanW --- .../Linalg/Transforms/TilingInterfaceImpl.cpp | 9 +++- .../tile-and-fuse-consumer.mlir | 45 +++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp index 5a10883a6043c..b059bcc025315 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp @@ -893,6 +893,13 @@ struct PackOpTiling SmallVector outerDimOffsets, outerDimSizes; DenseMap dimAndTileMapping = packOp.getDimAndTileMapping(); + SmallVector outerShapeWithoutTranspose( + packOp.getDestType().getShape().take_front(packOp.getSourceRank())); + if (!packOp.getOuterDimsPerm().empty()) { + applyPermutationToVector( + outerShapeWithoutTranspose, + invertPermutationVector(packOp.getOuterDimsPerm())); + } for (auto dim : llvm::seq(packOp.getSourceRank())) { if (dimAndTileMapping.count(dim)) { FailureOr cstTileSize = @@ -908,7 +915,7 @@ struct PackOpTiling // TODO: It could be untiled if the `srcDimSize` is dynamic. It is a // hard check to determine if a dimension is tiled or not. 
int64_t srcDimSize = packOp.getSourceType().getDimSize(dim); - int64_t destDimSize = packOp.getDestType().getDimSize(dim); + int64_t destDimSize = outerShapeWithoutTranspose[dim]; bool isTiled = failed(cstTileSize) || ShapedType::isDynamic(srcDimSize) || cstTileSize.value() != srcDimSize; diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index 7b0a8494a8acb..20164d5dfd91a 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -451,6 +451,51 @@ module attributes {transform.with_named_sequence} { // ----- + +func.func @fuse_perfect_tiling_pack_consumer_with_outer_dims_perm(%arg0: tensor<64x32xf32>, %arg1: tensor<64x32xf32>, %arg2: tensor<2x64x16x1xf32>) -> tensor<2x64x16x1xf32> { + %0 = scf.forall (%arg3) = (0) to (32) step (16) shared_outs(%arg4 = %arg1) -> (tensor<64x32xf32>) { + %src = tensor.extract_slice %arg0[0, %arg3] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %dest = tensor.extract_slice %arg4[0, %arg3] [64, 16] [1, 1] : tensor<64x32xf32> to tensor<64x16xf32> + %1 = linalg.exp ins(%src : tensor<64x16xf32>) outs(%dest : tensor<64x16xf32>) -> tensor<64x16xf32> + scf.forall.in_parallel { + tensor.parallel_insert_slice %1 into %arg4[0, %arg3] [64, 16] [1, 1] : tensor<64x16xf32> into tensor<64x32xf32> + } + } + %pack = linalg.pack %0 outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] into %arg2 : tensor<64x32xf32> -> tensor<2x64x16x1xf32> + return %pack : tensor<2x64x16x1xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> 
!transform.any_op + %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: #[[PACK_RESULT_MAP:.*]] = affine_map<(d0) -> (d0 floordiv 16)> +// CHECK: func.func @fuse_perfect_tiling_pack_consumer_with_outer_dims_perm( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]] +// CHECK: %{{.*}}:2 = scf.forall (%[[IV:.*]]) = (0) to (32) step (16) +// CHECK-SAME: shared_outs(%[[FIRST_OUT_ARG:.*]] = %[[ARG1]], %[[PACK_OUT_ARG:.*]] = %[[ARG2]]) +// CHECK: %[[ELEM_SRC:.*]] = tensor.extract_slice %[[ARG0]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM_DEST:.*]] = tensor.extract_slice %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: %[[ELEM:.*]] = linalg.exp +// CHECK-SAME: ins(%[[ELEM_SRC]] +// CHECK-SAME: outs(%[[ELEM_DEST]] +// CHECK-DAG: %[[PACK_RESULT_OFFSET:.*]] = affine.apply #[[PACK_RESULT_MAP]](%[[IV]]) +// CHECK-DAG: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], 0, 0, 0] [1, 64, 16, 1] [1, 1, 1, 1] +// CHECK: %[[PACK:.*]] = linalg.pack %[[ELEM]] +// CHECK-SAME: outer_dims_perm = [1, 0] inner_dims_pos = [1, 0] inner_tiles = [16, 1] +// CHECK-SAME: into %[[TILED_PACK_DEST]] +// CHECK: scf.forall.in_parallel { +// CHECK: tensor.parallel_insert_slice %[[ELEM]] into %[[FIRST_OUT_ARG]][0, %[[IV]]] [64, 16] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[PACK]] into %[[PACK_OUT_ARG]][%[[PACK_RESULT_OFFSET]], 0, 0, 0] [1, 64, 16, 1] [1, 1, 1, 1] + +// ----- + // It is valid to fuse the pack op in perfect tiling scenario when the dimension // is dynamic and padding is not needed. 
From 0c75e093813c86a1c99b75d2a46f56db7ab516dd Mon Sep 17 00:00:00 2001 From: Shaoce SUN Date: Sat, 19 Jul 2025 00:44:16 +0800 Subject: [PATCH 362/813] [TableGen] Add `getName()` to error messages for better debugging (#149531) Including the name helps quickly locate the corresponding Instruction that caused the issue. --- llvm/utils/TableGen/Common/CodeGenSchedule.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp index 50346c29e8862..b07ea9e9d5caf 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.cpp +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.cpp @@ -2114,7 +2114,8 @@ void CodeGenSchedModels::addWriteRes(const Record *ProcWriteResDef, const Record *WRDef = ProcWriteResDef->getValueAsDef("WriteType"); if (!WRMap.try_emplace(WRDef, ProcWriteResDef).second) PrintFatalError(ProcWriteResDef->getLoc(), - "WriteType already used in another WriteRes"); + "WriteType of " + WRDef->getName() + + " already used in another WriteRes"); } // Visit ProcResourceKinds referenced by the newly discovered WriteRes. @@ -2148,7 +2149,8 @@ void CodeGenSchedModels::addReadAdvance(const Record *ProcReadAdvanceDef, const Record *RADef = ProcReadAdvanceDef->getValueAsDef("ReadType"); if (!RAMap.try_emplace(RADef, ProcReadAdvanceDef).second) PrintFatalError(ProcReadAdvanceDef->getLoc(), - "ReadType already used in another ReadAdvance"); + "ReadType of " + RADef->getName() + + " already used in another ReadAdvance"); } } From 01213141357e4a79d2d97187ff0cb89d8d173634 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 18 Jul 2025 18:48:07 +0200 Subject: [PATCH 363/813] [MemoryTaggingSupport] Remove unnecessary bitcast (NFC) As the comment indicates, this is no longer necessary with opaque pointers. 
--- llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 6226596017980..40dc02c546dfa 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -232,13 +232,7 @@ void alignAndPadAlloca(memtag::AllocaInfo &Info, llvm::Align Alignment) { NewAI->setSwiftError(Info.AI->isSwiftError()); NewAI->copyMetadata(*Info.AI); - Value *NewPtr = NewAI; - - // TODO: Remove when typed pointers dropped - if (Info.AI->getType() != NewAI->getType()) - NewPtr = new BitCastInst(NewAI, Info.AI->getType(), "", Info.AI->getIterator()); - - Info.AI->replaceAllUsesWith(NewPtr); + Info.AI->replaceAllUsesWith(NewAI); Info.AI->eraseFromParent(); Info.AI = NewAI; } From 73e4b589ba9526c72f495ca6898ed18d730d2db4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 18 Jul 2025 09:51:21 -0700 Subject: [PATCH 364/813] MC: Simplify fragment reuse determination First, avoid checking MCSubtargetInfo by reducing unnecessary overhead introduced in https://reviews.llvm.org/D44928 . That change passed STI to both FT_Data and FT_Relaxable fragments, but STI is only necessary for FT_Relaxable. The use of STI in FT_Data was added for: * Bundle alignment mode, which has been removed (#148781). * ARM, which inappropriately uses STI in `ARMAsmBackend::applyFixup` due to tech debt, unlike other targets. All tests passed even without the `copySTI` change. To ensure safety, `copySTI` now starts a new fragment to prevent mixed STI values. Second, avoid checking LinkerRelaxable by eagerly starting a new fragment when a FT_Data/FT_Align fragment is marked linker-relaxable. There is currently an extra empty FT_Data if an alignment immediately follows a linker-relaxable fragment, which will be improved in the future when FT_Align information is moved to the variable-tail. 
Pull Request: https://github.com/llvm/llvm-project/pull/149471 --- llvm/include/llvm/MC/MCObjectStreamer.h | 13 +------ llvm/include/llvm/MC/MCSection.h | 1 + llvm/include/llvm/MC/MCStreamer.h | 4 +- llvm/lib/MC/MCObjectStreamer.cpp | 43 ++++++++++------------ llvm/lib/MC/MCParser/MCTargetAsmParser.cpp | 5 +++ llvm/lib/MC/MCStreamer.cpp | 13 +++++++ llvm/test/MC/RISCV/Relocations/mc-dump.s | 1 + 7 files changed, 43 insertions(+), 37 deletions(-) diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index a55fd4a14675f..319e131999d48 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -73,20 +73,9 @@ class MCObjectStreamer : public MCStreamer { MCSymbol *emitCFILabel() override; void emitCFISections(bool EH, bool Debug) override; - void insert(MCFragment *F) { - auto *Sec = CurFrag->getParent(); - F->setParent(Sec); - F->setLayoutOrder(CurFrag->getLayoutOrder() + 1); - CurFrag->Next = F; - CurFrag = F; - Sec->curFragList()->Tail = F; - } - /// Get a data fragment to write into, creating a new one if the current /// fragment is not FT_Data. - /// Optionally a \p STI can be passed in so that a new fragment is created - /// if the Subtarget differs from the current fragment. - MCFragment *getOrCreateDataFragment(const MCSubtargetInfo *STI = nullptr); + MCFragment *getOrCreateDataFragment(); protected: bool changeSectionImpl(MCSection *Section, uint32_t Subsection); diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h index 296fdd8af0d14..313071ec75033 100644 --- a/llvm/include/llvm/MC/MCSection.h +++ b/llvm/include/llvm/MC/MCSection.h @@ -188,6 +188,7 @@ class LLVM_ABI MCSection { // destructors. 
class MCFragment { friend class MCAssembler; + friend class MCStreamer; friend class MCObjectStreamer; friend class MCSection; diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index b3a9aabd6ece5..4b91dbc794682 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -429,7 +429,6 @@ class LLVM_ABI MCStreamer { CurFrag->getParent() == getCurrentSection().first); return CurFrag; } - /// Save the current and previous section on the section stack. void pushSection() { SectionStack.push_back( @@ -457,6 +456,9 @@ class LLVM_ABI MCStreamer { MCSymbol *endSection(MCSection *Section); + void insert(MCFragment *F); + void newFragment(); + /// Returns the mnemonic for \p MI, if the streamer has access to a /// instruction printer and returns an empty string otherwise. virtual StringRef getMnemonic(const MCInst &MI) const { return ""; } diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 67433f2b265e5..d5b8f22463894 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -106,26 +106,12 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) { MCDwarfFrameEmitter::Emit(*this, MAB, false); } -static bool canReuseDataFragment(const MCFragment &F, - const MCAssembler &Assembler, - const MCSubtargetInfo *STI) { - if (!F.hasInstructions()) - return true; - // Do not add data after a linker-relaxable instruction. The difference - // between a new label and a label at or before the linker-relaxable - // instruction cannot be resolved at assemble-time. - if (F.isLinkerRelaxable()) - return false; - // If the subtarget is changed mid fragment we start a new fragment to record - // the new STI. 
- return !STI || F.getSubtargetInfo() == STI; -} - -MCFragment * -MCObjectStreamer::getOrCreateDataFragment(const MCSubtargetInfo *STI) { +MCFragment *MCObjectStreamer::getOrCreateDataFragment() { + // TODO: Start a new fragment whenever finalizing the variable-size tail of a + // previous one, so that all getOrCreateDataFragment calls can be replaced + // with getCurrentFragment auto *F = getCurrentFragment(); - if (F->getKind() != MCFragment::FT_Data || - !canReuseDataFragment(*F, *Assembler, STI)) { + if (F->getKind() != MCFragment::FT_Data) { F = getContext().allocFragment(); insert(F); } @@ -363,16 +349,23 @@ void MCObjectStreamer::emitInstToData(const MCInst &Inst, F->doneAppending(); if (!Fixups.empty()) F->appendFixups(Fixups); + F->setHasInstructions(STI); + bool MarkedLinkerRelaxable = false; for (auto &Fixup : MutableArrayRef(F->getFixups()).slice(FixupStartIndex)) { Fixup.setOffset(Fixup.getOffset() + CodeOffset); - if (Fixup.isLinkerRelaxable()) { - F->setLinkerRelaxable(); + if (!Fixup.isLinkerRelaxable()) + continue; + F->setLinkerRelaxable(); + // Do not add data after a linker-relaxable instruction. The difference + // between a new label and a label at or before the linker-relaxable + // instruction cannot be resolved at assemble-time. + if (!MarkedLinkerRelaxable) { + MarkedLinkerRelaxable = true; getCurrentSectionOnly()->setLinkerRelaxable(); + newFragment(); } } - - F->setHasInstructions(STI); } void MCObjectStreamer::emitInstToFragment(const MCInst &Inst, @@ -568,8 +561,10 @@ void MCObjectStreamer::emitCodeAlignment(Align Alignment, // if the alignment is larger than the minimum NOP size. 
unsigned Size; if (getAssembler().getBackend().shouldInsertExtraNopBytesForCodeAlign(*F, - Size)) + Size)) { getCurrentSectionOnly()->setLinkerRelaxable(); + newFragment(); + } } void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, diff --git a/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp b/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp index 665d92eb9a21c..7f0934971b27c 100644 --- a/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp +++ b/llvm/lib/MC/MCParser/MCTargetAsmParser.cpp @@ -9,6 +9,7 @@ #include "llvm/MC/MCParser/MCTargetAsmParser.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCRegister.h" +#include "llvm/MC/MCStreamer.h" using namespace llvm; @@ -22,6 +23,10 @@ MCTargetAsmParser::~MCTargetAsmParser() = default; MCSubtargetInfo &MCTargetAsmParser::copySTI() { MCSubtargetInfo &STICopy = getContext().getSubtargetCopy(getSTI()); STI = &STICopy; + // The returned STI will likely be modified. Create a new fragment to prevent + // mixing STI values within a fragment. + if (getStreamer().getCurrentFragment()) + getStreamer().newFragment(); return STICopy; } diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index d814ab8880500..c3ecf8fc717f5 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1404,6 +1404,19 @@ MCSymbol *MCStreamer::endSection(MCSection *Section) { return Sym; } +void MCStreamer::insert(MCFragment *F) { + auto *Sec = CurFrag->getParent(); + F->setParent(Sec); + F->setLayoutOrder(CurFrag->getLayoutOrder() + 1); + CurFrag->Next = F; + CurFrag = F; + Sec->curFragList()->Tail = F; +} + +void MCStreamer::newFragment() { + insert(getContext().allocFragment()); +} + static VersionTuple targetVersionOrMinimumSupportedOSVersion(const Triple &Target, VersionTuple TargetVersion) { diff --git a/llvm/test/MC/RISCV/Relocations/mc-dump.s b/llvm/test/MC/RISCV/Relocations/mc-dump.s index 24f3e67ebbdda..f72258498169f 100644 --- a/llvm/test/MC/RISCV/Relocations/mc-dump.s +++ b/llvm/test/MC/RISCV/Relocations/mc-dump.s 
@@ -9,6 +9,7 @@ # CHECK-NEXT:0 Data LinkerRelaxable Size:8 [97,00,00,00,e7,80,00,00] # CHECK-NEXT: Fixup @0 Value:specifier(19,ext) Kind:4023 # CHECK-NEXT: Symbol @0 $x +# CHECK-NEXT:8 Data Size:0 [] # CHECK-NEXT:8 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops # CHECK-NEXT:12 Data Size:4 [13,05,30,00] # CHECK-NEXT:16 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops From 5138b61a25f11eb8675d0031712c1ee6b4cb8be4 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 18 Jul 2025 12:55:11 -0400 Subject: [PATCH 365/813] [AMDGPU][True16][Codegen] remove packed build_vector pattern from true16 (#148715) Some of the packed build_vector use vgpr_32 for i16/f16/bf16. In gfx11, bf16 arithmetic get promoted to f32 and this is done via v2i16 pack. In true16 mode this v2i16 pack is selected to a build_vector/v_lshlrev pattern which only accepts VGPR32. This causes isel to insert an illegal copy "vgpr32 = copy vgpr16" between def and use. In the end this illegal copy confuses cse pass and trigger wrong code elimination. Remove the packed build_vector pattern from true16. After removal, ISel will use vgpr16 build_vector patterns instead. 
--- llvm/lib/Target/AMDGPU/SIInstructions.td | 18 +- llvm/test/CodeGen/AMDGPU/add.v2i16.ll | 14 +- .../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 11906 ++++++++-------- .../CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll | 1953 ++- .../CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll | 6 +- .../CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll | 1308 +- .../CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll | 2946 ++-- .../CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll | 216 +- .../CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll | 87 +- .../CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll | 5478 +++---- .../CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll | 1125 +- .../CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll | 1075 +- .../AMDGPU/atomicrmw-bf16-gfx11plus.ll | 122 + llvm/test/CodeGen/AMDGPU/bf16.ll | 841 +- .../buffer-fat-pointer-atomicrmw-fadd.ll | 302 +- .../buffer-fat-pointer-atomicrmw-fmax.ll | 302 +- .../buffer-fat-pointer-atomicrmw-fmin.ll | 302 +- .../CodeGen/AMDGPU/calling-conventions.ll | 161 +- llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 28 +- .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 486 +- .../AMDGPU/divergence-driven-buildvector.ll | 57 +- llvm/test/CodeGen/AMDGPU/fabs.bf16.ll | 123 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 694 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 694 +- .../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 694 +- .../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 694 +- llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll | 532 +- llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll | 56 +- llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll | 532 +- llvm/test/CodeGen/AMDGPU/function-args.ll | 257 +- .../AMDGPU/gfx-callable-argument-types.ll | 231 +- .../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 694 +- .../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 694 +- .../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 694 +- .../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 694 +- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll | 12 +- llvm/test/CodeGen/AMDGPU/idot4u.ll | 36 +- .../isel-amdgpu-cs-chain-preserve-cc.ll | 124 +- 
llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll | 219 +- llvm/test/CodeGen/AMDGPU/llvm.log.ll | 115 +- llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 115 +- llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 75 +- llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 47 +- llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll | 7085 +++++---- llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll | 7085 +++++---- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll | 14 +- .../test/CodeGen/AMDGPU/vector-reduce-umax.ll | 4 +- 47 files changed, 25634 insertions(+), 25313 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 2a6fcadd4c49c..991d9f83e92e4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3427,30 +3427,32 @@ def : GCNPat < (S_LSHL_B32 SReg_32:$src1, (i16 16)) >; +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : GCNPat < (v2i16 (DivergentBinFrag (i16 0), (i16 VGPR_32:$src1))), (v2i16 (V_LSHLREV_B32_e64 (i16 16), VGPR_32:$src1)) >; - def : GCNPat < - (v2i16 (UniformBinFrag (i16 SReg_32:$src1), (i16 0))), - (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) + (v2i16 (DivergentBinFrag (i16 VGPR_32:$src1), (i16 0))), + (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) >; def : GCNPat < - (v2i16 (DivergentBinFrag (i16 VGPR_32:$src1), (i16 0))), - (v2i16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) + (v2f16 (DivergentBinFrag (f16 VGPR_32:$src1), (f16 FP_ZERO))), + (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), VGPR_32:$src1)) >; +} def : GCNPat < - (v2f16 (UniformBinFrag (f16 SReg_32:$src1), (f16 FP_ZERO))), + (v2i16 (UniformBinFrag (i16 SReg_32:$src1), (i16 0))), (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; def : GCNPat < - (v2f16 (DivergentBinFrag (f16 VGPR_32:$src1), (f16 FP_ZERO))), - (v2f16 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 
0xffff))), VGPR_32:$src1)) + (v2f16 (UniformBinFrag (f16 SReg_32:$src1), (f16 FP_ZERO))), + (S_AND_B32 (S_MOV_B32 (i32 0xffff)), SReg_32:$src1) >; foreach vecTy = [v2i16, v2f16, v2bf16] in { diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index 50d20e9b0e4d7..6cb236dbee76e 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -780,7 +780,8 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -789,11 +790,12 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_add_u16 v0, v1, v0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, 0, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index cb2f0f28a29d6..0d5f538215f18 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -6309,64 +6309,64 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -6394,50 +6394,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; 
GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -6498,50 +6498,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB12_4: ; %end @@ -6549,319 +6549,314 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x 
i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v67 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v53, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v53.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 
v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; 
GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16 -; 
GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 
0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v65 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v53, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, 
v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v32i32_to_v128i8: @@ -15418,63 +15413,63 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v32, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v70, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -15488,146 +15483,144 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, 
v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -15641,720 +15634,746 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, 
v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 
0xff, v118.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 
0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, 
v97.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v21, v24, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v33.l, 
v29.h, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v0.l, v151.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l ; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v11.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, 
v112.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 ; 
GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 
0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_and_b16 
v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 
0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v30.l, v50.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -42137,64 +42156,64 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -42222,50 +42241,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -42309,50 +42328,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB36_4: ; %end @@ -42360,319 +42379,314 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v67 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, 
v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v53, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v12.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v18, v49, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; 
GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, 
v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v14.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v65 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v53, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v32f32_to_v128i8: @@ -52196,63 +52210,63 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 
offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -52266,146 +52280,144 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v83, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v114, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 
v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -52419,720 +52431,746 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, 
v149.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 
v3.l, 0xff, v134.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, 
v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, 
v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; 
GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, 
v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 
16, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v4.l, v134.h, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l ; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; 
GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 
v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 
16, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l 
-; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, 
v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 
v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 
0xffff, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 
v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -77900,64 +77938,64 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -77985,50 +78023,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -78097,50 +78135,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB56_4: ; %end @@ -78148,319 +78186,314 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v67 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 
0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 
v39, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v53, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v9, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 
v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v50.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, 
v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, 
v11.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 -; GFX11-TRUE16-NEXT: 
v_and_b16 v18.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v65 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v53, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, 
v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: 
scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v16i64_to_v128i8: @@ -87027,63 +87060,63 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; 
GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 
offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -87097,146 +87130,144 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v65, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, 
v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -87250,720 +87281,746 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v150.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; 
GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h 
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, 
v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; 
GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; 
GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr150_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, 
v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v8.l, v115.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v11, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l ; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v12.l, v98.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v31.h, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v16.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, 
v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, 
v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 
0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 +; 
GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -111743,64 +111800,64 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -111828,50 +111885,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -111915,50 +111972,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v70, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB72_4: ; %end @@ -111966,319 +112023,314 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v67 -; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v66 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v64 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v53, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v9 
-; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v39 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v55, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 
v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v21, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v65 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v53, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 
v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v32, v33, v39 ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v16f64_to_v128i8: @@ -121787,63 +121839,63 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v34, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:336 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:312 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:248 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 ; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, 
off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 
@@ -121857,146 +121909,144 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v114, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v18.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v135.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.l, v6.l +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v150.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v64.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.l, 8, v69.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(61) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v68.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(59) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v82.l 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v96.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.l, 8, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.l, 
8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.l, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.l, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v69.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v69.h, 8, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, 
v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.h, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -122010,720 +122060,746 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v148.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v150.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v0.l, v0.l, v150.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v151.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v145.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v146.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v132.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v135.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v146.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v132.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v147.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v119.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v133.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v130.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v133.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v118.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v134.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v114.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v128.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v129.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v130.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; 
GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v116.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v129.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v116.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v8.h, v117.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, 
v134.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v112.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v113.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v85.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v99.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v99.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v100.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v101.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v12.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v96.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v18.h, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v17.h, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v82.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v21.l, v18.h, v82.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v83.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v69.l +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 
v18.l, v16.h, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v70.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v55.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v64.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v22.h, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v23.h, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 
v31.h, v17.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v53.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v27.l, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v27.h, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v50.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v28.h, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v51.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v148.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v145.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l 
-; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v150.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v150.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v151.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v145.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v151.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v144.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v135.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v135.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v146.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v147.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v132.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v131.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v119.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v132.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v133.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v130.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v133.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v134.h, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v134.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v128.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v128.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v11, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v114.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v114.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v113.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v129.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v130.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v116.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v116.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v101.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v102.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v9.l, 0x300, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v117.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l ; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v103.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v117.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v103.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v97.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v96.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v85.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v85.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v112.h, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v113.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v99.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v84.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v99.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v84.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v83.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v17, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 
v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v100.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v80.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v100.h, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l ; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 
0xff, v16.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v86.l, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v19, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v70.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v86.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v21, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v87.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v96.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v67.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v18.l, 0x300, v16.h ; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v87.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v81.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v66.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v66.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v52.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v81.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v19.l, v67.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h ; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v22, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v82.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v82.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.l, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v68.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v69.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v24, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v68.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v69.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v21, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v70.l, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v37.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v55.h, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v64.h, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v20, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v64.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v65.l, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 
v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, 0x300, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v29, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v65.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 
+; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v53.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v54.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v55.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v39, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v53.h, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v54.h, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v26, v39 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v49.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v50.l, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v50.h, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v51.l, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v51.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 
v25, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v33, 16, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v37, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v34 +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 +; 
GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -160055,116 +160131,116 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr91_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr181_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr62_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr97_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 @@ -160187,341 +160263,338 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 
v[52:53], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 8, v3 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v106, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v162.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v4.h ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v46.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v74.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v45.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v42.h, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v6.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v10.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v60.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v94.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v76.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v95.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.h, v16.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.h, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, 
v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v24.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v26.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v26.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v28.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v30.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v31.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.h ; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v33 :: v_dual_lshlrev_b32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v33, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v37, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v70, v37, v39 :: v_dual_add_f32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v17, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v55.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v70, v36, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_add_f32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v37, v50, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v70.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v36, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_lshlrev_b32 v17, 16, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v34, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v37, v51, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v17, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v17 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v48, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add3_u32 v37, v50, v17, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v71, v37, v51 :: v_dual_lshlrev_b32 v20, 16, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v71.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v80.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v70 +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v55 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 ; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 ; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v34, v17 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v73, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v19, 0x40c00000, v19 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v19, 16, v19 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v83, v33, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v81.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v83.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v84.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v20, v38, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v33, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v82, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v71 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v82.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, 
v20 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v35, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v38 :: v_dual_cndmask_b32 v84, v19, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v20, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v80 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v19, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v83.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v82 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v21, 16, 1 ; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v84 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v34, v37, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v23 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v41, 8, v22 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v85.h -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, 
v86.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_lshlrev_b32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v33, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v86.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v23 :: v_dual_lshlrev_b32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v33, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v23, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v44, 8, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v96, v34, v36, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_f32 v34, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v97.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: 
v_bfe_u32 v24, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v96.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v87 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v36, v23 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v26, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v85 +; 
GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v26, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v96, v35, v38 :: v_dual_add_f32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v98.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v25, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v28, 0x40c00000, v28 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v34.l, v97.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v36, v26, v38 :: v_dual_add_f32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v101.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v35, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v26, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v26, v33, v28, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v101.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v26, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v98 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v25, v39, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v96 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v28.l, v100.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v25, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v37, v36 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v99.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 24, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v27, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v100 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v28 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v25 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, 
v35, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v99 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v29 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 @@ -160529,21 +160602,22 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v112.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25 ; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1 ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v102, v33, v39 :: v_dual_add_f32 v37, 0x40c00000, v37 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v36, 0x400000, v29 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v29, 0x7fff ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v103.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1 @@ -160556,45 +160630,44 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v113.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v30, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 ; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v102 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v31 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v32 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v31, 0x40c00000, v31 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v32, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 24, v30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v119, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v29 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v31, 16, v31 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v115.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v31, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v31 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 ; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v114.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v116, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v116, v33, v37 :: v_dual_and_b32 v35, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v36, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v116.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v32, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v1 @@ -160607,10 +160680,10 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v32, v33, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v32, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v115 +; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v114 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v132, v31, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v37, v36 @@ -160622,9 +160695,9 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v131.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v133.h ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v144, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v146, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -160640,252 +160713,240 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v147, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v144.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v149, v33, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v144, v33, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v162, v34, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h +; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v35.l, v146.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v34, v36, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v162.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v164.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v36, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v147.h -; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v38 :: v_dual_lshlrev_b32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 8, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v149 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v144 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 
0x40c00000, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v165.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v35, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, 
v164.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v180, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v180, v33, v37 :: v_dual_add_f32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v180.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v35, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v33, v8, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v6, v33, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v6, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v39 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 
0xffff, v34, v165 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v5, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v161 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v179.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v178, v5, v38 :: v_dual_add_f32 v33, 0x40c00000, v39 ; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v37, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v10 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v178.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 24, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v46, v35, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v179 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v178 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v47, v35, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8 ; 
GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v36, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v45, v7, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v47.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v44, v7, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v46.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: 
v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v42, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v41, v35, v38 :: v_dual_lshlrev_b32 v10, 16, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v45.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v44.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v37, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v42 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v37 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v41 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 24, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v59, v38, v50, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v61, v38, v50 :: v_dual_add_f32 v12, 0x40c00000, v12 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; 
GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v14 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v59.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v74, v35, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v61.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v73, v35, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff ; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v60, v48, v52 :: v_dual_add_f32 v37, 0x40c00000, v51 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v57, v48, v52, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v14, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v60 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v57 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v74.h ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v35, v37, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v73.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v7, 16, 1 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v181, 24, v12 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v39 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v89, v37, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v76, v37, v38 :: v_dual_and_b32 v37, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v39 :: v_dual_lshlrev_b32 v16, 16, v16 ; GFX11-TRUE16-NEXT: v_add3_u32 v39, v49, v14, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v77, v37, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v77, v39, v48, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_add3_u32 v14, v49, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v77.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v78, v39, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v78.h ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v35 :: v_dual_add_f32 v14, 0x40c00000, v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v15, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v13, 
0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v16, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v94, v35, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v35, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v93, v13, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v48, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v15, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v91, v13, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_add3_u32 v13, v50, v15, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v94.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v95, v39, v51, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v104.h +; GFX11-TRUE16-NEXT: v_add3_u32 v13, v50, v15, 0x7fff ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v89.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v76 +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v77 ; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v92, v35, v48, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v104.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v91.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v95.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v93.h ; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v39, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14 ; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v92 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] @@ -160905,332 +160966,327 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7 ; GFX11-TRUE16-NEXT: .LBB90_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v144.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v108.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v106.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v107.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v149.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v107.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v106.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v78.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v105.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v162.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v164.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v105.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v94.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v91.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v148.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v147.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v95.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v93.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v180.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v90.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v164.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v180.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v90.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v144.h ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v88.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v165.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v10 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v3, v8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v88.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v47.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v76.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v58.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v75.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v161.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v179.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v72.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v6.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v46.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v75.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v178.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 
8, v63.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v179.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v62.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v74.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v57.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v73.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v178.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v59.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v56.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v8.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v44.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v45.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v56.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v20.l, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v43.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v89.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v41.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v42.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v89.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v40.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v61.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v183.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v16, v14 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v42.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v43.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v60.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v181.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v11.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v104.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v176.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v166.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v14 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v167.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v57.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v78.h ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v94.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v167.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v59.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v182.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v77.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v163.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v20, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v76.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v104.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v91.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v92.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v73.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v79.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v72.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v61.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v12.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 
0xffff, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v77.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v95.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v18, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v93.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v149.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, v18, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v79.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v92.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v18, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v74.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v46.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, 
v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v63.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v62.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v60.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v47.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v81.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v58.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v45.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v19, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v40.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v19, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v44.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v18, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v177.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v182.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v41.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v183.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v11, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v176.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v181.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v19.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v177.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v22, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v163.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v162.l ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v166.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v20, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v148.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v28, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v116.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v26, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v28 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v145.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v25, v14 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v135.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v25, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v28, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v128.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v29.l, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.l, 8, v119.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v13.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v114.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v13.h, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v32, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v34 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, 
v13.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v114.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v14 ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[64:67], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[19:22], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[27:30], off offset:112 ; GFX11-TRUE16-NEXT: s_clause 0x1f ; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:16 @@ -185249,64 +185305,64 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -185331,52 +185387,52 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] @@ -185436,371 +185492,366 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v130, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v100, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-TRUE16-NEXT: .LBB94_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v2.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v68 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v68 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v69 -; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v67 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v68, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v66.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v51, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v67 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v68 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v5.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v54 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v10.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v51, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, 
v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v49, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v49, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v48, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v128.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v114.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: 
v_and_b16 v5.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v65, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 
0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v84.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v21.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v64, 16, v67 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v38, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 
8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v51 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v55.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v51 ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off 
offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v64f16_to_v128i8: @@ -208007,64 +208058,64 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -208089,52 +208140,52 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v114, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v81, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] @@ -208194,371 +208245,366 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] ; 
GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v160, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v132, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v1.h, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v1.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v2.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v160.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v68 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v68 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v69 -; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v2.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v151.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v67 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v2.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v68, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v66.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v4.l, v3.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v51, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v67 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v68 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v5.l, v33.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v51, v54 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v33.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v51, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v54, 16, v66 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v131.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v9.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v65 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v66 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v65 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v51, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v116.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v54, v51 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v64 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v54 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v49, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v49, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v48.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v52 +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v48, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v19.l, v19.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v128.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v113.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v39 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l ; 
GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v51.l, v12.h, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v65, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v12, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v6.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v16, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_or_b16 
v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v15, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v17, v18 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v20, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v22, v23 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, 
v35.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v26, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v80.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v67 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v64 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 
v38, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v35, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v25, v26 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v27, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v29, v30 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v32 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v33, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v51 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v86.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v27.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v51 +; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, 
v32.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v51 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v31.l, v31.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v55.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v32.l, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v51 ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[17:20], off offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[21:24], off offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[25:28], off offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[29:32], off offset:112 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v64i16_to_v128i8: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 34d7ed9290b67..3e96ab1d597d6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ 
b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -2675,79 +2675,76 @@ define <4 x i32> @bitcast_v8bf16_to_v4i32(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, 
v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0 ; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -4122,18 +4119,18 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -4147,107 +4144,103 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: 
v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v6, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, 
v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7140,79 +7133,76 @@ define <4 x float> @bitcast_v8bf16_to_v4f32(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 
| instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v4.l, v4.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0 ; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -8603,18 +8593,18 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 
8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -8628,107 +8618,103 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3 
-; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11253,79 +11239,76 @@ define <2 x i64> @bitcast_v8bf16_to_v2i64(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0 ; GFX11-TRUE16-NEXT: .LBB66_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -12700,18 +12683,18 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -12725,107 +12708,103 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 
v1.l, v1.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 ; 
GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 
0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14952,79 +14931,76 @@ define <2 x double> @bitcast_v8bf16_to_v2f64(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v8, v10 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v5 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; 
GFX11-TRUE16-NEXT: v_bfe_u32 v13, v11, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v9 :: v_dual_and_b32 v5, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v13, v11, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v8, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v6, 0x7fff -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v13, v0, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v7, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v3, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v6, v2 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v7 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v5, v0 ; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -16407,18 +16383,18 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -16432,107 +16408,103 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: 
v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, 
v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18254,83 +18226,83 @@ define <8 x i16> @bitcast_v8bf16_to_v8i16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: 
v_dual_and_b32 v1, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v3, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 
0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v12, v12, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v12, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v4, 16, 1 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v9, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v12, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v13, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v13, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v11, v12 :: v_dual_and_b32 v3, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v14, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v9, v15, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, 
vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v7 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.h ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v3, v0, 16, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v1, 16, v2 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v6, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v7, 16, v5 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v4, 16, v6 ; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -19840,18 +19812,18 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -19865,107 +19837,103 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 
0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 
v10, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -21172,79 +21140,79 @@ define <8 x half> @bitcast_v8bf16_to_v8f16(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 
1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v2.l +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v8 :: v_dual_and_b32 v5, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v11, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_cndmask_b32 v1, v7, v9 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v1, v7 :: v_dual_and_b32 v7, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v11, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v11, 
0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v7 :: v_dual_add_f32 v7, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v12, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_cndmask_b32 v2, v2, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 
0x400000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v14, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v10, v14, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v13, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v7, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v2.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v5, v3 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v8, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v9, v4 ; GFX11-TRUE16-NEXT: .LBB102_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -22758,18 +22726,18 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -22783,107 +22751,103 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h ; GFX11-TRUE16-NEXT: 
v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23876,87 +23840,92 @@ define <16 x i8> @bitcast_v8bf16_to_v16i8(<8 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_and_b32 v3, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_lshlrev_b32 v1, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v14, v4, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 
v6, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_and_b32 v3, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v14, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v9, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v2, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v13, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v11 -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v13, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v13, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v12, 16, 1 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v13, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v8, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v12, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v9, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v8, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v3, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v14, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v10, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v0, v9, v13, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v8, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v13 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, v0, v8 :: v_dual_and_b32 v5, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 ; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v12.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v7, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v5, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v8, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v11, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v10, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v15, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v14, v10, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v2, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v14, v9, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v1, v14 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v13, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 24, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v7, v2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v9, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v9, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[2:3] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h @@ -24976,18 +24945,18 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.h 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -25001,107 +24970,103 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l ; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v10 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v1, v6, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v7.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v8.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v5.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 
0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v8, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll index 2c78e34823742..5344095e99217 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll @@ -659,7 +659,8 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) { ; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 @@ -1132,7 +1133,8 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index a19567bbe24f6..f8ffaa456c2b3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -6296,33 +6296,32 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -6334,188 +6333,194 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v15.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 
0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v18.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v15.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 
v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v16.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; 
GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 
| instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v8.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v8.h, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v9.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v9.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13330,33 +13335,32 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -13368,188 +13372,194 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v15.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, 
v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v4.h, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v18.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v15.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h 
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v16.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v8.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v8.h, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v9.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v9.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19882,33 +19892,32 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -19920,188 +19929,194 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v2.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v15.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, 
v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v3, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v18.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v15.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v16.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v2.h, v12.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v8.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v8.h, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v9.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v9.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -25924,33 +25939,32 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -25962,188 +25976,194 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 
v0.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v15.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, 
v7 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.h, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v4.h, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v6, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; 
GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v14.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v18.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v15.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v15.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v13.l, 3 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v4.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v16.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v11.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v12.l, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 
v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v6.l, 0x300, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v8.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v8.h, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v9.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v9.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v5, v10, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index e773b546afe1b..0cefbc1c2dee5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -2966,20 +2966,20 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -2995,17 +2995,17 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow @@ -3029,17 +3029,17 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB12_4: ; %end @@ -3047,100 +3047,105 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v1.l, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v28, 16, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8: @@ -5033,50 +5038,48 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 
v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v15.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v33.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3 @@ -5091,228 +5094,243 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v24.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v0.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v1.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v2.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v3.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v4.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v5.l, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) 
| instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v13.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v6.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v7.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v8.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v9.l, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v25 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v23.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v19.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v15.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v20.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v15.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v16.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v21.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v15, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v18.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v19.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v17.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v19.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.h, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v14.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v15.l, v4.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v13.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v12.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v27 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v11.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v11.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v11.l, v7.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v27 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v27 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9933,20 +9951,20 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -9962,17 +9980,17 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB32_2: ; %Flow @@ -9992,17 +10010,17 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB32_4: ; %end @@ -10010,100 +10028,105 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8: @@ -12014,50 +12037,48 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v34, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v9.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v17.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v15.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v33.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3 @@ -12072,228 +12093,243 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, 
v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v24.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v14.h -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v0.h, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v19.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v1.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v2.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v3.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v4.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v5.l, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v13.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v6.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v12.l +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v7.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v8.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v25 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v9.l, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v25 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v23.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v18.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v22.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v24.h, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v19.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v15.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v19.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v20.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v15.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v16.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v21.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v18.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.l, v19.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v17.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v19.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.h, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v18.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v14.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v15.l, v4.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v12.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v13.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v14.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v14.l, v6.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v13.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v12.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v27 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v11.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v11.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v11.l, v7.h ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v17, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v27 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16322,20 +16358,20 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -16351,17 +16387,17 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow @@ -16385,17 +16421,17 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_4: ; %end @@ -16403,100 +16439,105 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v13.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v11.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v4, v14, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 
0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8: @@ -22438,20 +22479,20 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -22467,17 +22508,17 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB60_2: ; %Flow @@ -22501,17 +22542,17 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB60_4: ; %end @@ -22519,100 +22560,105 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8: @@ -28813,50 +28859,50 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3 @@ -28871,228 +28917,243 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v34.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v30.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v0.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 
v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v26.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v1.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v2.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v3.l, v23.h +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v21.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.l, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v19.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v6.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v7.l, v17.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v12.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v8.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v9.l, v16.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v10 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 
v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v26.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v23.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v22.h, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v28.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v23.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v25.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v21.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, 
v24.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v25.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v29.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v26.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v25.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, 
v23.h, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v21.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v22.h, v4.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v19.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v18.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v11 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; 
GFX11-TRUE16-NEXT: v_or_b16 v5.l, v16.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v16.h, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v17.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v17.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v17.l, v7.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, 
v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v11 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -30847,20 +30908,20 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -30876,17 +30937,17 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB74_2: ; %Flow @@ -30905,17 +30966,17 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB74_4: ; %end @@ -30923,100 +30984,105 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; 
GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; 
GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8: @@ -32944,50 +33010,50 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v3.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v34.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3 @@ -33002,228 +33068,243 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v34.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v30.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v0.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v25.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v26.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v5.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v1.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v2.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v3.l, v23.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v4.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v6.l, v4.h, v21.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.l, v19.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v19.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v6.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v7.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr16_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v8.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v9.l, v16.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v10 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v0.l +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v26.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v30.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v34.l, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v23.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v22.h, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v28.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v23.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v25.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v21.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v24.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v25.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v29.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 
v7, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v20.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v26.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v25.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v23.h, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v21.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v22.h, v4.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v18.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v19.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v21.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v20.h, v6.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v30.l, 3 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v19.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v18.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v11 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v16.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v16.h, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v17.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v17.h, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v3, v11 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v11, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v17.l, v7.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v14, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v11 ; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -34993,20 +35074,20 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -35022,17 +35103,17 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB78_2: ; %Flow @@ -35059,17 +35140,17 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB78_4: ; %end @@ -35077,100 +35158,105 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v2.h, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v6.h, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v29, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v30, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v26, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v17, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v19, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[9:10], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index bfed1f4304dd9..48c9b8775a474 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -2257,8 +2257,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -2273,18 +2273,19 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true 
@@ -2294,16 +2295,17 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4504,8 +4506,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -4520,18 +4522,19 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 
%b) { ; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true @@ -4541,16 +4544,17 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6463,8 +6467,8 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -6479,18 +6483,19 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true @@ -6500,16 +6505,17 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8110,8 +8116,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -8126,18 +8132,19 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true @@ -8147,16 +8154,17 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9471,8 +9479,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> 
%a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -9487,18 +9495,19 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz 
.LBB78_2 ; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true @@ -9508,16 +9517,17 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10183,8 +10193,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ 
-10199,18 +10209,19 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v0.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true @@ -10220,16 +10231,17 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll index 45e205b3ca556..68312b89142c7 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll @@ -145,37 +145,36 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7fc0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11-TRUE16-NEXT: .LBB0_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -797,40 +796,40 @@ define <3 x i16> @bitcast_v3bf16_to_v3i16(<3 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB4_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; 
GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, 0x7fc0, 16, v2 ; GFX11-TRUE16-NEXT: .LBB4_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index d52451418e49a..5aac06a7f3a2b 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -8768,32 +8768,32 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -8812,26 +8812,26 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow @@ -8864,26 +8864,26 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB24_4: ; %end @@ -8891,151 +8891,156 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, 
v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v9, v21, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, 
v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -12465,13 +12470,13 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 @@ -12487,81 +12492,83 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v37, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v25.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v24.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, 
v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -12574,366 +12581,384 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v2.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v51.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v52.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v2.h, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, 
v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v26.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v20.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v6, v9, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v52.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v29.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v27.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v30.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v48.l, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v48.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v23.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v24.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v26.h, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v36.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v21.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v22.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v15, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v21.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v22.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 +; GFX11-TRUE16-NEXT: 
v_or_b16 v11.l, v20.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v16.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v16.h, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v17.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v17.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v14, v14, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23563,32 +23588,32 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -23607,26 +23632,26 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow @@ -23651,26 +23676,26 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_4: ; %end @@ -23678,151 +23703,156 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, 
v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, 
v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 
v54, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l 
+; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, 
v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -27383,13 +27413,13 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 @@ -27405,81 +27435,83 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v37, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v51.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ 
-27492,366 +27524,384 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v51.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v52.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v2.h, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, 
v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v26.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v12.l, v12.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v20.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 
v7.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 
v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v52.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v29.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 
v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v27.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v30.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v48.l, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, 
v48.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h 
-; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v23.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v24.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v26.h, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v36.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v21.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v22.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v21.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v22.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 
3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 
0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v55.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v16.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v16.h, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v17.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v17.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -37866,32 +37916,32 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -37910,26 +37960,26 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow @@ -37967,26 +38017,26 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB68_4: ; %end @@ -37994,151 +38044,156 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; 
GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, 
v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -41573,13 +41628,13 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 @@ -41595,81 +41650,83 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v34, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v20.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -41682,366 +41739,384 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v51.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v52.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v2.h, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: 
v_and_b16 v1.h, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 
v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v26.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v20.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18 +; 
GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, 
v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 
v0.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v52.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, 
v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v29.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v27.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v30.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, 
v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v48.l, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v48.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v23.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v24.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v26.h, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v36.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v21.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v22.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v21.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v22.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 
v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v16.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v16.h, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v17.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v17.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v14.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -51220,32 +51295,32 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -51264,26 +51339,26 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow @@ -51308,26 +51383,26 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, 
v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB84_4: ; %end @@ -51335,151 +51410,156 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; 
GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 
16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, 
v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -54909,13 +54989,13 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 ; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 @@ -54931,81 +55011,83 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, 
v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v18.h, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -55018,366 +55100,384 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v52.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v51.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v52.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 
v4.l, v2.h, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v6, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v39.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v2.h, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, 
v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v25.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v26.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v6.h, v21.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.h, v22.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v20.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v11.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, 
v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr54_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v50.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v49.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v54.h, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v52.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v51.h, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v52.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v51.l, v0.h -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v50.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v29.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v28.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v27.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v30.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v39.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v39.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v48.l, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v48.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v23.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v9 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v24.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v5, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v26.h, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v36.h, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v8, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v21.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v22.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v21.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v22.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v11, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l ; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v11, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v18.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v19.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v20.h, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v20.l, v12.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v33.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v31.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v16.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v16.h, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v17.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v17.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v22 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v23, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v14, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v15, v18 +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -64473,32 +64573,32 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -64517,26 +64617,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow @@ -64569,26 +64669,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB96_4: ; %end @@ -64596,151 +64696,156 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, 
v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 
v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 
0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, 
v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -76596,32 +76701,32 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -76640,26 +76745,26 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow @@ -76692,26 +76797,26 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB104_4: ; %end @@ -76719,151 +76824,156 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.h, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; 
GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v2.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v19.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.h, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v7.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v7 
-; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v53, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v8.h, v20.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 
0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v34, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v21, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v34 -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v12.h, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, 
v16.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v49, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v51, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v35, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v21, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v22, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v19, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -85582,58 +85692,58 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr39_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -85648,297 +85758,304 @@ define <64 x i8> 
@bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v3.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v6.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v8.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v10.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v13.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.h ; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 -; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v17, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: 
v_add3_u32 v21, v21, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v20, v22, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v26.h -; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v21, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v21, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v27 +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v20, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v4, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v102, 24, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v28.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v17, v22, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v18, v22, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v17, v23, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v30.h +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v29, v17, v23 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 ; 
GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v18, v19 :: v_dual_add_f32 v18, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v18, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v32.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v23 :: v_dual_add_f32 v18, 0x40c00000, v22 ; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v29 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v19, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 
8, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v17, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v19, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v19, v22 :: v_dual_and_b32 v20, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_lshlrev_b32 v5, 16, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v8, 0x40c00000, v8 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v33.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v8, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v35.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v6, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v17, v8, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v6, v17 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v33 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v34.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v19, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v36, v5, v22 :: v_dual_and_b32 v23, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v6, v17, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v31 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v5, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v21, v20 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 
0xffff, v8, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v35.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v19, v21, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v19, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | 
instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v10, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v10 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v7, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v7, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v48, v19, v22 :: v_dual_lshlrev_b32 v7, 16, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v38.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_add_f32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v48 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v12, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v12, 0x40c00000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v48 +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v12, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v22, v22, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v12, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v39 :: v_dual_cndmask_b32 v52, v22, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10 +; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v12, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v22, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v54.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v19, v25, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 
0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v24, v50 :: v_dual_add_f32 v9, 0x40c00000, v23 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v24, v50, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v52 +; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10 -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v53 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11 ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v19, v21, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v65.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 24, v12 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v7, 
0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v23 :: v_dual_cndmask_b32 v70, v21, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v21, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v24, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v21, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v66, v21, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v14, v25, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v23, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v68.h ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v19 :: v_dual_add_f32 v14, 0x40c00000, v21 @@ -85950,42 
+86067,42 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v14, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v13, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v13, v25, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v24, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v15, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v23, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v23, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v85.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v86.h ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v37, v15, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v70.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v67 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v19, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v71.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v66 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v19, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v81.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v84.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v82.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v85.h ; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v23, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v82 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14 +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v81 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9 ; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13 ; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v21, v7 ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v18, v17 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] @@ -85994,160 +86111,159 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v112.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v24.l ; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v2.h -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v100.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v6.h, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v31.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v87.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v86.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v16, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v71.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v87.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v65.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v83.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v80.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v24 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v71.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v70.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v12.l 
+; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v54.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v23, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v16, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; GFX11-TRUE16-NEXT: 
v_or_b16 v24.h, v13.l, v14.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.h, v17.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v23 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v81.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v51.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v50.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v18.l, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v66.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v21, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v22, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v23, v24 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v25, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, 
v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index a40ee1698b8e0..6fe66655de3d6 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -2160,46 +2160,47 @@ define i64 @bitcast_v4bf16_to_i64(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: .LBB22_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -3064,12 +3065,13 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 
8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -3083,66 +3085,61 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5307,46 +5304,47 @@ define double @bitcast_v4bf16_to_f64(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB46_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; 
GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: .LBB46_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -6216,12 +6214,13 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -6235,66 +6234,61 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8166,46 +8160,47 @@ define <2 x i32> @bitcast_v4bf16_to_v2i32(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB66_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: .LBB66_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -9068,12 +9063,13 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -9087,66 +9083,61 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10698,46 +10689,47 @@ define <2 x float> @bitcast_v4bf16_to_v2f32(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: 
v_bfe_u32 v9, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v7, v10 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_add3_u32 
v8, v8, v0, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v4 ; GFX11-TRUE16-NEXT: .LBB82_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -11611,12 +11603,13 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -11630,66 +11623,61 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, 
v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12867,49 +12855,49 @@ define <4 x i16> @bitcast_v4bf16_to_v4i16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, 
v1, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo -; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v0, 16, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v3, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v2, 16, v3 ; GFX11-TRUE16-NEXT: .LBB94_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -13841,12 +13829,13 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -13860,66 +13849,61 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14696,46 +14680,46 @@ define <4 x half> @bitcast_v4bf16_to_v4f16(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB102_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v6, v7 :: v_dual_add_f32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v2, 0x7fff +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v11 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 
v0, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h ; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v3, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v3, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0 ; GFX11-TRUE16-NEXT: .LBB102_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -15671,12 +15655,13 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v2.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -15690,66 +15675,61 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 
v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16347,42 +16327,41 @@ define <8 x i8> @bitcast_v4bf16_to_v8i8(<4 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v8.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v0 :: v_dual_lshlrev_b32 v0, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_add_f32 v1, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 
1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v12, v4, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v10, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v0 :: v_dual_add_f32 v0, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v9.l +; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v4, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v9, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v6, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v12, v8, vcc_lo +; GFX11-TRUE16-NEXT: 
v_add3_u32 v8, v8, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v11, v5, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v3, v2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v1, v6 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v2, v1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v3, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[10:11], 24, v[8:9] @@ -16987,12 +16966,13 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -17006,66 +16986,61 @@ define <4 x bfloat> 
@bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index 5163539046bb0..e5245f7bd71d3 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1102,15 +1102,15 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -1126,76 +1126,79 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v7.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v0.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v3.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2084,62 +2087,57 @@ define <3 x i32> @bitcast_v6bf16_to_v3i32(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB10_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 
0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v0, v1, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_cndmask_b32 v5, v7, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v11, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v6, v8 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v10, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v3, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v6 :: v_dual_cndmask_b32 v0, v0, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v10, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v3, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v6, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v4, v5 ; GFX11-TRUE16-NEXT: .LBB10_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -4243,15 +4241,15 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -4267,76 +4265,79 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v0.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v3.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 
| instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 
v3, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5229,62 +5230,57 @@ define <3 x float> @bitcast_v6bf16_to_v3f32(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_lshlrev_b32 v3, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v2, 0x40c00000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v0, v1, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_cndmask_b32 v5, v7, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v11, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v6, v8 :: v_dual_and_b32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v10, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v3.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 -; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v3, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v12, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v6 :: v_dual_cndmask_b32 v0, v0, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v11, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v10, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v10, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; 
GFX11-TRUE16-NEXT: v_add3_u32 v6, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v5 +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v3, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v6, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v4, v5 ; GFX11-TRUE16-NEXT: .LBB26_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -6889,16 +6885,16 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -6914,77 +6910,79 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 
v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -7783,67 +7781,68 @@ define <12 x i8> @bitcast_v6bf16_to_v12i8(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 
0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v13.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_add_f32 v3, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v11, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v10.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v11, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v11, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v10, v4, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v4, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v10, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v0, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h +; GFX11-TRUE16-NEXT: v_add3_u32 v13, v0, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v8, v13, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v12, v7, 0x7fff -; 
GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v13, v14, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v10, v10, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v5, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v12, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v7, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v2, v1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v11, 0x7fc07fc0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v8.h ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[12:13] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v7, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v5, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v13 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[16:17], 24, v[10:11] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v10 ; GFX11-TRUE16-NEXT: .LBB38_4: ; %end @@ -8652,16 +8651,16 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -8677,77 +8676,79 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, 
v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10064,16 +10065,16 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -10089,77 +10090,79 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v6.h -; GFX11-TRUE16-NEXT: 
v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v6.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, 
v6.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11446,59 +11449,61 @@ define <6 x half> @bitcast_v6bf16_to_v6f16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB48_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v6, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; 
GFX11-TRUE16-NEXT: v_bfe_u32 v8, v6, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v4, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v0, 0x40c00000, v0 :: v_dual_cndmask_b32 v1, v8, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v10, v4, 0x7fff -; GFX11-TRUE16-NEXT: 
v_add3_u32 v6, v6, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v3 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, v8, v12 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v10, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v7, v12 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v7, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v11, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v11, v5, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v10, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v6, v0 +; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v7, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v4, v5 ; GFX11-TRUE16-NEXT: .LBB48_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -12390,64 +12395,66 @@ define <6 x i16> @bitcast_v6bf16_to_v6i16(<6 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB52_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_add_f32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_lshlrev_b32 v4, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v2, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v9, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v9, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, v6, v8 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v8, v11, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v11, v12, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v9, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 
0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v11, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v9, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v8, v9 :: v_dual_and_b32 v0, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v11, v12, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v10, v9, 0x7fff +; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v11, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v12, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v7, v10, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v11, v12, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v9, v10 :: v_dual_add_f32 v0, 0x40c00000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc_lo ; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x400000, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v6, v13, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v7, v13, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.h ; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v0, 16, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v4 -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v5, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v1, v1, 16, v3 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v5, 16, v4 ; GFX11-TRUE16-NEXT: .LBB52_2: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll new file mode 100644 index 0000000000000..535f05bc01b42 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-bf16-gfx11plus.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 
UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck --check-prefixes=GFX11-FAKE16 %s + +@global_smem = external local_unnamed_addr addrspace(1) global [0 x i8], align 16 + +define amdgpu_kernel void @v_atomicrmw_fadd_bf16(ptr addrspace(1) %out, i1 %in, ptr addrspace(1) %ptr) #0 { +; GFX11-TRUE16-LABEL: v_atomicrmw_fadd_bf16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1] offset:4 +; GFX11-TRUE16-NEXT: s_and_b32 s0, s2, -4 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s3 +; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s2, 3 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: s_lshl_b32 s4, 0xffff, s2 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-TRUE16-NEXT: s_not_b32 s3, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, 0 +; GFX11-TRUE16-NEXT: .p2align 6 +; GFX11-TRUE16-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v0 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v0, v4, v[0:1], s[0:1] glc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_gl1_inv +; GFX11-TRUE16-NEXT: buffer_gl0_inv +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-TRUE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_atomicrmw_fadd_bf16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[0:1] offset:4 +; GFX11-FAKE16-NEXT: s_and_b32 s0, s2, -4 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, s3 +; GFX11-FAKE16-NEXT: s_and_b32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX11-FAKE16-NEXT: s_lshl_b32 s2, s2, 3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_lshl_b32 s4, 0xffff, s2 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-FAKE16-NEXT: s_not_b32 s3, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 
s4, 0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-FAKE16-NEXT: .p2align 6 +; GFX11-FAKE16-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_bfe_u32 v4, v0, 16, 1 +; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_and_or_b32 v0, v1, s3, v0 +; GFX11-FAKE16-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_gl1_inv +; GFX11-FAKE16-NEXT: buffer_gl0_inv +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-FAKE16-NEXT: s_or_b32 s4, vcc_lo, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; GFX11-FAKE16-NEXT: s_cbranch_execnz .LBB0_1 +; GFX11-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX11-FAKE16-NEXT: s_endpgm + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %in.gep = getelementptr <{ [0 x i8] }>, ptr 
addrspace(1) %ptr, i64 0, i32 0, i32 %tid + %load = load <4 x bfloat>, ptr addrspace(1) %in.gep + %extract1 = extractelement <4 x bfloat> %load, i64 3 + %fadd = atomicrmw fadd ptr addrspace(1) %out, bfloat %extract1 syncscope("agent") acq_rel + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 2bdf994496421..cd6d741beeab3 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -9082,17 +9082,19 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_fadd_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13318,9 +13320,10 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) { ; GFX11TRUE16-LABEL: v_fadd_bf16_fpimm_0: ; 
GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 1.0, v0 +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 1.0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -13413,9 +13416,10 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) { ; GFX11TRUE16-LABEL: v_fadd_bf16_fpimm_1: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 0x42280000, v0 +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, 0x42280000, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -13515,17 +13519,19 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_fsub_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v1, v2 +; 
GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -14275,17 +14281,19 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_fmul_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -18568,32 +18576,34 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_fdiv_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_div_scale_f32 v2, null, v1, v1, v0 -; GFX11TRUE16-NEXT: v_rcp_f32_e32 v3, v2 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11TRUE16-NEXT: v_div_scale_f32 v1, null, v0, v0, v2 +; GFX11TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, v2, v0, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_rcp_f32_e32 v3, v1 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11TRUE16-NEXT: v_fma_f32 v4, -v2, v3, 1.0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_fma_f32 v4, -v1, v3, 1.0 ; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v4, v3 -; GFX11TRUE16-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mul_f32_e32 v4, v5, v3 +; GFX11TRUE16-NEXT: v_fma_f32 v6, -v1, v4, v5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_fma_f32 v6, -v2, v4, v5 ; GFX11TRUE16-NEXT: v_fmac_f32_e32 v4, v6, v3 +; GFX11TRUE16-NEXT: v_fma_f32 v1, -v1, v4, v5 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_fma_f32 
v2, -v2, v4, v5 -; GFX11TRUE16-NEXT: v_div_fmas_f32 v2, v2, v3, v4 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_div_fixup_f32 v0, v2, v1, v0 +; GFX11TRUE16-NEXT: v_div_fmas_f32 v1, v1, v3, v4 +; GFX11TRUE16-NEXT: v_div_fixup_f32 v0, v1, v0, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19018,17 +19028,19 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_minnum_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_min_f32_e32 v0, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; 
GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23270,17 +23282,19 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) { ; GFX11TRUE16-LABEL: v_maxnum_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -27591,11 +27605,12 @@ define bfloat @v_sqrt_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_sqrt_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0 -; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xf800000, v1 +; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x4f800000, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_sqrt_f32_e32 v1, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff @@ -27730,9 +27745,10 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) { ; GFX11TRUE16-LABEL: v_ldexp_bf16_i32: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v2, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -27836,17 +27852,18 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) { ; GFX11TRUE16-LABEL: v_frexp_bf16_i16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; 
GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11TRUE16-NEXT: v_frexp_mant_f32_e32 v0, v1 +; GFX11TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11TRUE16-NEXT: v_frexp_exp_i32_f32_e32 v1, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -28019,11 +28036,12 @@ define bfloat @v_log_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_log_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo -; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff @@ -28177,13 +28195,14 @@ define bfloat @v_log2_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_log2_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff ; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 @@ -28367,11 +28386,12 @@ define bfloat @v_log10_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_log10_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo -; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_log_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff @@ -28580,25 +28600,26 @@ define bfloat @v_exp_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_exp_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0 -; GFX11TRUE16-NEXT: v_rndne_f32_e32 v2, v1 -; GFX11TRUE16-NEXT: v_fma_f32 v3, 0x3fb8aa3b, v0, -v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v1 +; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v1 +; GFX11TRUE16-NEXT: v_fma_f32 v2, 0x3fb8aa3b, v1, -v0 +; GFX11TRUE16-NEXT: v_rndne_f32_e32 v3, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX11TRUE16-NEXT: v_fmamk_f32 v3, v0, 0x32a5705f, v3 -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX11TRUE16-NEXT: v_fmamk_f32 v2, v1, 0x32a5705f, v2 +; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX11TRUE16-NEXT: v_exp_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; 
GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v0, vcc_lo ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -28744,13 +28765,14 @@ define bfloat @v_exp2_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_exp2_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_exp_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff ; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 @@ -28937,25 +28959,26 @@ define bfloat @v_exp10_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_exp10_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0 -; GFX11TRUE16-NEXT: v_rndne_f32_e32 v2, v1 -; GFX11TRUE16-NEXT: 
v_fma_f32 v3, 0x40549a78, v0, -v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, 0x40549a78, v1 +; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v1 +; GFX11TRUE16-NEXT: v_fma_f32 v2, 0x40549a78, v1, -v0 +; GFX11TRUE16-NEXT: v_rndne_f32_e32 v3, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_sub_f32_e32 v1, v1, v2 -; GFX11TRUE16-NEXT: v_fmamk_f32 v3, v0, 0x33979a37, v3 -; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2 -; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add_f32_e32 v1, v1, v3 -; GFX11TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX11TRUE16-NEXT: v_fmamk_f32 v2, v1, 0x33979a37, v2 +; GFX11TRUE16-NEXT: v_sub_f32_e32 v0, v0, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v3 +; GFX11TRUE16-NEXT: v_exp_f32_e32 v0, v0 ; GFX11TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX11TRUE16-NEXT: v_ldexp_f32 v0, v0, v2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo -; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0 -; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo +; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v1 +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v0, vcc_lo ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: 
v_or_b32_e32 v2, 0x400000, v0 @@ -29066,9 +29089,10 @@ define bfloat @v_ceil_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_ceil_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_ceil_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_ceil_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29163,9 +29187,10 @@ define bfloat @v_trunc_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_trunc_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29260,9 +29285,10 @@ define bfloat @v_rint_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_rint_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29357,9 +29383,10 @@ define 
bfloat @v_nearbyint_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_nearbyint_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29483,16 +29510,17 @@ define bfloat @v_round_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_round_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_trunc_f32_e32 v1, v0 -; GFX11TRUE16-NEXT: v_sub_f32_e32 v2, v0, v1 +; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v1 +; GFX11TRUE16-NEXT: v_sub_f32_e32 v2, v1, v0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5 ; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0 -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX11TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v2, v1 +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -29594,9 +29622,10 @@ define bfloat @v_roundeven_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: 
v_roundeven_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_rndne_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29691,9 +29720,10 @@ define bfloat @v_floor_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_floor_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_floor_f32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_floor_f32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29786,9 +29816,10 @@ define bfloat @v_canonicalize_bf16(bfloat %a) { ; GFX11TRUE16-LABEL: v_canonicalize_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX11TRUE16-NEXT: v_max_f32_e32 v0, v1, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -29916,15 +29947,27 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, 
vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_oeq_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_oeq_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_oeq_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp oeq bfloat %a, %b ret i1 %op } @@ -29979,15 +30022,27 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ogt_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; 
GFX11TRUE16-LABEL: v_fcmp_ogt_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ogt_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ogt bfloat %a, %b ret i1 %op } @@ -30042,15 +30097,27 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_oge_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_oge_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_ge_f32_e32 
vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_oge_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp oge bfloat %a, %b ret i1 %op } @@ -30105,15 +30172,27 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_olt_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_olt_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_olt_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_lt_f32_e32 
vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp olt bfloat %a, %b ret i1 %op } @@ -30168,15 +30247,27 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ole_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ole_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ole_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ole bfloat %a, %b ret i1 %op } @@ -30231,15 +30322,27 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_one_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_one_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_lg_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_one_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_lg_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp one bfloat %a, %b ret i1 %op } @@ -30294,15 +30397,27 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_uno_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_uno_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; 
GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_uno_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp uno bfloat %a, %b ret i1 %op } @@ -30357,15 +30472,27 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ueq_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ueq_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ueq_bf16: +; GFX11FAKE16: ; 
%bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_nlg_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ueq bfloat %a, %b ret i1 %op } @@ -30420,15 +30547,27 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ugt_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ugt_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_nle_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ugt_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_nle_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ugt bfloat %a, %b ret i1 %op } @@ -30483,15 
+30622,27 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_uge_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_uge_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_uge_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp uge bfloat %a, %b ret i1 %op } @@ -30546,15 +30697,27 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ult_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, 
v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ult_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_nge_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ult_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ult bfloat %a, %b ret i1 %op } @@ -30609,15 +30772,27 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_ule_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_ule_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_ule_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp ule bfloat %a, %b ret i1 %op } @@ -30672,15 +30847,27 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) { ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fcmp_une_bf16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fcmp_une_bf16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11TRUE16-NEXT: v_cmp_neq_f32_e32 vcc_lo, v1, v2 +; GFX11TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fcmp_une_bf16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cmp_neq_f32_e32 vcc_lo, v0, v1 +; GFX11FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fcmp une bfloat %a, %b ret i1 %op } @@ -30763,13 +30950,22 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) { ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fptosi_bf16_to_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fptosi_bf16_to_i16: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v1 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fptosi_bf16_to_i16: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fptosi bfloat %x to i16 ret i16 %op } @@ -31144,13 +31340,22 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) { ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fptosi_bf16_to_i32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fptosi_bf16_to_i32: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: 
v_mov_b16_e32 v1.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v1 +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fptosi_bf16_to_i32: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fptosi bfloat %x to i32 ret i32 %op } @@ -31494,27 +31699,50 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) { ; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fptosi_bf16_to_i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f32_e32 v0, v0 -; GFX11-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_floor_f32_e32 v1, v1 -; GFX11-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0| -; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v2 -; GFX11-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_xor_b32_e32 v0, v0, v3 -; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11TRUE16-LABEL: v_fptosi_bf16_to_i64: +; GFX11TRUE16: ; %bb.0: +; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; 
GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_trunc_f32_e32 v0, v1 +; GFX11TRUE16-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| +; GFX11TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_floor_f32_e32 v1, v1 +; GFX11TRUE16-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0| +; GFX11TRUE16-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v2 +; GFX11TRUE16-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_xor_b32_e32 v0, v0, v3 +; GFX11TRUE16-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11FAKE16-LABEL: v_fptosi_bf16_to_i64: +; GFX11FAKE16: ; %bb.0: +; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_trunc_f32_e32 v0, v0 +; GFX11FAKE16-NEXT: v_mul_f32_e64 v1, 0x2f800000, |v0| +; GFX11FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v0 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_floor_f32_e32 v1, v1 +; GFX11FAKE16-NEXT: v_fma_f32 v2, 0xcf800000, v1, |v0| +; GFX11FAKE16-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v2 +; GFX11FAKE16-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX11FAKE16-NEXT: v_xor_b32_e32 v0, v0, v3 +; GFX11FAKE16-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v3 +; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11FAKE16-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31] %op = fptosi bfloat %x to i64 ret i64 %op } @@ -42575,18 +42803,21 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11TRUE16-LABEL: v_fma_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11TRUE16-NEXT: v_fmac_f32_e32 v3, v1, v2 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: v_bfe_u32 v0, v3, 16, 1 +; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v3 +; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v3, 0x7fff ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_fmac_f32_e32 v2, v0, v1 -; GFX11TRUE16-NEXT: v_bfe_u32 v0, v2, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v1, 0x400000, v2 -; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_add3_u32 v0, v0, v2, 0x7fff ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ 
-43457,26 +43688,30 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) { ; GFX11TRUE16-LABEL: v_fmuladd_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11TRUE16-NEXT: v_mul_f32_e32 v0, v1, v3 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX11TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11TRUE16-NEXT: v_dual_cndmask_b32 v0, v1, v3 :: v_dual_lshlrev_b32 v1, 16, v2 ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index 348862d4d8ced..f4b432dce8c8a 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -5100,55 +5100,56 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-TRUE16-NEXT: 
buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5257,53 +5258,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, 
v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB16_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB16_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -5619,48 +5621,49 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: 
s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 
v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -5773,46 +5776,47 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, 
s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB17_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, v0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -6124,15 +6128,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v4, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-TRUE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -6146,39 +6150,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_add_f32_e32 v6, v6, v8 +; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6193,14 +6196,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -6208,7 +6211,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB18_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6384,16 +6387,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-TRUE16-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -6405,39 +6408,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB18_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB18_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, v4, v10 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, v6, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -6451,14 +6453,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB18_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -6468,7 +6470,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index ab867b089b875..6f1675edbe58a 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -4228,55 +4228,56 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; 
GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4385,53 +4386,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine ; 
GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4749,48 +4751,49 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v1, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-TRUE16-NEXT: 
v_mov_b32_e32 v1, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -4903,46 +4906,47 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v4.h, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v0, v0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 
v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -5256,15 +5260,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -5278,39 +5282,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v4, v4, v10 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v6, v6, v8 +; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5325,14 +5328,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5340,7 +5343,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5516,16 +5519,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -5537,39 +5540,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v4, v4, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_max_f32_e32 v6, v6, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5583,14 +5585,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -5600,7 +5602,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; 
GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index 1a25904dd553f..acb27be1846b9 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -4228,55 +4228,56 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v5 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen 
th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4385,53 +4386,54 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB13_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v2 -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v3 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB13_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v3 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory: @@ -4749,48 +4751,49 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: s_addk_co_i32 s16, 0x200 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, -4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX12-TRUE16-NEXT: s_and_b32 s4, s16, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_lshl_b32 s5, 0xffff, s4 -; GFX12-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], null 
offen ; GFX12-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX12-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX12-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v0, v0, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX12-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v1, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h ; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX12-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -4903,46 +4906,47 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_addk_i32 s16, 0x200 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, -4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s4 ; GFX11-TRUE16-NEXT: s_and_b32 s4, s16, 3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s4, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: 
s_lshl_b32 s5, 0xffff, s4 -; GFX11-TRUE16-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB14_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s4, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v1, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v0, v0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, v1.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s4, v4 -; GFX11-TRUE16-NEXT: v_and_or_b32 v0, v1, s6, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v4 +; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0 -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v4 ; GFX11-TRUE16-NEXT: s_or_b32 s5, vcc_lo, s5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5 @@ -5256,15 +5260,15 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, exec_lo ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX12-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX12-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -5278,39 +5282,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen +; GFX12-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen ; GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX12-TRUE16-NEXT: ; %bb.2: ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX12-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX12-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v4, v4, v10 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff +; 
GFX12-TRUE16-NEXT: v_min_num_f32_e32 v6, v6, v8 +; GFX12-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 -; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX12-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5325,14 +5328,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN +; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN ; 
GFX12-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX12-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -5340,7 +5343,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX12-TRUE16-NEXT: s_cbranch_execnz .LBB15_3 ; GFX12-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX12-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -5516,16 +5519,16 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 0x200, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0x200, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 3, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, -4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, -4, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e64 v6, v7, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v9, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v11, v7 ; GFX11-TRUE16-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s5, v1 @@ -5537,39 +5540,38 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 -; GFX11-TRUE16-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen +; GFX11-TRUE16-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen ; GFX11-TRUE16-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_1 ; GFX11-TRUE16-NEXT: ; %bb.2: ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v5 ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB15_3: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v5.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v4, v4, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, 0x400000, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; 
GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v4, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.h +; GFX11-TRUE16-NEXT: v_min_f32_e32 v6, v6, v8 +; GFX11-TRUE16-NEXT: v_bfe_u32 v9, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v9, v9, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v9, v12, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v7, v5 -; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v9, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v5 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, v6 +; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v0 @@ -5583,14 +5585,14 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc +; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc ; GFX11-TRUE16-NEXT: s_xor_b32 
exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB15_4 ; GFX11-TRUE16-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v7, v8 ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv ; GFX11-TRUE16-NEXT: s_or_b32 s1, vcc_lo, s1 @@ -5600,7 +5602,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd ; GFX11-TRUE16-NEXT: ; %bb.6: ; %atomicrmw.end ; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v7, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v4, v8 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory: diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index a1aef8ddf6bba..da4914016151d 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -2748,100 +2748,101 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, v10.l, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 1, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, v8.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 3, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, v6.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 2, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 1, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v8.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, v4.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 3, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 2, v4.h -; GFX11-TRUE16-NEXT: 
v_and_b16 v5.h, v6.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, v6.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 1, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v4.l, 1 ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v2.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 3 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 1 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, v28.l, 1 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, v26.l, 1 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, v3.l, 15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 1, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 2, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 2, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 1, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 3, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, v24.l, 1 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 1, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v5.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, v22.l, 1 -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, v20.l, 1 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, v18.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 1, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, v16.l, 1 ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, v14.l, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 1, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, v12.l, 1 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v30.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 3, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 2, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v5.h 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 3, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 2, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 3, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 2, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 3, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 2, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v8.l, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 3, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 2, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 2, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, v4.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, v5.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, v7.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v12.h, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, v26.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 1, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 1 +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, v22.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 1, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, v20.l, 1 +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, v18.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 3, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 2, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v12.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, v5.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v7.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 
v2.h, v3.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v6.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, v30.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 1, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, v28.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 3, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 2, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 3, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 2, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v20.l, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 3, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 2, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.h, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, v8.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 3, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 2, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v28.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v23.l, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.h, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v15.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, v1.l, 15 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, v3.l, 15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 4, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, v4.l, 15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 12, v8.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v30.h, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v24.l, v22.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v14.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 12, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v20.h, 15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 4, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v1.h, 15 +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 745e047348626..86e890b06989a 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -1771,33 +1771,35 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, 9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff00, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.h ; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x900, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x900, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.h ; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x900, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 ; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index 738bad7ad1809..f26b72027a784 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -2811,20 +2811,20 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3f80 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v1, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4000, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2852,20 +2852,20 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3f80 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4000, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2945,20 +2945,20 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3f80 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x3f00, vcc_lo -; 
GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x3f00, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -2986,20 +2986,20 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x3f80 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x3f00, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x3f00, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3105,34 +3105,34 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX11-SDAG-TRUE16-LABEL: fmul_select_v2bf16_test3: ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0x3f80 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, 0x4000, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, 0x4000, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x3f80 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v1, v3 +; GFX11-SDAG-TRUE16-NEXT: 
v_mov_b16_e32 v3.l, 0 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, 0x4000, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, 0x4000, s0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: fmul_select_v2bf16_test3: @@ -3170,34 +3170,34 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX11-GISEL-TRUE16-LABEL: fmul_select_v2bf16_test3: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0x3f80 -; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v4 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, 0x4000, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, 0x4000, s0 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x3f80 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v1, v3 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; 
GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, 0x4000, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, 0x4000, s0 ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1 -; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-GISEL-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: fmul_select_v2bf16_test3: @@ -3314,34 +3314,34 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX11-SDAG-TRUE16-LABEL: fmul_select_v2bf16_test4: ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0x3f80 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, 0x3f00, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, 0x3f00, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x3f80 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v1, v3 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, 0x3f00, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 
v2.l, 0x3f00, s0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1 -; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; 
GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: fmul_select_v2bf16_test4: @@ -3379,34 +3379,34 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a ; GFX11-GISEL-TRUE16-LABEL: fmul_select_v2bf16_test4: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0x3f80 -; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v2, v4 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, 0x3f00, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, 0x3f00, s0 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x3f80 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, v1, v3 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 +; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.l, 0x3f00, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, 0x3f00, s0 ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; 
GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v3, v1 -; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v2, v4, v3 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v3 +; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX11-GISEL-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 
0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: fmul_select_v2bf16_test4: @@ -3498,20 +3498,20 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4100 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4000, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; 
GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3539,20 +3539,20 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4100 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4000, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4000, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3634,20 +3634,20 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4040 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xc100, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xc100, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3675,20 +3675,20 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4040 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xc100, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xc100, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; 
GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3769,20 +3769,20 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc080 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4100, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4100, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 
s[30:31] ; @@ -3810,20 +3810,20 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc080 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x4100, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x4100, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -3902,12 +3902,13 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; 
GFX11-SDAG-TRUE16-LABEL: fmul_select_bf16_test8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x8000, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0, 0x8000, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -3940,12 +3941,13 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-TRUE16-LABEL: fmul_select_bf16_test8: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, 0, 0x8000, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 
v0.l, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, 0, 0x8000, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v3, v0 ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 @@ -4033,20 +4035,20 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc200 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xc180, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xc180, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: 
v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4074,20 +4076,20 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc200 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xc180, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xc180, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4169,20 +4171,20 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xdb80 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xe000, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xe000, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4210,20 +4212,20 @@ define bfloat 
@fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xdb80 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0xe000, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0xe000, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4305,20 +4307,20 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4c00 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x3480, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x3480, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4346,20 +4348,20 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4c00 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; 
GFX11-GISEL-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, 0x3480, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, 0x3480, vcc_lo +; GFX11-GISEL-TRUE16-NEXT: v_mul_f32_e32 v0, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-GISEL-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-GISEL-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11-GISEL-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll index 45fe2d07226a1..85e56a243cdc9 100644 --- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll @@ -77,11 +77,20 @@ define i32 @divergent_vec_0_i16(i16 %a) { ; GFX906-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX906-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX11-LABEL: divergent_vec_0_i16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: divergent_vec_0_i16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: divergent_vec_0_i16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x i16> poison, i16 0, i32 0 %vec = insertelement <2 x i16> %tmp, i16 %a, i32 1 %val = bitcast <2 x i16> %vec to i32 @@ -160,11 +169,20 @@ define i32 @divergent_vec_i16_0(i16 %a) { ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: divergent_vec_i16_0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: divergent_vec_i16_0: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: divergent_vec_i16_0: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x i16> poison, i16 %a, i32 0 %vec = insertelement <2 x i16> %tmp, i16 0, i32 1 %val = bitcast <2 x 
i16> %vec to i32 @@ -243,11 +261,20 @@ define float @divergent_vec_f16_0(half %a) { ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: divergent_vec_f16_0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: divergent_vec_f16_0: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: divergent_vec_f16_0: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %tmp = insertelement <2 x half> poison, half %a, i32 0 %vec = insertelement <2 x half> %tmp, half 0.0, i32 1 %val = bitcast <2 x half> %vec to float diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll index 6cd439999a554..d8f81db70e309 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll @@ -624,31 +624,30 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v2.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_dual_mul_f32 v0, v0, v3 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, v1, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_mul_f32 v2, v1, v2 :: v_dual_lshlrev_b32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, v1, v3 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo @@ -809,35 +808,35 @@ define amdgpu_kernel void @v_fabs_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-TRUE16-NEXT: s_load_b32 s4, s[4:5], 0x10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[2:3] -; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s4, 16 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_dual_mul_f32 v0, s2, v0 :: v_dual_lshlrev_b32 v1, 16, v1 ; GFX11-TRUE16-NEXT: s_and_b32 s2, s4, 0xffff0000 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_mul_f32_e32 v1, s2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, s2, v1 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l +; GFX11-TRUE16-NEXT: s_lshl_b32 s2, s4, 16 +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v0, s2, v1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 ; GFX11-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm @@ -988,34 +987,36 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2bf16(ptr addrspace(1) %in) #0 { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_dual_mul_f32 v0, 4.0, v0 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 2.0, v1 -; GFX11-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_f32_e32 v2, 4.0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 2.0, v1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v1, v1, v2, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 -; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff -; 
GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo -; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc -; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo ; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v1, off dlc ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: global_store_d16_hi_b16 v[0:1], v0, off dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: v_extract_fabs_fold_v2bf16: diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 8581e4d030261..8c7d5cffe39d9 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -11974,7 +11974,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -11988,20 +11988,22 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12121,7 +12123,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: 
s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -12136,19 +12138,21 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 
v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12425,34 +12429,34 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: 
v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12578,12 +12582,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -12594,19 +12597,21 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12891,34 +12896,34 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: 
v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13045,12 +13050,11 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -13061,19 +13065,21 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13355,45 +13361,45 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -13505,45 +13511,46 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13806,45 +13813,45 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, 
vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -13957,45 +13964,46 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; 
GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14259,27 +14267,28 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -14379,27 +14388,28 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; 
GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -14629,32 +14639,33 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 
v3.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -14744,33 +14755,34 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: 
flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14988,7 +15000,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -15002,18 +15014,20 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 
v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -15130,7 +15144,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -15145,17 +15159,19 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -15424,34 +15440,34 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, 
v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -15579,12 +15595,11 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -15595,19 +15610,21 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) 
| instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -15891,46 +15908,46 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16043,45 +16060,46 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: 
flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 883063b5471ca..56ad91dd59ffb 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -9836,7 +9836,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -9850,20 +9850,22 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) @@ -9983,7 +9985,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -9998,19 +10000,21 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, 
vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10288,34 +10292,34 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10441,12 +10445,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; 
GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10457,19 +10460,21 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10755,34 +10760,34 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10909,12 +10914,11 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10925,19 +10929,21 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11220,7 +11226,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 
v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -11234,18 +11240,20 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -11362,7 +11370,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -11377,17 +11385,19 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -11654,45 +11664,45 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 
; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11804,45 +11814,46 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; 
GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12106,45 +12117,45 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: 
v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -12257,45 +12268,46 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12560,27 +12572,28 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12680,27 +12693,28 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; 
GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12931,32 +12945,33 @@ define void 
@flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -13046,33 +13061,34 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13294,34 +13310,34 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13449,12 +13465,11 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -13465,19 +13480,21 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13762,46 +13779,46 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: 
v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -13914,45 +13931,46 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, 
vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index c603421ca15b4..f0083bd23660a 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ 
b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -9836,7 +9836,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -9850,20 +9850,22 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9983,7 +9985,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -9998,19 +10000,21 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10288,34 +10292,34 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10441,12 +10445,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10457,19 +10460,21 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10755,34 +10760,34 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10909,12 +10914,11 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10925,19 +10929,21 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -11220,7 +11226,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -11234,18 +11240,20 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 
0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -11362,7 +11370,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -11377,17 +11385,19 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; 
GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -11654,45 +11664,45 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11804,45 +11814,46 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12106,45 +12117,45 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; 
GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 
0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -12257,45 +12268,46 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12560,27 +12572,28 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12680,27 +12693,28 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12931,32 +12945,33 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; 
GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -13046,33 +13061,34 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no ; GFX11-TRUE16-LABEL: flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; 
GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13294,34 +13310,34 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; 
GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13449,12 +13465,11 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -13465,19 +13480,21 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13762,46 +13779,46 @@ define 
void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, 
v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -13914,45 +13931,46 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-LABEL: flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index c987effec3be3..3ee0bb2122abe 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -9419,7 +9419,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -9433,20 +9433,22 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9566,7 +9568,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_ret_bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -9581,19 +9583,21 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9870,34 +9874,34 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: 
s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10023,12 +10027,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10039,19 +10042,21 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10336,34 +10341,34 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; 
%atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10490,12 +10495,11 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX11-TRUE16: ; %bb.0: ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10506,19 +10510,21 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10800,7 +10806,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -10814,18 +10820,20 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; 
GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -10942,7 +10950,7 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -10957,17 +10965,19 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -11233,45 +11243,45 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, 
v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11383,45 +11393,46 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat % ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 
0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11684,45 +11695,45 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; 
GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11835,45 +11846,46 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat % ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b_neg: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, 
v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12137,27 +12149,28 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 @@ -12257,27 +12270,28 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr, ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; 
GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -12507,32 +12521,33 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 
v3.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -12622,33 +12637,34 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr, ; GFX11-TRUE16-LABEL: flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v4, 16, v2 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 
glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12869,34 +12885,34 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13024,12 +13040,11 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, 
v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: flat_load_b32 v5, v[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -13040,19 +13055,21 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13336,46 +13353,46 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; 
GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; 
GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -13488,45 +13505,46 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat ; GFX11-TRUE16-LABEL: flat_system_atomic_fsub_noret_bf16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: flat_load_b32 v4, v[0:1] +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 
v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc +; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll index 2f08931f2287e..87843522fe0ab 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll @@ -1872,63 +1872,48 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b, ; GFX11-SDAG-TRUE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0: ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 +; 
GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; 
GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0: @@ -1985,74 +1970,59 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b, ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v3, v3 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v4 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-SDAG-TRUE16-NEXT: 
v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.h ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: 
v_mov_b16_e32 v3.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v1 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.h +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, 
v1.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0: @@ -2505,114 +2475,88 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> % ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v1.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.l, s1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: 
v_mov_b16_e32 v7.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v7 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v6, v8 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; 
GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 
v3.l, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v4.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v4, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v4.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v4.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, 
s0 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.h, s2 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v2.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.h, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v1, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 
s0, v5, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.l, v2.l, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; 
GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v1.h, s2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0: @@ -2710,123 +2654,103 @@ define <2 x bfloat> @v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX12-SDAG-TRUE16-NEXT: 
v_cmp_u_f32_e64 s2, v6, v6 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v1.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.l, s1 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v7 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v6, v8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v3 ; 
GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v4.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v4, v5 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.h ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; 
GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7 -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v4.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v4.h, s1 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.h, s2 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; 
GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v2.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 
0xf1ff -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v2.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.h, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v1, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.l, v2.l, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v1.h, s2 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: v_max3_v2bf16_maximumnum_maximumnum__v_v_v_0: diff --git 
a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll index 0742ac7b425a6..bc85dc2f1e9e1 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll @@ -69,21 +69,16 @@ define bfloat @v_test_fmed3_r_i_i_bf16_minimumnum_maximumnum(bfloat %a) #1 { ; GFX11-SDAG-TRUE16-LABEL: v_test_fmed3_r_i_i_bf16_minimumnum_maximumnum: ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x4000, v0.l, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, 2.0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x4000, v1.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 4.0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4080, v0.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4080, v1.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] %max = call bfloat @llvm.maximumnum.bf16(bfloat %a, bfloat 2.0) %med = call bfloat @llvm.minimumnum.bf16(bfloat %max, bfloat 4.0) @@ -196,35 +191,26 @@ define <2 
x bfloat> @v_test_fmed3_r_i_i_v2bf16_minimumnum_maximumnum(<2 x bfloat ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f32_e64 s0, v2, v2 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x4000, v0.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x4000, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_o_f32_e64 s0, v1, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, 2.0, v2 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x4000, v2.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x4000, v0.l, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, 2.0, v2 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, 2.0, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x4000, v1.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 4.0, v2 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 4.0, v3 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4080, v1.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 4.0, v2 ; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4080, v0.l, s0 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] %max = call <2 x bfloat> @llvm.maximumnum.v2bf16(<2 x bfloat> %a, <2 x bfloat> splat (bfloat 2.0)) diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll index 969c6c3980fc3..7b2d793973d08 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll @@ -1874,63 +1874,48 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b, ; GFX11-SDAG-TRUE16-LABEL: 
v_min3_bf16_minimumnum_minimumnum__v_v_v_0: ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: 
v_mov_b16_e32 v0.h, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0: @@ -1987,74 +1972,59 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b, ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v4 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v1.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v1.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | 
instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v1, v1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; 
GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v0.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v1 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v3.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0: @@ -2510,114 +2480,88 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> % ; GFX11-SDAG-TRUE16: ; %bb.0: ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 
-; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v1.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.l, s1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v7 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v6, v8 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; 
GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v4.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0 +; 
GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v4, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v4.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v4.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.h, s2 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v2.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.h, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v3 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v1, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.l, v2.l, s0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v1 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v5.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h +; 
GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo -; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v1.h, s2 
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0: @@ -2715,123 +2659,103 @@ define <2 x bfloat> @v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0(<2 x bfloat> % ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v1.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.h, v1.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.l, s1 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v7 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v6, v8 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.l, v3.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v5.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v6.l, v1.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 
0x8000, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v4.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v4, v5 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.h ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v8 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v4.l, s1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v7 -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v4.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v4.h, s1 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; 
GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v6, v6 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v4, v4 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.h, s2 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, vcc_lo -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v1.l, s1 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v2.l +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v1.l, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v2.h, s0 +; 
GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.h, v2.h, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h +; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v3 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v6 -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.h, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v1, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v5.h, v0.l, v2.l, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v1 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v5.h, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v5.h, s1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v5.h, v0.l +; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.h, s0 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v1.h, s2 +; GFX12-SDAG-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo ; GFX12-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 +; GFX12-SDAG-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: v_min3_v2bf16_minimumnum_minimumnum__v_v_v_0: diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index a901d7f97eb37..f8ff8efbb1ef1 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -1109,18 +1109,19 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 
v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1191,19 +1192,20 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l ; GFX11-TRUE16-NEXT: buffer_store_b8 v4, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1289,23 +1291,24 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v0.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v0.h, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], 
off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: void_func_v8i8: @@ -1419,46 +1422,47 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v16i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v8.h, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v8.h, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.h, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v10.l, v8.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v6 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v4, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v12 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[6:9], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: void_func_v16i8: @@ -1654,85 +1658,84 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v28.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v6.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v5.h, v4.h 
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v7.h, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v8.h, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v10.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v11.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v13.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v14.l, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v15.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v11.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v8.h, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v8 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, 
v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v9.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v9.l, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v10.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v5.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v1, v0 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v17, v18 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v12, v19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v11, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32 ; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 ; 
GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index dd7f18357f965..facc91a7666d9 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -4896,22 +4896,23 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 @@ -5156,18 +5157,22 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v6 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v3, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4 ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-TRUE16-NEXT: 
s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off @@ -5175,9 +5180,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 -; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 @@ -5440,36 +5442,35 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v5.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 ; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[0:1], off -; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 -; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v5 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[1:2], off +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 @@ -5911,81 +5912,83 @@ define amdgpu_gfx void 
@test_call_external_void_func_v32i8_ret() #0 { ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v12, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v2.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v21.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v0.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v3.h, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v13 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.h, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v10, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v0 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v1, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v3, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v6, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v7, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v6, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v13 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[7:10], off -; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[3:6], off +; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[6:9], off +; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[2:5], off ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 100a560c1d127..1f74fbdc46e98 100644 --- 
a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -12320,7 +12320,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -12334,20 +12334,22 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: 
v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12467,7 +12469,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -12482,19 +12484,21 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, 
v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12821,34 +12825,34 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB55_1: ; 
%atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12974,12 +12978,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -12990,19 +12993,21 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13339,34 +13344,34 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13493,12 +13498,11 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -13509,19 +13513,21 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13855,7 +13861,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -13869,18 +13875,20 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 
0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -13997,7 +14005,7 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -14012,17 +14020,19 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; 
GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -14337,45 +14347,45 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, 
v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -14487,45 +14497,46 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 
v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -14838,45 +14849,45 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 
s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, 
v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -14989,45 +15000,46 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -15341,27 +15353,28 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -15461,27 +15474,28 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 
v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -15750,32 +15764,33 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 
v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -15865,33 +15880,34 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner 
Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -16149,34 +16165,34 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 
16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -16304,12 +16320,11 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) @@ -16320,19 +16335,21 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -16668,46 +16685,46 @@ define void 
@global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, 
v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -16820,45 +16837,46 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index faa3ee61427a2..faa74fef2be2f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -8741,7 +8741,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -8755,20 +8755,22 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8888,7 +8890,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -8903,19 +8905,21 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9244,34 +9248,34 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 
-4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9397,12 +9401,11 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9413,19 +9416,21 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 
1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9764,34 +9769,34 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: 
v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9918,12 +9923,11 @@ define bfloat 
@global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9934,19 +9938,21 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10282,7 +10288,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -10296,18 +10302,20 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; 
GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -10424,7 +10432,7 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -10439,17 +10447,19 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -10766,45 +10776,45 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 
v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10916,45 +10926,46 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 
v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11269,45 +11280,45 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 
0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 
0xfffe @@ -11420,45 +11431,46 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, 
vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11774,27 +11786,28 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -11894,27 +11907,28 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-TRUE16-NEXT: 
v_max_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12185,32 +12199,33 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -12300,33 +12315,34 @@ define void 
@global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-TRUE16-LABEL: global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff +; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12586,34 +12602,34 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: 
s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12741,12 +12757,11 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 
0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -12757,19 +12772,21 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_max_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; 
GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13107,46 +13124,46 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner 
Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_max_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -13259,45 +13276,46 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-LABEL: global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_max_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_max_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index cb66f85ff3ae2..a46b0129b79e6 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -8741,7 +8741,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -8755,20 +8755,22 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -8888,7 +8890,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; 
GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -8903,19 +8905,21 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 
v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9244,34 +9248,34 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9397,12 +9401,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9413,19 +9416,21 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9764,34 +9769,34 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9918,12 +9923,11 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9934,19 +9938,21 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10282,7 +10288,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -10296,18 +10302,20 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: 
Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -10424,7 +10432,7 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: 
v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -10439,17 +10447,19 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory( ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -10766,45 +10776,45 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; 
GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -10916,45 +10926,46 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; 
GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11269,45 +11280,45 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11420,45 +11431,46 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_ ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 
v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 
+; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11774,27 +11786,28 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: 
v_min_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -11894,27 +11907,28 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_ ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB42_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12185,32 +12199,33 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off 
offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; 
GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -12300,33 +12315,34 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_ ; GFX11-TRUE16-LABEL: global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB43_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12586,34 +12602,34 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -12741,12 +12757,11 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -12757,19 +12772,21 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 
16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_min_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13107,46 +13124,46 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_min_num_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: 
v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -13259,45 +13276,46 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine ; GFX11-TRUE16-LABEL: global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_min_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_min_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index f869b5778bfb2..053efdcb76261 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -9266,7 +9266,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 
v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -9280,20 +9280,22 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9413,7 +9415,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_ret_bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -9428,19 +9430,21 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9767,34 +9771,34 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB33_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -9920,12 +9924,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -9936,19 +9939,21 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10285,34 +10290,34 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB34_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10439,12 +10444,11 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -10455,19 +10459,21 @@ define bfloat 
@global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -10801,7 +10807,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -10815,18 +10821,20 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -10943,7 +10951,7 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 @@ -10958,17 +10966,19 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 @@ -11283,45 +11293,45 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB36_1: ; 
%atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11433,45 +11443,46 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 
0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB36_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -11784,45 +11795,45 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -11935,45 +11946,46 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1) ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b_neg: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 
+; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB37_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -12287,27 +12299,28 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, 
v3, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -12407,27 +12420,28 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; 
GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB38_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v4 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v3, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v3, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v5 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -12696,32 +12710,33 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX12-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; 
GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_DEV -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -12811,33 +12826,34 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr ; GFX11-TRUE16-LABEL: global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off offset:2046 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB39_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, 0x400000, v2 -; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v2, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.h -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v5 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.h +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 @@ -13095,34 +13111,34 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; 
GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX12-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_not_b32_e32 v4, v4 ; GFX12-TRUE16-NEXT: .LBB40_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13250,12 +13266,11 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 -; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: global_load_b32 v5, v[0:1], off ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) @@ -13266,19 +13281,21 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v2 -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v5, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v5, v5, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v5, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v5, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v7 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -13614,46 +13631,46 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0 +; GFX12-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX12-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX12-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX12-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX12-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX12-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX12-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff +; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX12-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX12-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX12-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: global_inv scope:SCOPE_SYS -; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe ; GFX12-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe @@ -13766,45 +13783,46 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1 ; GFX11-TRUE16-LABEL: global_system_atomic_fsub_noret_bf16__offset12b_pos: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: 
v_add_co_u32 v4, vcc_lo, 0x7fe, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 3, v4 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 -; GFX11-TRUE16-NEXT: global_load_b32 v3, v[0:1], off -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3 +; GFX11-TRUE16-NEXT: global_load_b32 v4, v[0:1], off +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 3, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff -; GFX11-TRUE16-NEXT: v_not_b32_e32 v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff +; GFX11-TRUE16-NEXT: v_not_b32_e32 v6, v3 ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB41_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_sub_f32_e32 v3, v3, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v8, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add3_u32 v8, v8, v3, 0x7fff ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_sub_f32_e32 v2, v2, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: v_and_or_b32 v2, v3, v5, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v7 +; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: buffer_gl1_inv ; GFX11-TRUE16-NEXT: buffer_gl0_inv -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 ; GFX11-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll index d588f0e0897b7..723e3ef15553a 100644 --- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ 
b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -4007,6 +4007,8 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(ptr addrspace(1 ; GFX12-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] ; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, 0, 16, v0 ; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_zero_hi: @@ -4053,6 +4055,8 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(ptr a ; GFX12-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128 ; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, 0, 16, v0 ; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: @@ -4411,7 +4415,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(ptr addrspace(1 ; GFX12-GISEL-TRUE16: ; %bb.0: ; GFX12-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] ; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 ; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi: @@ -4457,7 +4461,7 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(ptr a ; GFX12-GISEL-TRUE16: ; %bb.0: ; GFX12-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] offset:-128 ; GFX12-GISEL-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX12-GISEL-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, 0 ; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog ; ; 
GFX12-GISEL-FAKE16-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: @@ -4882,7 +4886,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv(ptr addrspace(1) inreg %arg) { ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB132_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc @@ -5002,7 +5006,7 @@ define amdgpu_ps void @global_addr_64bit_lsr_iv_multiload(ptr addrspace(1) inreg ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s0 ; GFX12-GISEL-NEXT: .LBB133_1: ; %bb3 ; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX12-GISEL-NEXT: v_add_co_u32 v4, vcc, v0, v2 ; GFX12-GISEL-NEXT: s_wait_alu 0xfffd ; GFX12-GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 5e502882a2645..7ebd69204d87f 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -2724,31 +2724,31 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: global_load_d16_u8 v0, v5, s[4:5] ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.l, 8, v4.l +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3 ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4 -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 
v1.l, v3.h, v4.h -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l -; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v2.l +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.l +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v2.l, v6.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-DL-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.l +; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.h ; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v0.h, v1.l ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v6 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 +; 
GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2 +; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.h, v4.h, v0.l ; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-DL-TRUE16-NEXT: global_store_b8 v5, v0, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll index 0d3340006f17e..3261e4cae5bcd 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll @@ -917,47 +917,91 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre } define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat %b) { - ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat - ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8 - ; DAGISEL-GFX11-WF32-NEXT: {{ $}} - ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-WF32-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec - ; DAGISEL-GFX11-WF32-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc - ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; DAGISEL-GFX11-WF32-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec - ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 - ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = 
V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 - ; DAGISEL-GFX11-WF32-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec - ; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec - ; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec - ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) - ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-TRUE16-LABEL: name: amdgpu_cs_chain_cc_bfloat + ; DAGISEL-GFX11-WF32-TRUE16: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_MOV_B16_t16_e64_]], %subreg.lo16, [[COPY]], %subreg.hi16 + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit 
$exec + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF32-TRUE16-NEXT: S_ENDPGM 0 ; - ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_cc_bfloat - ; DAGISEL-GFX11-WF64: bb.0 (%ir-block.0): - ; DAGISEL-GFX11-WF64-NEXT: liveins: $sgpr0, $vgpr8 - ; DAGISEL-GFX11-WF64-NEXT: {{ $}} - ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-WF64-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec - ; DAGISEL-GFX11-WF64-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc - ; DAGISEL-GFX11-WF64-NEXT: 
[[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; DAGISEL-GFX11-WF64-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec - ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 - ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec - ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 - ; DAGISEL-GFX11-WF64-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec - ; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec - ; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec - ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) - ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 + ; DAGISEL-GFX11-WF32-FAKE16-LABEL: name: amdgpu_cs_chain_cc_bfloat + ; DAGISEL-GFX11-WF32-FAKE16: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: {{ $}} + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec + ; 
DAGISEL-GFX11-WF32-FAKE16-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF32-FAKE16-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-TRUE16-LABEL: name: amdgpu_cs_chain_cc_bfloat + ; DAGISEL-GFX11-WF64-TRUE16: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = 
COPY $vgpr8 + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_MOV_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64 0, 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE killed [[V_MOV_B16_t16_e64_]], %subreg.lo16, [[COPY]], %subreg.hi16 + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[REG_SEQUENCE]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into 
`ptr poison`) + ; DAGISEL-GFX11-WF64-TRUE16-NEXT: S_ENDPGM 0 + ; + ; DAGISEL-GFX11-WF64-FAKE16-LABEL: name: amdgpu_cs_chain_cc_bfloat + ; DAGISEL-GFX11-WF64-FAKE16: bb.0 (%ir-block.0): + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: liveins: $sgpr0, $vgpr8 + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: {{ $}} + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 16, [[COPY]], implicit $exec + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY1]], 16, implicit-def dead $scc + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, killed [[S_LSHL_B32_]], 0, killed [[V_LSHLREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_BFE_U32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_U32_e64 [[V_ADD_F32_e64_]], 16, 1, implicit $exec + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767 + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304 + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: 
[[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF64-FAKE16-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat ; DAGISEL-GFX10-WF32: bb.0 (%ir-block.0): diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll index 0f37639059169..52f6dab902b3e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.bf16.ll @@ -16,14 +16,15 @@ define bfloat @v_exp2_bf16(bfloat %in) { ; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, 
v1 ; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -90,26 +91,25 @@ define bfloat @v_exp2_fabs_bf16(bfloat %in) { ; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0x7fff, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -169,26 +169,25 @@ define bfloat @v_exp2_fneg_fabs_bf16(bfloat %in) { ; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b16 v1.h, 0x8000, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -249,26 +248,25 @@ define bfloat @v_exp2_fneg_bf16(bfloat %in) { ; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v1, v1, v0, 0x7fff ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h ; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -328,14 +326,15 @@ define bfloat @v_exp2_bf16_fast(bfloat %in) { ; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; 
GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v2 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) @@ -642,47 +641,47 @@ define <2 x bfloat> @v_exp2_fneg_fabs_v2bf16(<2 x bfloat> %in) { ; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 15 -; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l -; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v2.l -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 -; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, s0 -; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v1, v1, v2 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 
0xffffffc0, s0 -; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v2.h, 0x8000, v1.l +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v2 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42800000, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v0, v4 -; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v2, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v2.h, 0x8000, v0.l ; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 
v1, v1, v3 ; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v2, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_fabs_v2bf16: @@ -774,46 +773,44 @@ define <2 x bfloat> @v_exp2_fneg_v2bf16(<2 x bfloat> %in) { ; GFX1200-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.h -; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0x42800000, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 0x42800000, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1200-SDAG-TRUE16-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v3 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo ; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 0xffffffc0, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v2, v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, 0x8000, v0.l +; 
GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0xc2fc0000, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 0x42800000, s0 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1200-SDAG-TRUE16-NEXT: v_add_f32_e32 v0, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v2 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 0xffffffc0, s0 ; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v0, v0 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2) -; GFX1200-SDAG-TRUE16-NEXT: v_exp_f32_e32 v1, v1 -; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v3 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v2 -; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v1, v1, v3 +; GFX1200-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v3, 
v1, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1 +; GFX1200-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX1200-SDAG-TRUE16-NEXT: v_or_b32_e32 v4, 0x400000, v0 ; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff ; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX1200-SDAG-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX1200-SDAG-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo +; GFX1200-SDAG-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v0, v1 ; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-FAKE16-LABEL: v_exp2_fneg_v2bf16: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll index 0d5846a4a4985..5634df5d296f6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll @@ -6377,28 +6377,99 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) { ; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log_f32_from_fpext_bf16: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; 
GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo -; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 -; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| -; GFX1100-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 -; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo -; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX1100-SDAG-TRUE16-LABEL: v_log_f32_from_fpext_bf16: +; GFX1100-SDAG-TRUE16: ; %bb.0: +; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX1100-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-TRUE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-TRUE16-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX1100-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-TRUE16-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-TRUE16-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; 
GFX1100-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-FAKE16-LABEL: v_log_f32_from_fpext_bf16: +; GFX1100-SDAG-FAKE16: ; %bb.0: +; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo +; GFX1100-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-FAKE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-FAKE16-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX1100-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-FAKE16-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-FAKE16-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-SDAG-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-TRUE16-LABEL: v_log_f32_from_fpext_bf16: +; GFX1100-GISEL-TRUE16: ; %bb.0: +; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX1100-GISEL-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-TRUE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-TRUE16-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX1100-GISEL-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-GISEL-TRUE16-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-TRUE16-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-GISEL-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-GISEL-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-FAKE16-LABEL: v_log_f32_from_fpext_bf16: +; GFX1100-GISEL-FAKE16: ; %bb.0: +; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo +; GFX1100-GISEL-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | 
instid1(VALU_DEP_2) +; GFX1100-GISEL-FAKE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-FAKE16-NEXT: v_mul_f32_e32 v1, 0x3f317217, v0 +; GFX1100-GISEL-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-GISEL-FAKE16-NEXT: v_fma_f32 v2, 0x3f317217, v0, -v1 +; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-FAKE16-NEXT: v_fmamk_f32 v2, v0, 0x3377d1cf, v2 +; GFX1100-GISEL-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x41b17218, vcc_lo +; GFX1100-GISEL-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log_f32_from_fpext_bf16: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll index 8006876dbe3ff..8d1a23119cd2b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll @@ -6377,28 +6377,99 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) { ; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log10_f32_from_fpext_bf16: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo -; GFX1100-NEXT: v_ldexp_f32 v0, v0, v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 -; GFX1100-NEXT: v_cmp_gt_f32_e64 s0, 
0x7f800000, |v0| -; GFX1100-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 -; GFX1100-NEXT: v_add_f32_e32 v1, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo -; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX1100-SDAG-TRUE16-LABEL: v_log10_f32_from_fpext_bf16: +; GFX1100-SDAG-TRUE16: ; %bb.0: +; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX1100-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-TRUE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-TRUE16-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX1100-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-TRUE16-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-TRUE16-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; 
GFX1100-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-FAKE16-LABEL: v_log10_f32_from_fpext_bf16: +; GFX1100-SDAG-FAKE16: ; %bb.0: +; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo +; GFX1100-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-SDAG-FAKE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-FAKE16-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX1100-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-SDAG-FAKE16-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-SDAG-FAKE16-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-SDAG-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-SDAG-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-TRUE16-LABEL: v_log10_f32_from_fpext_bf16: +; GFX1100-GISEL-TRUE16: ; %bb.0: +; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX1100-GISEL-TRUE16-NEXT: 
v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX1100-GISEL-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-TRUE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-TRUE16-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX1100-GISEL-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-GISEL-TRUE16-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-TRUE16-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-GISEL-TRUE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-GISEL-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-FAKE16-LABEL: v_log10_f32_from_fpext_bf16: +; GFX1100-GISEL-FAKE16: ; %bb.0: +; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc_lo +; GFX1100-GISEL-FAKE16-NEXT: v_ldexp_f32 v0, v0, v1 +; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX1100-GISEL-FAKE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-FAKE16-NEXT: v_mul_f32_e32 v1, 0x3e9a209a, v0 +; GFX1100-GISEL-FAKE16-NEXT: v_cmp_gt_f32_e64 s0, 0x7f800000, |v0| +; GFX1100-GISEL-FAKE16-NEXT: v_fma_f32 v2, 0x3e9a209a, v0, -v1 +; 
GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-GISEL-FAKE16-NEXT: v_fmamk_f32 v2, v0, 0x3284fbcf, v2 +; GFX1100-GISEL-FAKE16-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, v0, v1, s0 +; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x411a209b, vcc_lo +; GFX1100-GISEL-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log10_f32_from_fpext_bf16: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index c1ac74e5094b0..7ca72bfbe59e6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -3830,20 +3830,67 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) { ; GFX900-NEXT: v_sub_f32_e32 v0, v0, v1 ; GFX900-NEXT: s_setpc_b64 s[30:31] ; -; GFX1100-LABEL: v_log2_f32_from_fpext_bf16: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX1100-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 -; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo -; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo -; GFX1100-NEXT: v_ldexp_f32 v0, v0, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_log_f32_e32 v0, v0 -; GFX1100-NEXT: s_waitcnt_depctr 0xfff -; GFX1100-NEXT: v_sub_f32_e32 v0, v0, v1 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; GFX1100-SDAG-TRUE16-LABEL: v_log2_f32_from_fpext_bf16: +; GFX1100-SDAG-TRUE16: ; %bb.0: +; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1100-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-SDAG-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX1100-SDAG-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; GFX1100-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-SDAG-TRUE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-SDAG-FAKE16-LABEL: v_log2_f32_from_fpext_bf16: +; GFX1100-SDAG-FAKE16: ; %bb.0: +; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-SDAG-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1100-SDAG-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo +; GFX1100-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-SDAG-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-SDAG-FAKE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-SDAG-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-SDAG-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-TRUE16-LABEL: v_log2_f32_from_fpext_bf16: +; GFX1100-GISEL-TRUE16: ; %bb.0: +; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.l, 0 +; GFX1100-GISEL-TRUE16-NEXT: v_mov_b16_e32 v1.h, v0.l +; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1100-GISEL-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v1 +; GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 32, vcc_lo +; GFX1100-GISEL-TRUE16-NEXT: v_ldexp_f32 v0, v1, v0 +; 
GFX1100-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1100-GISEL-TRUE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-TRUE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1100-GISEL-FAKE16-LABEL: v_log2_f32_from_fpext_bf16: +; GFX1100-GISEL-FAKE16: ; %bb.0: +; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1100-GISEL-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1100-GISEL-FAKE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0 +; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc_lo +; GFX1100-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo +; GFX1100-GISEL-FAKE16-NEXT: v_ldexp_f32 v0, v0, v2 +; GFX1100-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-GISEL-FAKE16-NEXT: v_log_f32_e32 v0, v0 +; GFX1100-GISEL-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX1100-GISEL-FAKE16-NEXT: v_sub_f32_e32 v0, v0, v1 +; GFX1100-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_log2_f32_from_fpext_bf16: ; R600: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll index 6e94896fa206e..c0fb1450ab682 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -193,13 +193,22 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src } define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, half %src2) #0 { -; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 
v0, 16, v0 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack: ; SDAG-GFX9: ; %bb.0: @@ -265,13 +274,22 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha } define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src1, half %src2) #0 { -; SDAG-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: -; SDAG-GFX11: ; %bb.0: -; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] -; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; SDAG-GFX11-TRUE16: ; %bb.0: +; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; SDAG-GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; 
SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: +; SDAG-GFX11-FAKE16: ; %bb.0: +; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1] +; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext: ; SDAG-GFX9: ; %bb.0: @@ -569,3 +587,4 @@ attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" attributes #1 = { nounwind readnone speculatable } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GISEL-GFX11-FAKE16: {{.*}} +; SDAG-GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll index 6246f2fd4fa5d..ca16e251d51cf 100644 --- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll @@ -118,34 +118,29 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX11-TRUE16-LABEL: v_maximumnum_bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_bf16: @@ -181,40 +176,34 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_bf16: @@ -339,21 +328,21 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX11-TRUE16-LABEL: 
v_maximumnum_bf16_nnan: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_bf16_nnan: @@ -381,25 +370,25 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; 
GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
v_maximumnum_bf16_nnan: @@ -630,58 +619,46 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v3, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 
16, v7 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16: @@ -738,62 +715,56 @@ define <2 x bfloat> @v_maximumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16: @@ -1012,34 +983,29 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 
0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h ; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.h, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 
v2.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16_nnan: @@ -1085,36 +1051,35 @@ define <2 x bfloat> @v_maximumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h ; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.h, v0.l, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v0.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, s1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16_nnan: @@ -1444,66 +1409,67 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) { ; GFX11-TRUE16: ; 
%bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v8 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX11-TRUE16-NEXT: 
v_cmp_gt_f32_e64 s3, v9, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v0, v6 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16: @@ -1575,77 +1541,80 @@ define <3 x bfloat> @v_maximumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v8 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4 ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v0, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16: @@ -1939,41 +1908,40 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v6 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v3.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.h, 
v0.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16_nnan: @@ -2029,48 +1997,50 @@ define <3 x bfloat> @v_maximumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v6 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v3.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v8 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, 
v3.l, v1.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.h, v0.l, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h ; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16_nnan: @@ -2507,85 +2477,83 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v10, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v11.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v9, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v11, v12 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; 
GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v8, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16: @@ -2680,99 +2648,98 @@ define <4 x bfloat> @v_maximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v10, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v9, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v11, v12 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v7.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0 +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0 +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v7.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v8.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v8, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, 
v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v0.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v0, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v5.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 
v0.l, v0.l, v2.l, s2 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16: @@ -3158,53 +3125,52 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v5, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v7, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v6, v8 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo 
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.h -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v5.h, v0.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16_nnan: @@ -3273,62 +3239,63 @@ define <4 x bfloat> @v_maximumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v5, v4 
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v7, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v6, v8 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.h -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v1.l, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, 
v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.h -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v3.h -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v5.h, v0.l, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 -; 
GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16_nnan: @@ -3957,125 +3924,120 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e64 s1, v9, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.h, v6.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v5.h, v9.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v13, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v9.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v6.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v4.h, v9.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v10 +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s6, 0, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v9.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v7.l ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v8.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s2 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v13, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v8.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v7.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v11.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v9.l, v8.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v6.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v13.l, v8.l, s3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.l, v10.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v10.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v12.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v10, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v4.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v3.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v9.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v11 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v11.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v8.l, v9.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v8.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v5.l, v12.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v12.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v9.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v4.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v12, v12 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v4.l, v1.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v11.h, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v4.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v1, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v9.h, v1.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.h, v10.h, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v9.h, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v1.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v3.l, v0.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v0, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v9.h, v0.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v8.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v9.h, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v8.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v6bf16: @@ -4206,142 +4168,141 @@ define <6 x bfloat> @v_maximumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.h, v6.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v5.h, v9.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v8 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v13, v13 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v8.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s2 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v13, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v8.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v7.l, v6.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v13 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v9.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 
v9.h, v6.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v11.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v9.l, v8.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v6.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v13.l, v8.l, s3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v13.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v4.h, v9.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v10 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v10.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.l, v10.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v10.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v12.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v9.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v7.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l -; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v12.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v10, v9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v9.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v11 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v11.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v8 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v4.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v8.l, v9.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v8.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v5.l, v12.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v12.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v9 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v12.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v5, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v9.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v4.l, s3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v12, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v4.l, v1.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v1.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v11.h, s7 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v1, v9 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v9.h, v1.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v9.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.h, v10.h, s6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v9.h, s3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.l, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v1.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v3.l, v0.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v0, v9 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v9.h, v0.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v8.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v11 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v9.h, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v8.h, 
s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, v0.h, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v3.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s4 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v6bf16: @@ -5219,171 +5180,160 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.h, v6.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 
0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v7.h, v8.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v15, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v5.h, v12.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v13, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v10.l, v8.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v11.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v11.l, v9.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v15.l, v9.l, s4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v7.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v17, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v17, v17 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v12.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v8.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v6.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v12.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v9.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v5.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h 
+; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v12.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v10.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v16, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v4.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v12.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v7.l, v16.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v16, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v16.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v16.h, s4 +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e64 s4, v7, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v6.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v16, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v6.l, v2.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v10.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v16, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v6.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v2, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v12.h, v2.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v12.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.h, v13.h, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.h, v12.h, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v5.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.h, v11.l, 
s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v4.h, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v16, v16 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v11, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, v9.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.h, v10.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v12.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v13, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v8.l, v10.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v10.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v8.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v10 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v15, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v4.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v6.l, v2.l, s3 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v13 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v6.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v3.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v4.h, s1 -; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v8.l, v8.l, v1.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v5.l, v1.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v1, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v12.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v7, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v12.h, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v4.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.l, v0.h, s7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v0, v12 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v12.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v12.h, v0.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v11.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s7 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.h, v11.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v14.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v15.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v12.h, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v14.h, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v15.h, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v1.h, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v0.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v8bf16: @@ -5546,201 +5496,187 @@ define <8 x bfloat> @v_maximumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v7 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 -; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v14, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.h, v6.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo +; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v7.h, v8.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v10.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v9.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v15, v17 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v5.h, v12.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v13, v18 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v10.l, v8.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v11.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 
v17, 16, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v11.l, v9.l, s3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v15.l, v9.l, s4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v10.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.h, v11.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v4.h, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v16, v16 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v11, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v7.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v17, v17 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v17, v17 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, v9.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.h, v10.l, s1 -; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v9.l, v14.l, v12.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v12.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v8.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v13, v15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v6.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v13.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 
s0, v12, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v8.l, v10.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v10.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v10.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v8.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v8.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v11.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v12.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v9.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX12-TRUE16-NEXT: 
v_cmp_eq_f32_e64 s2, 0, v12 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v5.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v14 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v12.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v10.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v16, v16 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v10 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v15, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v4.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v15 ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v4.l, v0.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v6.l, v2.l, s3 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v12.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v7.l, v16.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v16.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v16, v12 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v16.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, 
v12.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v16.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v7, v7 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v3.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v6.l, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v16, v16 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v12 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v6.l, v2.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v16, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v2, v12 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v12.h, v2.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v12.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.h, v13.h, s8 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v14, v13 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.h, v12.h, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, 
v1.l, v5.l, s5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v12 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v5.l, v1.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v1.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v6.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v1, v12 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v12.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v7, v7 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v12.h, s5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v4.l, s6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v12.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v12 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.l, v0.h, s7 +; GFX12-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s7, 0, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v0, v12 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v12.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v3.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v12.h, v0.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v11.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v4.h, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v1.h, s3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.h, v11.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v14.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v15.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v12.h, s9 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v14.h, s6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v15.h, s7 +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v1.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v0.h, s3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v8 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v8bf16: @@ -7352,341 +7288,314 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX11-TRUE16-LABEL: v_maximumnum_v16bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v18, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v16.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v16.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v17.l, v16.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v17.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v6.h, v14.h, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v29, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.h, v14.h, v16.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v16.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v32, v32 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v16, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.h, v16.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v32, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v18.l, v16.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v32, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v5.h, v13.h, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v32, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v13.h, v16.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v16.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v32, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX11-TRUE16-NEXT: 
v_cmp_gt_f32_e64 s1, v16, v25 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v25.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v32, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v25.h, v16.h, s1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v20.l, v16.h, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v20.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v32, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v4.h, v12.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v12.h, v16.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v16.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v16, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v26.h, v16.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.h, v21.l, v16.h, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v21.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.h, v11.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v16.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v16.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v16, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v27.h, v16.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.h, v22.l, v16.h, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v22.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v2.h, v10.h, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.h, v10.h, v16.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v16.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v16, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v28.h, v16.h, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.h, v23.l, v16.h, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v1.h, v9.h, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v9.h, v16.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v16.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v16, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.l, v29.h, v16.h, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.h, v24.l, v16.h, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v24.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v30, v30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v16 +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v0.h, v8.h, s6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v8.h, v16.h, s7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v16.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v16, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v30.h, v16.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.l, v15.h, v16.h, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v15.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v16.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v7.h, s8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v7.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s7, v7, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v16.h, v7.h, s7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v6.h, v16.h, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v7.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v14.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, 
v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v14.l, v6.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v14.h, s7 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s8, v6, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.h, v6.h, s8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v5.h, v16.h, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v6.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v13.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v13.l, v5.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v13.h, s8 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s9, v5, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.h, v16.h, s9 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v16.h, v5.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v12.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v12.l, v4.h, s11 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v12.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s10, v4, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v16.h, v4.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v16.h, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.l, v11.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v11.l, v3.h, s12 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v11.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s11, v3, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.h, v3.h, s11 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v3.h, s12 +; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v2.h, v16.h, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v10.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v10.l, v2.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v6.l, v5.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v20.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v21.l, v5.l, 
vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v24, v23 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v19.l, v7.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v19.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.l, v7.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v22.l, v20.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v20.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v19.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v20.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, 
v3.h, v11.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v26, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v19.l, v22.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v11.h, v20.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v20.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v5.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v5.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v7.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v21.l, v20.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v20.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v22.l, v20.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v9.h, v23.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.l, v21.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v27, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v7.l, v6.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v21, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v7.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v5.l, v23.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v23.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v24 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v21.l, v23.l, s0 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v23.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v15.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v20.l, v6.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v23, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.l, v16.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v7.l, v19.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v16.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v14.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v19.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v24, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v17.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v7.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v14.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v23, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v16.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.l, v16.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.l, v17.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v17.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v16.l, v17.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v14.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v14.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v17 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v15.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v18, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v10.h, s11 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s12, v2, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v16.h, v2.h, s12 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v16.h, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v9.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v9.l, v1.h, s14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v9.h, s12 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s13, v1, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v0.l, v8.l, s14 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v16.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v1.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, s15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.l, v17.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v18.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v20.h, v25.h, s17 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v32.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.h, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.h, v18.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v20.l, v1.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v8.l, v32.h, s14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v18.l, v0.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo +; 
GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v28.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v32, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v21.l, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v29.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v23.h, v28.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v16.h, v32.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v29.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.l, v32.h, s15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v23.l, v1.h, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v22.l, v0.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v24.l, v8.h, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v16.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v15.h, v0.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v12.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.l, v4.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v13.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v10.l, v2.l, s1 -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v8.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v12.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v22, v17 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21 -; GFX11-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v13.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v21 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v17 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v2.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v8.l, v1.h, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v10.l, v0.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v13.l, v2.h, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v19 +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v16bf16: @@ -8005,406 +7914,355 @@ define <16 x bfloat> @v_maximumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v18, v5 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v14 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v18 
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v6.l, v5.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v22 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v20.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v24, v23 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v19.l, v7.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v19.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.l, v7.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v24, v25 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v22.l, v20.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v20.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v19.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v20.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v11 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v22.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v20 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v3.h, v11.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v26, v21 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v19.l, v22.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v11.h, v20.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v20.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v5.l, 
s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v5.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo +; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v16, v17 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v16.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v7.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v23 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v16.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v21.l, v20.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v20.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v22.l, v20.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v23 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v21.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v9.h, v23.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.l, v21.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v27, v25 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l -; GFX12-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s0, 0, v6.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo -; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v7.l, v6.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v8 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v6.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v21, v24 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v7.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v5.l, v23.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v23.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v24 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v21.l, v23.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v15.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22 
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v20.l, v6.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v23, v24 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.l, v16.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v17 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v7.l, v19.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v19.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v16.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v15.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v14.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v19.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v24, v23 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v17.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v7.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v17.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v14.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v16.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.l, v16.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.l, v17.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v17.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v16.l, v17.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v14.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v17.l, v16.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v17.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v6.h, v14.h, s0 +; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v31, 0xffff0000, v8 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v29, v29 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.h, v14.h, v16.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v16.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v7 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v32, v32 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v16, v18 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v14.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.h, v16.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v32, v32 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v18.l, v16.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v32, v32 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo -; 
GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v17 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v5.h, v13.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v32, v32 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v13.h, v16.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v16.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v32, v32 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v16, v25 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v25.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v32, v32 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v15.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v18, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v25.h, v16.h, s1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v20.l, v16.h, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v20.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | 
instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v32, v32 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v32.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v4.h, v12.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v12.h, v16.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v16.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v16, v26 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v12.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.l, v4.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v18 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v26.h, v16.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.h, v21.l, v16.h, s3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v21.l +; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v16 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v13.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v10.l, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.h, v11.h, s3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v16.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v16.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v16, v27 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v12.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v27.h, v16.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.h, v22.l, v16.h, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v22.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v1.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v8.l, v0.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v12.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v22, v17 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v2.h, v10.h, s4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.h, v10.h, v16.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v16.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v16, v28 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v18, v17 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v28.h, v16.h, s4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.h, v23.l, v16.h, s5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: 
v_cmp_eq_f32_e64 s4, 0, v16 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v22, v21 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v13.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v1.h, v9.h, s5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v9.h, v16.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v16.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v16, v29 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.l, v29.h, v16.h, s5 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.h, v24.l, v16.h, s6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v24.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v30, v30 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v30.l, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v0.h, v8.h, s6 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v8.h, v16.h, s7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v16.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v16, v30 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v30.h, v16.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.l, v15.h, v16.h, s7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v15.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v15.l, s6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v7.h, s8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v7.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s7, v7, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v16.h, v7.h, s7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v16.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s8 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v6.h, v16.h, s7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v7.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v14.l, s8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v16 +; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v16.h, v14.l, v6.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v6.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v14.h, s7 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s8, v6, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.h, v6.h, s8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v16.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v5.h, v16.h, s8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v6.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v13.l, s9 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v13.l, v5.h, s10 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v5.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v13.h, s8 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s9, v5, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v16.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s10 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.h, v16.h, s9 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v5.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v12.l, s10 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v12.l, v4.h, s11 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v4.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v21 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v17 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v12.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s10, v4, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v16.h, v4.h, s10 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v16.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s11 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v16.h, s10 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.l, v11.l, s11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v11.l, v3.h, s12 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v11.h, s10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s11, v3, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.h, v3.h, s11 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v16.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v3.h, s12 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v2.h, v16.h, s11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v3.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v10.l, s12 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v10.l, v2.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v2.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v2.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v8.l, v1.h, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v10.l, v0.h, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v13.l, v2.h, s3 -; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v10.h, s11 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s12, v2, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v16.h, v2.h, s12 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v16.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v16.h, s12 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v9.l, s13 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v16 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v9.l, v1.h, s14 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v1.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v8 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v9.h, s12 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s13, v1, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v0.l, v8.l, s14 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v16.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v1.h, s13 +; GFX12-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s13, 0, v17.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, s15 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.l, v17.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v18.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v20.h, v25.h, s17 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v32.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.h, s16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.h, v18.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v26.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v20.l, v1.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v8.l, v32.h, s14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v18.l, v0.h, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v28.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v32, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s13 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v21.l, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v29.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v23.h, v28.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v16.h, v32.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v16.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v29.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.l, v32.h, s15 
+; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v23.l, v1.h, s4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v22.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v24.l, v8.h, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v16.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v8.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v15.h, v0.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v16bf16: @@ -11681,666 +11539,619 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-LABEL: v_maximumnum_v32bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: scratch_load_b32 v68, off, s32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v36.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 ; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.h, v30.h, s1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v55, v55 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v85, v85 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.l, v30.h, v32.l, s2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v36.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v35, v35 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 
s5, v37, v37 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v39, v39 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v49, v49 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v51, v51 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v53, v53 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v54, v54 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v65, v65 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v67, v67 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v69, v69 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v71, v71 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v86, v86 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.l, v0.h, v16.h, s29 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, v55.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v36, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v38, v38 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v48, v48 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v50, v50 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v52, v52 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v64, v64 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v66, v66 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v68, v68 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v70, v70 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v80, v80 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v83, v83 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v13.h, v29.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v12.h, v28.h, s5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v11.h, v27.h, s7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v10.h, v26.h, s9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v9.h, v25.h, s11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v24.h, s13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v7.h, v23.h, s15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v6.h, v22.h, s17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v5.h, v21.h, s19 -; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v50.l, v4.h, v20.h, s21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v3.h, v19.h, s23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v85.l, v16.h, v54.l, s40 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v14 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s27 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.l, v29.h, v33.l, s4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.l, v28.h, v34.l, s6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.l, v27.h, v35.l, s8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.l, v26.h, v36.l, s10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v68.l, v25.h, v37.l, s12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.l, v24.h, v38.l, s14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.l, v23.h, v39.l, s16 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.l, v22.h, v48.l, s18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v80.l, v21.h, v49.l, s20 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v81.l, v20.h, v50.l, s22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.l, v19.h, v51.l, s24 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v54.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s40, v86, v118 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v87, v87 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v96, v96 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v36.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, v39.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v50.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v51.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v83.l, v18.h, v52.l, s26 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v84.l, v17.h, v53.l, s28 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v65.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v67.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v68.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v69.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v80.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v81.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v97, v97 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, s41 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, v37.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, v52.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v119 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v128 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v129 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v130 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v132 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v133 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v134 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 16, v144 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 16, v145 
-; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s63, v116, v86 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v86.l, v55.l, v32.l, s40 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v14.l, s42 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 16, v146 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v146, 16, v147 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s42, v87, v118 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s43, v96, v119 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s45, v98, v129 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s56, v101, v132 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s59, v112, v135 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s60, v113, v144 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v86.l, v32.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, v86.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.l, v35.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v36.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v39.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v50.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v51.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s46, v99, v130 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s61, v114, v145 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s62, v115, v146 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v96.l, v65.l, v34.l, s43 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v98.l, v67.l, v36.l, s45 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v101.l, v70.l, v39.l, s56 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v112.l, v81.l, v50.l, s59 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v113.l, v82.l, v51.l, s60 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v55.l, s16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v118 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v33.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v37.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, v48.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v52.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v70.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v81.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v87.l, v64.l, v33.l, s42 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v99.l, v68.l, v37.l, s46 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v114.l, v83.l, v52.l, s61 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v115.l, v84.l, v53.l, s62 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v96.l, v34.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v98.l, v36.l, s5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v101.l, v39.l, s8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v101.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v112.l, v50.l, s11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v112.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v113.l, v51.l, s12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v113.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, v38.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v49.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s44, v97, v128 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v116.l, v85.l, v54.l, s63 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v87.l, v33.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v99.l, v37.l, s6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v114.l, v52.l, s13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v114.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v115.l, v53.l, s14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v115.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v39 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v35.h, v81.l, s26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v35.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s57, v102, v133 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v97.l, v66.l, v35.l, s44 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v116.l, v54.l, s15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v51 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v48.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v65.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v66.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s47, v100, v131 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s58, v103, v134 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v102.l, v71.l, v48.l, s57 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v96.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v97.l, v35.l, s4 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v52 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v53 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v38.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v49.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v64.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v71.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v100.l, v69.l, v38.l, s47 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v103.l, v80.l, v49.l, s58 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v87.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v102.l, v48.l, s9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v102.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v30.h, v65.l, s18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v128 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v66.l, s19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v129 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v80.l -; 
GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0, v85.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v38.l, s7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v103.l, v49.l, s10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v103.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v14.h, v64.l, s17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v119 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v34.h, v71.l, s24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 16, v48 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v65 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v66 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v86.l, v13.h, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v49 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v37.h, v85.l, s41 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v64 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v71 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v96.l, v30.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v97.l, v32.l, s4 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v67.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v68.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v98.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v99.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v87.l, v38.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v102.l, v38.h, s9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v67.l, s20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v130 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v131 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0, v84.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v70 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v67 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0, v82.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v68 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v37.l, v84.l, s29 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v101.l, v34.l, s8 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v83.l -; 
GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v80 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v36.l, v82.l, s27 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.h, v115.l, v37.l, s14 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v69.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v36.h, v83.l, s28 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v103.l, v35.l, s10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v113.l, v36.l, s12 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v100.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.h, v69.l, s22 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v81 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.h, v114.l, v48.l, s13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v132 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v69 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v33.h, s7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v39, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e64 s9, v54, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v83, v83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v86, v86 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v96.h, v0.l, v16.l, s42 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s43, 0, v96.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 
0xffff0000, v68 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v36, v35 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0, v35.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v31.l, v36.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v31.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v14.h, v30.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v30.h, v36.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v36, v37 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s44, 0, v37.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v37.h, v36.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.l, v36.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v13.h, v29.h, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v29.h, v36.h, s2 +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s2, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v36, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v38.h, v36.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.l, v36.h, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v33.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v12.h, v28.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v28.h, v36.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v36, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v39.h, v36.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v34.l, v36.h, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v34.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v48, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v36.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v50.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v15.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v98.l, v32.h, s5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v51.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v99.l, v33.l, s6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, 
v53 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v31.l, v15.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v50.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v52, v53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v11.h, v27.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.h, v27.h, v36.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v36, v48 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v48.h, v36.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v35.l, v36.h, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v35.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v36.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.l, v50.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v52 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v32.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v50.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v10.h, v26.h, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.h, v26.h, v36.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v36, v49 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v49.h, v36.h, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v38.l, v36.h, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v38.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v9.h, v25.h, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.h, v25.h, v36.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v36, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v50.h, v36.h, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v48.l, v36.h, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v48.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v8.h, v24.h, s6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.h, v24.h, v36.h, s7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v36, v51 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v51.h, v36.h, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v50.l, v36.h, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v50.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v36.l +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v7.h, v23.h, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.h, v23.h, v36.h, s8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s7, v36, v52 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.l, v52.h, v36.h, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.l, v52.l, v36.h, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v52.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v6.h, v22.h, s8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.h, v22.h, v36.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s8, v36, v53 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.l, v53.h, v36.h, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.h, v53.l, v36.h, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v53.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v5.h, v21.h, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.h, v36.h, 
s10 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s9, v36, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.l, v54.h, v36.h, s9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.l, v54.l, v36.h, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v54.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v4.h, v20.h, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.h, v20.h, v36.h, s11 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s10, v36, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.l, v55.h, v36.h, s10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.h, v55.l, v36.h, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v55.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v3.h, v19.h, s11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.h, v19.h, v36.h, s12 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s11, v36, v64 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.l, v64.h, v36.h, s11 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.l, v64.l, v36.h, s12 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v64.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v2.h, v18.h, s12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.h, v18.h, v36.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s12, v36, v65 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.l, v65.h, v36.h, s12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.h, v65.l, v36.h, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v65.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v1.h, v17.h, s13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.h, v17.h, v36.h, s14 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0, v36.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s13, v36, v66 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.l, v66.h, v36.h, s13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v80.l, v66.l, v36.h, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v66.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, 
v0.h, v16.h, s14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.h, v16.h, v36.h, s15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v68 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s14, v36, v67 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.h, v15.l, v68.l, s15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.l, v67.h, v36.h, s14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v67.l, v36.h, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v67.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v82.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v68.l, v82.h, s15 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s15, v82, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v68.l, v36.h, v82.h, s15 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v68.l, v82.h, s16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v14.h, v36.h, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v68.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v14.l, v30.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v82, v82 +; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v82, 16, v28 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s15, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v30.l, v14.h, s17 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v14.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v38, v53 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v112.l, v39.l, s11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v51.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v29 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v15.h, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v117, v117 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v31.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v27 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s16, v14, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v82, v82 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v36.h, v14.h, s16 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v14.l, v14.h, s17 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v81, v81 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v25 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v13.h, v36.h, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v14.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.l, v29.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v24 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s16, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v29.l, v13.h, s18 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v23 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s17, v13, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v29.h, s16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v82, v82 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v36.h, v13.h, s17 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v13.l, v13.h, s18 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v21 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.h, v12.h, v36.h, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v13.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.l, v28.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v20 +; GFX11-TRUE16-NEXT: 
v_cmp_eq_f32_e64 s17, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v28.l, v12.h, s19 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v12.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v19 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s18, v12, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v28.h, s17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v82, v82 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v36.h, v12.h, s18 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v36.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v12.l, v12.h, s19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v36.h, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.l, v27.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s18, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v27.l, v11.h, s20 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v11.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v27.h, s18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s19, v11, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v36.h, v11.h, s19 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v11.l, v11.h, s20 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v10.h, v36.h, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v26.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s19, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v26.l, v10.h, s21 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v26.h, s19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s20, v10, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v36.h, v10.h, s20 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v10.l, v10.h, s21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v9.h, v36.h, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v25.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s20, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v25.l, v9.h, s22 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v9.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v25.h, s20 +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s21, v9, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v36.h, v9.h, s21 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v9.h, s22 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.h, v8.h, v36.h, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v24.l, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s21, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v24.l, v8.h, s23 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v8.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v24.h, s21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s22, v8, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v36.h, v8.h, s22 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v8.l, v8.h, s23 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.h, v7.h, v36.h, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v23.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s22, 0, v36 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v23.l, v7.h, s24 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v23.h, s22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s23, v7, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v36.h, v7.h, s23 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.h, v6.h, v36.h, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v22.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s23, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v22.l, v6.h, s25 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v22.h, s23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s24, v6, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v36.h, v6.h, s24 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s25 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.h, v5.h, v36.h, s24 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v36.h, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s24, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v21.l, v5.h, s26 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v5.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v21.h, s24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s25, v5, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v36.h, v5.h, s25 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v4.h, v36.h, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v20.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s25, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v20.l, v4.h, s27 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v20.h, s25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s26, v4, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v36.h, v4.h, s26 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.h, v3.h, v36.h, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v83.h, v3.l, v19.l, s27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v81, v81 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s26, 0, v36 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v52, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v51, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v52 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v28 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v52, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, s1 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v29.l, v13.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v13.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v30.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v30.l, v13.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v51, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v29.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v28.l, v12.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v31.l, v12.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v11.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, s1 -; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e64 s1, v50, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v51, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.h, v28.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v51, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v31.l, v10.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.l, v12.l, v10.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.l, v10.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 
vcc_lo, v50, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v27, v26 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v12.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v9.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v12.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v26 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v10.l, v9.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v12.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v27 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v11.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 
vcc_lo, 0, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v26 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v25, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v22.l, v6.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v10.l, v7.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v10.l, v7.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v11 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v9.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v7.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v8.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v20 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v12, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v12.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v6.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v7.l, v4.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v18.l, v2.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 
v16.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v12, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v16 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 -; 
GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v2.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v8.l, v1.h, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v6.l, v0.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v9.l, v2.h, s3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v49 :: v_dual_mov_b32 v2, v48 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v39 :: v_dual_mov_b32 v4, v38 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v36 :: v_dual_mov_b32 v6, v35 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v34 :: v_dual_mov_b32 v8, v33 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v32 :: v_dual_mov_b32 v10, v31 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v30 :: v_dual_mov_b32 v12, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v19.l, v83.h, s28 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v3, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s27, v83, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.h, v2.l, v18.l, s28 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v1.l, v17.l, s40 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v32.h, v37.h, s44 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v36.h, v83.h, s27 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s40, 0, v82.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v85, v85 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v18.h, s26 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v32.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v83.h, s41 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v87, 
v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v36.h, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v31.h, v35.h, s42 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0, v39.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v19.h +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s45, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v18.l, v82.h, s29 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0, v38.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v34.h, v39.h, s42 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v48.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s44, v82, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v33.h, v38.h, s29 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v34.l, v1.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v36.h, v82.h, s44 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v33.l, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v49.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v50.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v35.l, v1.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v82.h, s40 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v51.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v54.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v49.l, v50.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s45 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v36.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v52.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v51.l, v51.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v53.h +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, 
v17.l, v19.h, s27 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v38.l, v0.h, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v69.l, v52.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v55.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v48.l, v1.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v19, v36 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v69.h, v53.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v70.h, v55.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v66.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v36.h, v19.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v65.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v50.l, v2.h, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v70.l, v54.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v52.l, v0.h, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v16.h, v19.h, s28 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v53.l, v1.h, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v71.h, v65.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v54.l, v2.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v67.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v36.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v64.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v65.l, v1.h, s12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v67.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v55.l, v3.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v16.l, v96.h, s41 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v71.l, v64.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v80.l, v66.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v17.l, s4 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v96, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v64.l, v0.h, s11 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v66.l, v16.l, s13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 
v67.l, v15.l, s14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v68.l, v30.h, s15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v36.h, v96.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v16.l, v96.h, s43 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v17.h, v36.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v16.l, v17.h, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v32bf16: @@ -12956,753 +12767,697 @@ define <32 x bfloat> @v_maximumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v30 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: scratch_load_b32 v68, off, s32 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v35.l, v36.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34 
-; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.h, v30.h, s1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v16 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v55, v55 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v85, v85 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.l, v30.h, v32.l, s2 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v29 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v28 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v27 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v38.l, v36.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v27 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v26 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v35, v35 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v37, 
v37 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v39, v39 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v49, v49 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v51, v51 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v53, v53 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v54, v54 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v65, v65 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v67, v67 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v69, v69 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v71, v71 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v86, v86 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.l, v0.h, v16.h, s29 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v86.l, v32.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v118.l, v55.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v36, v36 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v38, v38 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v48, v48 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v50, v50 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v52, v52 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v64, v64 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v66, v66 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v68, v68 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v70, v70 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v80, v80 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v83, v83 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v13.h, v29.h, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v12.h, v28.h, s5 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v11.h, v27.h, s7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v10.h, v26.h, s9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v9.h, v25.h, s11 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v24.h, s13 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v7.h, v23.h, s15 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v6.h, v22.h, s17 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v5.h, v21.h, s19 -; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v4.h, v20.h, s21 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v3.h, v19.h, s23 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v85.l, v16.h, v54.l, s40 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v15 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v14 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s25 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s27 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.l, v29.h, v33.l, s4 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.l, v28.h, v34.l, s6 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.l, v27.h, v35.l, s8 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.l, v26.h, v36.l, s10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v68.l, v25.h, v37.l, s12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.l, v24.h, v38.l, s14 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.l, v23.h, v39.l, s16 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.l, v22.h, v48.l, s18 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v80.l, v21.h, v49.l, s20 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v81.l, v20.h, v50.l, s22 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.l, v19.h, v51.l, s24 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v116.l, v54.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s40, v86, v118 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v86.l, v85.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v30 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v87, v87 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v96, v96 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v87.l, v33.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v96.l, v34.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v98.l, v36.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v101.l, v39.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v112.l, v50.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v113.l, v51.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v83.l, v18.h, v52.l, s26 -; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v84.l, v17.h, v53.l, s28 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v119.l, v64.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v128.l, v65.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v129.l, v66.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v67.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v131.l, v68.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v132.l, v69.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v133.l, v70.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v134.l, v71.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v135.l, v80.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v144.l, v81.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v145.l, v82.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v97, v97 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, s41 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v32.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v99.l, v37.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v114.l, v52.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v115.l, v53.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113 -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v146.l, v83.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v147.l, v84.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v119 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v128 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v129 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v130 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v131 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v132 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v133 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v134 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v135 -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 16, v144 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 16, v145 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s63, v116, v86 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v86.l, v55.l, v32.l, s40 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v13 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v14.l, s42 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v55.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 16, v146 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v146, 16, v147 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s42, v87, v118 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s43, v96, v119 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s45, v98, v129 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s56, v101, v132 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s59, v112, v135 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s60, v113, v144 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v86.l, v32.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v118.l, v86.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v34.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v97.l, v35.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v36.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v39.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v50.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v51.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s46, v99, v130 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s61, v114, v145 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s62, v115, v146 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v96.l, v65.l, v34.l, s43 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v98.l, v67.l, v36.l, s45 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v101.l, v70.l, v39.l, s56 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v112.l, v81.l, v50.l, s59 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v113.l, v82.l, v51.l, s60 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v55.l, s16 -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v118 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v33.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v37.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v102.l, v48.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v52.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0, v53.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v70.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v81.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v87.l, v64.l, v33.l, s42 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v99.l, v68.l, v37.l, s46 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v114.l, v83.l, v52.l, s61 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v115.l, v84.l, v53.l, s62 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v96.l, v34.l, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v98.l, v36.l, s5 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v101.l, v39.l, s8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v39.l, v101.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v112.l, v50.l, s11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v112.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v113.l, v51.l, s12 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v113.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v55 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v100.l, v38.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v103.l, v49.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v54.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s44, v97, v128 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v116.l, v85.l, v54.l, s63 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v87.l, v33.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v99.l, v37.l, s6 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v114.l, v52.l, s13 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v114.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v115.l, v53.l, s14 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v115.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s23 -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v39 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v35.h, v81.l, s26 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v35.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s57, v102, v133 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v97.l, v66.l, v35.l, s44 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v116.l, v54.l, s15 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, v116.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v51 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v48.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v65.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v66.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s47, v100, v131 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s58, v103, v134 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v102.l, v71.l, v48.l, s57 -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v128.l, v96.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v97.l, v35.l, s4 -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v129.l, v97.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v52 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v53 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v38.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v49.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v64.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v71.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v100.l, v69.l, v38.l, s47 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v103.l, v80.l, v49.l, s58 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v119.l, v87.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v102.l, v48.l, s9 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v48.l, v102.l -; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v30.h, v65.l, s18 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v128 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v66.l, s19 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v129 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v80.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0, v85.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v38.l, s7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v103.l, v49.l, s10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v103.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v14.h, v64.l, s17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v119 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v34.h, v71.l, s24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 16, v48 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v65 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v66 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v86.l, v13.h, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s25 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v49 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v37.h, v85.l, s41 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v64 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v71 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v96.l, v30.h, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v97.l, v32.l, s4 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v67.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v68.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v98.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v131.l, v99.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v87.l, v38.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v102.l, v38.h, s9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v67.l, s20 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v130 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v131 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0, v84.l -; 
GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v70 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v67 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0, v82.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v68 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v37.l, v84.l, s29 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v101.l, v34.l, s8 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v83.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v80 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v36.l, v82.l, s27 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.h, v115.l, v37.l, s14 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v69.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v36.h, v83.l, s28 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v103.l, v35.l, s10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v113.l, v36.l, s12 -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v132.l, v100.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.h, v69.l, s22 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v81 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.h, v114.l, v48.l, s13 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v132 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v69 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v33.h, s7 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v39, v39 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v39.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v9 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v8 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 +; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v23 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v22 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v21 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v20 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v7 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v19 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v18 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v17 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v15 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 16, v30 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v17 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v16 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v83, v83 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v83.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v86, v86 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v96.l, v36.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v96.h, v0.l, v16.l, s42 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s43, 0, v96.h ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v31 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v50.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v15.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v68 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v31 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v98.l, v32.h, s5 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v51.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v99.l, v33.l, s6 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v31.l, v15.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v50.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v52, v53 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v32.l, v51.l, v50.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v52 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v51.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v32.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v50.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v38, v53 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v112.l, v39.l, s11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v51.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v14.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v29 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v50 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v30.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s2 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v15.h, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v117, v117 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v31.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v52, v52 -; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v52, 16, v53 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v51, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v52 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v36, v35 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0, v35.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v29.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v28 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v14.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v30.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v52, v51 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v31.l, v36.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v31.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v32, v32 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 
v36.h, v14.h, v30.h, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v30.h, v36.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v36, v37 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s44, 0, v37.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v37.h, v36.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.l, v36.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v29.l, v13.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v28.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v13.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v30.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v13.h, v29.h, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v29.h, v36.h, s2 +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v36, v38 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v30.l, v13.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v51, v50 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v29.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v38.h, v36.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.l, v36.h, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v33.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v28.l, v12.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v12.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v12.h, v28.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v28.h, v36.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v36, v39 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v31.l, v12.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v27.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v11.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v50, 16, v26 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v28.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v31.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v39.h, v36.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v34.l, v36.h, s3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v34.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v48, v48 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v48.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v51, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.h, v28.l, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v11.h, v27.h, s3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.h, v27.h, v36.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v36, v48 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v50 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v10.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; 
GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v11.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v26.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v48.h, v36.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v35.l, v36.h, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v35.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v27.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v51, v50 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v10.h, v26.h, s4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.h, v26.h, v36.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s4, v36, v49 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v31.l, v10.h, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.l, v12.l, v10.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v10.l -; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v50, 16, v8 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v49.h, v36.h, s4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v38.l, v36.h, s5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v38.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.l, v10.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v27, v26 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v12.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v9.h, v25.h, s5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.h, v25.h, v36.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s5, v36, v50 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v9.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v9.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v12.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; 
GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v26 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v50.h, v36.h, s5 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v48.l, v36.h, s6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v48.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v10.l, v9.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v12.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v8.h, v24.h, s6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.h, v24.h, v36.h, s7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s6, v36, v51 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v11.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v27 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v26, v9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v51.h, v36.h, s6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v50.l, v36.h, s7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v50.l 
+; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v11.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v9.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v8.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v11.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v7.h, v23.h, s7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.h, v23.h, v36.h, s8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s7, v36, v52 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v12.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v26 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v25, v24 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.l, v52.h, v36.h, s7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.l, v52.l, v36.h, s8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, 
v52.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v22.l, v6.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v10.l, v7.h, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v5 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v6.h, v22.h, s8 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.h, v22.h, v36.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s8, v36, v53 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v10.l, v7.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v12, v11 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v9.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v22 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.l, v53.h, v36.h, s8 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.h, v53.l, v36.h, s9 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v53.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.l, v6.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v5.h, v21.h, s9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.h, v36.h, s10 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s9, v36, v54 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v7.l, v6.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v8.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.l, v54.h, v36.h, s9 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.l, v54.l, v36.h, s10 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v54.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v4.h, v20.h, s10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.h, v20.h, v36.h, s11 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s10, v36, v55 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v8.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v8.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.l, v55.h, v36.h, s10 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.h, v55.l, v36.h, s11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v55.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v64.l, 
v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v9.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v20 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v12, v11 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v3.h, v19.h, s11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.h, v19.h, v36.h, s12 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s11, v36, v64 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v6.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.l, v64.h, v36.h, s11 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.l, v64.l, v36.h, s12 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v64.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v65.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v7.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v2.h, v18.h, s12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.h, v18.h, v36.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s12, v36, v65 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v18.l, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v11, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.l, v65.h, v36.h, s12 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.h, v65.l, v36.h, s13 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v65.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v66.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v1.h, v17.h, s13 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.h, v17.h, v36.h, s14 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0, v36.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s13, v36, v66 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.l, v66.h, v36.h, s13 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v80.l, v66.l, v36.h, s14 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v66.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v67.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v0.h, v16.h, s14 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.h, v16.h, v36.h, s15 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v36.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v68 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s14, v36, v67 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.h, v15.l, 
v68.l, s15 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.l, v67.h, v36.h, s14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v67.l, v36.h, s16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v67.l +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v82.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v68.l, v82.h, s15 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s15, v82, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v68.l, v36.h, v82.h, s15 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v68.l, v82.h, s16 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v13 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v29 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v14.h, v36.h, s15 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v68.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v14.l, v30.l, s16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v28 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s15, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v30.l, v14.h, s17 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v14.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v27 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s16, v14, v36 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v82, v82 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v36.h, v14.h, s16 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0, v36.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v26 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v14.l, v14.h, s17 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v12 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v25 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v13.h, v36.h, s16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v14.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.l, v29.l, s17 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v24 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s16, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v29.l, v13.h, s18 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v13.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v23 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s17, v13, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.l, v1.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.l, v0.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v12, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l 
-; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v5.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v29.h, s16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v82, v82 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v36.h, v13.h, s17 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0, v36.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v22 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v13.l, v13.h, s18 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v11 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v21 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.h, v12.h, v36.h, s17 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v13.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.l, v28.l, s18 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v20 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s17, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v28.l, v12.h, s19 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v12.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v19 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s18, v12, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v11 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v28.h, s17 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v82, v82 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v36.h, v12.h, s18 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0, v36.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v12.l, v12.h, s19 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v10 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v36.h, s18 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.l, v27.l, s19 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s18, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v27.l, v11.h, s20 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v11.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v17, v16 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v27.h, s18 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s19, v11, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v36.h, v11.h, s19 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v11.l, v11.h, s20 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v9 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v10.h, v36.h, s19 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v11.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v26.l, s20 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s19, 0, v36 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v26.l, v10.h, s21 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v10.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v26.h, s19 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s20, v10, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v36.h, v10.h, s20 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0, v36.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v10.l, v10.h, s21 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v8 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v9.h, v36.h, s20 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v10.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v25.l, s21 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s20, 0, v36 +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v25.l, v9.h, s22 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v9.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v2.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v25.h, s20 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s21, v9, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v36.h, v9.h, s21 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v9.h, s22 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v7 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.h, v8.h, v36.h, s21 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v9.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v24.l, s22 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s21, 0, v36 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v24.l, v8.h, s23 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v8.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v24.h, s21 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s22, v8, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v36.h, v8.h, s22 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0, v36.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v8.l, v8.h, s23 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.h, v7.h, v36.h, s22 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v23.l, s23 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s22, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v23.l, v7.h, s24 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v7.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v23.h, s22 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s23, v7, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v36.h, v7.h, s23 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s24 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81 +; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v81, 16, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.h, v6.h, v36.h, s23 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v7.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v22.l, s24 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s23, 0, v36 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v22.l, v6.h, s25 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v6.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v22.h, s23 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s24, v6, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v36.h, v6.h, s24 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0, v36.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s25 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.h, v5.h, v36.h, s24 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, s25 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s24, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v21.l, v5.h, s26 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v5.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v2.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v8.l, v1.h, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v6.l, v0.h, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v9.l, v2.h, s3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v29 :: v_dual_mov_b32 v1, v49 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v48 :: v_dual_mov_b32 v3, v39 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v38 :: v_dual_mov_b32 v5, v36 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v35 :: v_dual_mov_b32 v7, v34 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v33 :: v_dual_mov_b32 v9, v32 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v10, v31 :: v_dual_mov_b32 v11, v30 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v21.h, s24 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s25, v5, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v36.h, v5.h, s25 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s26 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v4.h, v36.h, s25 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l +; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v4.h, v4.l, v20.l, s26 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s25, 0, v36 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v20.l, v4.h, s27 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0, v4.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v20.h, s25 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s26, v4, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v36.h, v4.h, s26 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0, v36.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s27 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.h, v3.h, v36.h, s26 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v4.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v83.h, v3.l, v19.l, s27 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v81, v81 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s26, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v19.l, v83.h, s28 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v3, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0, v83.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s27, v83, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.h, v2.l, v18.l, s28 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v1.l, v17.l, s40 +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v32.h, v37.h, s44 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v36.h, v83.h, s27 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s40, 0, v82.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v85, v85 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v18.h, s26 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v32.l, v1.l, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v83.h, s41 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v87, v87 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v36.h, s28 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v3.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v31.h, v35.h, s42 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0, v39.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0, v19.h +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s45, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v18.l, v82.h, s29 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0, v38.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v34.h, v39.h, s42 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v48.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s44, v82, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v33.h, v38.h, s29 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v34.l, v1.h, s2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v36.h, v82.h, s44 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v33.l, v0.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v49.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v50.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v35.l, v1.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v82.h, s40 +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v51.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v54.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v49.l, v50.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s45 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v36.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v1.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v52.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v51.l, v51.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v53.h +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v17.l, v19.h, s27 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v38.l, v0.h, s4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v69.l, v52.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v55.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v48.l, v1.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v19, v36 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v69.h, v53.h, s2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v70.h, v55.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v66.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v36.h, v19.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v65.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v50.l, v2.h, s6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v70.l, v54.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v52.l, v0.h, s7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v16.h, v19.h, s28 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v53.l, v1.h, s8 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v71.h, v65.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v54.l, v2.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v67.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v36.h, 
s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v64.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v65.l, v1.h, s12 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v67.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v55.l, v3.h, s10 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v16.l, v96.h, s41 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v71.l, v64.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v80.l, v66.h, s2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v17.l, s4 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v96, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v64.l, v0.h, s11 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v66.l, v16.l, s13 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v67.l, v15.l, s14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v68.l, v30.h, s15 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v36.h, v96.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v36.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v16.l, v96.h, s43 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v17.h, v36.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v16.l, v17.h, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v32bf16: @@ -14612,34 +14367,29 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 { ; GFX11-TRUE16-LABEL: v_maximumnum_bf16_no_ieee: ; GFX11-TRUE16: ; 
%bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; 
GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_bf16_no_ieee: @@ -14675,40 +14425,34 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, 
vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v1.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v1.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_bf16_no_ieee: @@ -14949,58 +14693,46 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> % ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v3, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; 
GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v2bf16_no_ieee: @@ -15057,62 +14789,56 @@ define <2 x bfloat> @v_maximumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> % ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v6 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 
v0.h, v2.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v2bf16_no_ieee: @@ -15458,66 +15184,67 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> % ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 
16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v8 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, 
v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v3bf16_no_ieee: @@ -15589,77 +15316,80 @@ define <3 x bfloat> @v_maximumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> % ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v8 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v4.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v7, v10 -; GFX12-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s2, 0, v1.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s3, v9, v11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo +; 
GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v0, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v4.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v3bf16_no_ieee: @@ -16117,85 +15847,83 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> % ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, 
v1.h, v3.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v10, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX11-TRUE16-NEXT: 
v_cmp_eq_u16_e32 vcc_lo, 0, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v9, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v11, v12 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 
v1.h, v8.l, v0.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v8, v6 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v0.l, v0.l, v2.l, s2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_maximumnum_v4bf16_no_ieee: @@ -16290,99 +16018,98 @@ define <4 x bfloat> @v_maximumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> % ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 
16, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v4.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v10, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v6.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v9, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s1, v11, v12 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v7.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v1.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0, v7.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, 
v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v8.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s0, v8, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v0.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_gt_f32_e64 s2, v0, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0, v6.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0, v5.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_maximumnum_v4bf16_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll index 678d0a432a44f..416a601797617 100644 --- 
a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll @@ -120,34 +120,29 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX11-TRUE16-LABEL: v_minimumnum_bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_bf16: @@ -183,40 +178,34 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_bf16: @@ -344,21 +333,21 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX11-TRUE16-LABEL: v_minimumnum_bf16_nnan: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, s0 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_bf16_nnan: @@ -386,25 +375,25 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, v0.l +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v0.l, vcc_lo ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_bf16_nnan: @@ -639,58 +628,46 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5 -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v3, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 
v0.h, v0.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 
v5.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16: @@ -747,62 +724,56 @@ define <2 x bfloat> @v_minimumnum_v2bf16(<2 x bfloat> %x, <2 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v5.l, v1.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16: @@ -1024,34 +995,29 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h ; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 
v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.h, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16_nnan: 
@@ -1097,36 +1063,35 @@ define <2 x bfloat> @v_minimumnum_v2bf16_nnan(<2 x bfloat> %x, <2 x bfloat> %y) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h ; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v0.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.h, v0.l, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, s2 -; GFX12-TRUE16-NEXT: 
v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.h, v0.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, s1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.h, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16_nnan: @@ -1459,66 +1424,67 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v8 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: 
v_cmp_eq_f32_e64 s1, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16: @@ -1590,77 +1556,80 @@ define <3 x bfloat> @v_minimumnum_v3bf16(<3 x bfloat> %x, <3 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v8 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, 
vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v0, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
v_minimumnum_v3bf16: @@ -1957,41 +1926,40 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v6 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.h, v0.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s1 +; GFX11-TRUE16-NEXT: 
v_cmp_eq_f32_e64 s0, 0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16_nnan: @@ -2047,48 +2015,50 @@ define <3 x bfloat> @v_minimumnum_v3bf16_nnan(<3 x bfloat> %x, <3 x bfloat> %y) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v6 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v8 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v3.l, v1.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, vcc_lo +; 
GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.l, v0.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v0.h, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.h, v1.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.l, v0.l, vcc_lo ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v8 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.h, v0.l, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.h, v2.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) 
| instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v0.l, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v0.h, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16_nnan: @@ -2528,85 +2498,83 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 
16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v10, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v9, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v11, v12 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v8, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0 +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16: @@ 
-2701,99 +2669,98 @@ define <4 x bfloat> @v_minimumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v3.l, v3.l, v1.l, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v10, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v9, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v11, v12 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v7.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v13 -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v7.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v8.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v8, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v0.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v0, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v5.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16: @@ -3181,53 +3148,52 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v5, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v7, v6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v6, v8 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v3.h -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v5.h, v0.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: 
v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16_nnan: @@ -3296,62 +3262,63 @@ define <4 x bfloat> @v_minimumnum_v4bf16_nnan(<4 x bfloat> %x, <4 x bfloat> %y) ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v2 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v5, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v7, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.h +; 
GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.l, v1.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v1.h, s4 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v6, v8 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.h -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v1.l, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v3.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v6 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, s4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.h, v1.l, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 ; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v2.h, v0.h, vcc_lo -; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v7.l, v5.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.h -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v3.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v3.h -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.h, v1.h, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v3.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v5.h, v0.l, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v0.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v8 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.h, v0.l, s0 ; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v2.h, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v4.l, v1.l, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v0.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v1.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.h, v0.h, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16_nnan: @@ -3984,125 +3951,120 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v8.l, v9.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.h, v6.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v5.h, v9.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v13, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v9.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v6.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v4.h, v9.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v10 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v10.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v9.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v7.l ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v8.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s2 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v13, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v8.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v7.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v11.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v9.l, v8.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v6.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v13.l, v8.l, s3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo -; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v5.h, v5.h, v9.l, s4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.l, v10.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v10.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v12.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v10, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v4.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v11 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 ; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v3.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v9.h, s0 
+; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v11 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v11.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v8.l, v9.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v8.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v5.l, v12.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v12.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v5, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v9.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v4.l, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v12, v12 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v4.l, v1.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v1.h +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v11.h, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v4.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v1, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v9.h, v1.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.h, v10.h, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v9.h, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v1.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v3.l, v0.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v0, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v9.h, v0.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v8.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v9.h, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 
v0.h, v6.h, v8.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s4 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v6bf16: @@ -4233,142 +4195,141 @@ define <6 x bfloat> @v_minimumnum_v6bf16(<6 x bfloat> %x, <6 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v5 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v0 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v7, v7 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v9, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v9.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v2.h, v5.h, vcc_lo ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v10, v10 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.h, v6.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v5.h, v9.h, s0 +; GFX12-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s0, 0x8000, v9.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v13, v13 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v8 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v13, v13 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v1.h, v4.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.h, v8.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v3.h, s2 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v13, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v14 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v8.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v8.h, v9.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v7.l, v6.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v13 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v9.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v6.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l +; 
GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v9 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v3.h, v10.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v10.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v11.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v4.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v9.l, v8.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v6.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v13.l, v8.l, s3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v13.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.h, v7.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v15 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v4.h, v9.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v10 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v10.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v8, v8 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v12.l, v10.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v9.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v9.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v10.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v1.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v7 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v12.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v9.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v7.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v0.h, v3.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v5.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v12.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v10, v9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v9.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v11 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v11.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v9.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v8 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v4.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v8.l, v9.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v8.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.l, v5.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v5.l, v12.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v12.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v9 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.h, 
v12.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v12.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v5, v5 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.h, v9.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v4.l, s3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v9.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v12, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v4.l, v1.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v1.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v2.h, v11.h, s7 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v1, v9 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v9.h, v1.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v9.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v5, v5 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.h, v10.h, s6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.h, v9.h, s3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v3.l, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v9.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v1.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v3.l, v0.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.h, s3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v0, v9 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.h, v5.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.l, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v9.h, v0.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v8.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v11 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v9 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v0.h, v9.h, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.h, v8.h, s5 +; GFX12-TRUE16-NEXT: 
v_cmp_eq_f32_e64 s4, 0, v9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v6.l, v0.h, s1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.l, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v3.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s4 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v6bf16: @@ -5250,171 +5211,160 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.h, v6.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v7.h, v8.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v15, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v5.h, v12.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v13, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v10.l, v8.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v11.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v11.l, v9.l, s3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v15.l, v9.l, s4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v7.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v17, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v17, v17 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v12.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v8.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v6.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v13.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v12.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v9.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v5.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 
s0, 0x8000, v12.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v12.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v10.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v16, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v4.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v12.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v7.l, v16.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v16, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v16.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v16.h, s4 
+; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v7, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.h, v12.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v6.l, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v16, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v6.l, v2.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v10.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v16, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v6.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v2, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v12.h, v2.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v12.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.h, v13.h, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.h, v12.h, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v5.l, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, s2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v9.l, v8.h, v11.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v4.h, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v16, v16 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v11, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, v9.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.h, v10.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v12.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v13, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v8.l, v10.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v10.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v10.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v8.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v10 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v15, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v4.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v6.l, v2.l, s3 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v13 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v6.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v3.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 
v2.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v4.h, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v1.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v5.l, v1.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v1, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v12.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v7, v7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v12.h, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v4.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.l, v0.h, s7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v0, v12 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v12.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v12.h, v0.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 
0x8000, v11.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.h, v11.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v14.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v15.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v12.h, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v14.h, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v15.h, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v1.h, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v0.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v8bf16: @@ -5577,201 +5527,187 @@ define <8 x bfloat> @v_minimumnum_v8bf16(<8 x bfloat> %x, <8 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v7 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v5 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff0000, v5 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v12.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v3.h, v7.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v2.h, v6.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v3.h, v7.h, vcc_lo +; GFX12-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v7.h, v8.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v6.h, v9.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v10.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v1.h, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v9.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v15, v17 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v5.h, v12.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v13, v18 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l -; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v10.l, v8.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v11.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v11.l, v9.l, s3 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v14.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v13.l, v8.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v15.l, v9.l, s4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v10.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.h, v11.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v20 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v0.h, v4.h, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v16, v16 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v11, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v7.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v17, v17 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v6 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v17, v17 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v13.l, v8.l, vcc_lo -; 
GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v15.l, v9.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.h, v10.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v14.l, v12.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v7 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v8.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v11.h, v12.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v7.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v12.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v8.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v12.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v2.h, v6.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v13, v15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v6.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 
0x8000, v13.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v14.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v8.l, v10.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v10.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v6.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v12, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v14 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v10.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v8.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.h, v8.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v11.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v7.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v5 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v13.h, v12.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v5.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v13, v13 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v9.h, v9.l, v12.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v9.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v14, v14 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v12 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v1.h, v5.h, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v5.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v14 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v14.h, v12.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v12.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v10.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v16, v16 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v15, 16, v15 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v14, v14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v0.h, v4.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v10 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v15, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v4.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v15 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v3.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v4.l, v0.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v15.h, v12.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v12.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v6.l, v2.l, s3 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v11.l, v12.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v11.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.h, v7.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v8.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.l, v7.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v7.l, v16.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v16, v12 +; 
GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v12.h, v16.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v16.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v7, v7 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.h, v12.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v3.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v6.l, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v12.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v16, v16 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v4 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v12 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v6.l, v2.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v2.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v16, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v2, v12 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v12.h, v2.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v12.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v7, v7 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.h, v13.h, s8 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v14, v13 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v5.l, v1.l, vcc_lo 
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.h, v12.h, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v5.l, s5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v12.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v9.l, v2.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v12 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v5.l, v1.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v1.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v6.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v2.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v5.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v1, v12 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v12.h, v1.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v12.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v1.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v7, v7 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v12.h, s5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, 
v4.l, s6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v12.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v12 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.l, v0.h, s7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v5.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v0, v12 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v12.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v3.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v12.h, v0.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v11.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v4.l, v4.h, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v1.h, s3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.h, v11.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v14.h +; 
GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v15.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v12.h, s9 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.h, v0.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.h, v14.h, s6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v15.h, s7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v1.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v10.l, v0.h, s3 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v7.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v9 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, v8 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v11.l, v5.l, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v4.l, s1 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v8bf16: @@ -7391,341 +7327,314 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX11-TRUE16-LABEL: v_minimumnum_v16bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v18, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13 -; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v16.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v16, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v16.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v17.l, v16.h, 
s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v17.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v6.h, v14.h, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v29, v29 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.h, v14.h, v16.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v32, v32 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v16, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.h, v16.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v32, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v18.l, v16.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v32, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v5.h, v13.h, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v32, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v13.h, v16.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v16.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v32, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v16, v25 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v25.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v32, v32 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v25.h, v16.h, s1 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v20.l, v16.h, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v20.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v32, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v4.h, v12.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v12.h, v16.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v16, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v26.h, v16.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.h, v21.l, v16.h, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v21.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 
v16.h, v3.h, v11.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v16.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v16, v27 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v27.h, v16.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.h, v22.l, v16.h, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v22.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v2.h, v10.h, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.h, v10.h, v16.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v16, v28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v28.h, v16.h, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.h, v23.l, v16.h, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v1.h, v9.h, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v9.h, v16.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v16, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.l, v29.h, v16.h, s5 +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v24.h, v24.l, v16.h, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v24.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v30, v30 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v0.h, v8.h, s6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v8.h, v16.h, s7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v16, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v30.h, v16.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.l, v15.h, v16.h, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v15.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v15.l, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v16.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v7.h, s8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v7.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s7, v7, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v16.h, v7.h, s7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s8 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v6.h, v16.h, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v7.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v14.l, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v14.l, v6.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v14.h, s7 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s8, v6, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.h, v6.h, s8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v5.h, v16.h, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v6.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v13.l, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v13.l, v5.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v13.h, s8 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s9, v5, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.h, s9 +; 
GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.h, v16.h, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v5.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v12.l, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v12.l, v4.h, s11 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v12.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s10, v4, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v16.h, v4.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v16.h, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.l, v11.l, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v11.l, v3.h, s12 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v11.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s11, v3, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.h, v3.h, s11 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v3.h, s12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v2.h, v16.h, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v10.l, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v10.l, v2.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo -; 
GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v6.l, v5.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v20.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v24, v23 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v19.l, v7.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v19.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.l, v7.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v24, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v22.l, v20.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v20.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v19.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v20.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v3.h, v11.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v26, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v19.l, v22.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v11.h, v20.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v20.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v5.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v5.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v7.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.l, v21.l, v20.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v20.l -; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v5.l, v5.l, v7.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v22.l, v20.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v9.h, v23.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.l, v21.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v27, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v7.l, v6.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v21, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, 
v6.l, v7.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.l, v5.l, v23.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v23.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v24 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v21.l, v23.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v15.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v20.l, v6.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v23, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.l, v16.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v7.l, v19.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v16.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v14.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v19.l, s0 -; GFX11-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s0, 0x8000, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v24, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v17.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v7.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v16.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.l, v16.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.l, v17.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v17.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v16.l, v17.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v14.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v18.l, v15.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v14.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v17 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v18, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, 
v10.h, s11 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s12, v2, v16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v16.h, v2.h, s12 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v31, v31 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v16.h, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v9.l, s13 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v9.l, v1.h, s14 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v9.h, s12 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s13, v1, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v0.l, v8.l, s14 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v1.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v17.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, s15 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.l, v17.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v18.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v20.h, v25.h, s17 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v32.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, 
v16.h, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.h, v18.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v20.l, v1.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v8.l, v32.h, s14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v18.l, v0.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v21.h, v26.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v28.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v32, v16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v21.l, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v29.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v23.h, v28.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v16.h, v32.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v29.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.l, v32.h, s15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v23.l, v1.h, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v22.l, v0.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v24.l, v8.h, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v16.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v15.h, v0.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v12.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.l, v4.h, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v13.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v10.l, v2.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v8.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v12.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v22, v17 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, 
v4.l, v2.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v13.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v21 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v17 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12 -; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v2.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v8.l, v1.h, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.l, v10.l, v0.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.l, v13.l, v2.h, s3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v16bf16: @@ -8044,406 +7953,355 @@ define <16 x bfloat> @v_minimumnum_v16bf16(<16 x bfloat> %x, <16 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v16, v7 :: v_dual_mov_b32 v17, v6 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v18, v5 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v15 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff0000, v7 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff0000, v15 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v6 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v14 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v16 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v17 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v15.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v5.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.h, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v19 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v6.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.h, v14.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v18 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v14.h, v7.l, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v7.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v18.h, v13.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v6.l, v5.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v22 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v13.h, v20.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v21.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v24, v23 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 
v25.l, v20.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v19.l, v7.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v26 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v19.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v23.l, v7.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v21.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v12 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v24, v25 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v4.h, v12.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v22.l, v20.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v20.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v19.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v12.h, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v21 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v6.l, v20.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v11 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v22.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v20 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff 
-; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v3.h, v11.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v23.l, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v26, v21 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v19.l, v22.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v11.h, v20.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v20.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v7.l, v5.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v5.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v13 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v12 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v7.h, v15.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v15.h, v16.h, vcc_lo +; GFX12-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, 
v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v16.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v16, v17 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v16.l ; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v19.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v7.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v23 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v16.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.h, v10.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v25 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.h, v16.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v21.l, v20.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v20.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v7.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v10.h, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v22.l, v20.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff0000, v9 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v22.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v23 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v1.h, v9.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v24, v24 -; 
GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v21.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v26 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v19.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v9.h, v23.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v20.l, v21.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v24 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v27, v25 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v23.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v5.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v22.l, v19.l, vcc_lo -; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v7.l, v6.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v8 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v6.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v20.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v21, v24 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v0.h, v8.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v7.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v25 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v5.l, v23.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v23.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.h, v19.l, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v16 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v24 
-; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v21.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v21.l, v23.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v19.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v15 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v16.l, v15.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v22, v22 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v20.l, v6.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v23, v24 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v15.l, v16.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v17 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v7.l, v19.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v19.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v16.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v14 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v15.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v14.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v22, v22 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v19.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v25 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v24, v23 -; 
GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v17.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v21.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v7.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v22 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v17.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v14.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v18 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v6.l, v16.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v13 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.l, v16.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v18.l, v13.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v7.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v6.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v14.l, v17.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v17.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v13.l, v15.l, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v24 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v16.l, v17.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v14.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v12 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v17.l, v16.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v17.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff0000, v9 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v6.h, v14.h, s0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v8 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v29, v29 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.h, v14.h, v16.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v7 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v32, v32 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v16, v18 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v14 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v12.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v14.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.l, v18.h, v16.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v32, v32 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v13 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v19.h, v18.l, v16.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v18.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v20, v20 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v32, v32 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v3 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v17 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v11 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v6.l, v15.l, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v5.h, v13.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v32, v32 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v13.h, v16.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v32, v32 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v16, v25 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v25.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v32, v32 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v11.l, s1 -; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v13, v13 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v5.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v15.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v18, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v25.h, v16.h, s1 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v9 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v20.l, v16.h, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v20.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v21, v21 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v32, v32 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v32.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v16 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v3.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v21 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v5.l, v15.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v4.h, v12.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v12.h, v16.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v16, v26 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v10.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v17, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 
0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v13 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v12.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.l, v4.h, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v18 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.l, v26.h, v16.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.h, v21.l, v16.h, s3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v21.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v22, v22 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v16 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v13.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v10.l, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v8 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v3.h, v11.h, s3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v16.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v16, v27 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, 
v9.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v18, v18 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v8.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v21, v21 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v12.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.l, v27.h, v16.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.h, v22.l, v16.h, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v22.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v23, v23 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v16 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v1.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v10.l, v3.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v8.l, v0.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v12.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v22, v17 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v9.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v0.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v2.h, v10.h, s4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.h, v10.h, v16.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v16, v28 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v11.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v18, v17 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.l, v28.h, v16.h, s4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.h, v23.l, v16.h, s5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v23.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v24, v24 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v16 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v22, v21 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v11 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v9.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v13.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v1.h, v9.h, s5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v9.h, v16.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v16, v29 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v3.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: s_wait_alu 
0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.l, v29.h, v16.h, s5 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.h, v24.l, v16.h, s6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v24.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v30, v30 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v30.l, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v0.h, v8.h, s6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v8.h, v16.h, s7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v16, v30 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v30.h, v16.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.l, v15.h, v16.h, s7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v15.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v15.l, s6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v15.l, v7.h, s8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v7.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s7, v7, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v16.h, v7.h, s7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v16.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s8 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v6.h, v16.h, s7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v7.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v14.l, s8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v14.l, v6.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v6.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v11.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v9.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v14.h, s7 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s8, v6, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v16.h, v6.h, s8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v5.h, v16.h, s8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v6.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v13.l, s9 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v13.l, v5.h, s10 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v5.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v13.h, s8 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s9, v5, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v16.h, v5.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v16.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s10 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v4.h, v16.h, s9 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v5.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v12.l, s10 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v12.l, v4.h, s11 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v4.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v18 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v21 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v17 -; 
GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v12 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v11.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v12.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s10, v4, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v16.h, v4.h, s10 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s11 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v3.h, v16.h, s10 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v3.l, v11.l, s11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v11.l, v3.h, s12 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v3.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v11.h, s10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s11, v3, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.h, v3.h, s11 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v16.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.l, v3.h, s12 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v31, v31 +; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v1 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v2.h, v16.h, s11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v3.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v2.l, v10.l, s12 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v10.l, v2.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v2.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v2.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v8.l, v1.h, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.l, v10.l, v0.h, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.l, v13.l, v2.h, s3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v16 :: v_dual_mov_b32 v1, v14 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v15 :: v_dual_mov_b32 v3, v20 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v10.h, s11 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s12, v2, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v16.h, v2.h, s12 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v2.l, v2.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v31, v31 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v1.h, v16.h, s12 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.l, v9.l, s13 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v16.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v16 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v19 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v9.l, v1.h, s14 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v1.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v31, 16, v8 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v9.h, s12 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s13, v1, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v0.l, v8.l, s14 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v16.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v31, v31 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v1.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v17.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v1.h, s15 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.l, v17.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v18.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v20.h, v25.h, s17 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v32.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.h, s16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v1.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v17.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v19.h, v18.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v26.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v20.l, v1.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v16 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v8.l, v32.h, s14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v18.l, v0.h, s0 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, 
v21.h, v26.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v28.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v32, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s13 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v21.l, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v29.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v23.h, v28.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v16.h, v32.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v16.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v22.h, v27.h, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v24.h, v29.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v8.l, v32.h, s15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v23.l, v1.h, s4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v22.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v24.l, v8.h, s5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v25.l, v30.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v16.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.h, v8.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v15.h, v0.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v8.h, vcc_lo ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v16bf16: @@ -11736,666 +11594,619 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX11-TRUE16-LABEL: v_minimumnum_v32bf16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v33, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: scratch_load_b32 v68, off, s32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v36.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.h, v30.h, s1 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v16 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v55, v55 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v85, v85 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.l, v30.h, v32.l, s2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v48, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v36.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v27 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v35, v35 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v37, v37 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v39, v39 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v49, v49 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v51, v51 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v53, v53 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v54, v54 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v65, v65 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v67, v67 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v69, v69 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v71, v71 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v86, v86 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.l, v0.h, v16.h, s29 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, v55.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v36, v36 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v38, v38 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v48, v48 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v50, v50 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v52, v52 -; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e64 s16, v64, v64 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v66, v66 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v68, v68 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v70, v70 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v80, v80 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v83, v83 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v13.h, v29.h, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v12.h, v28.h, s5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v11.h, v27.h, s7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v10.h, v26.h, s9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v9.h, v25.h, s11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v24.h, s13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v7.h, v23.h, s15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v6.h, v22.h, s17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v5.h, v21.h, s19 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v4.h, v20.h, s21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v3.h, v19.h, s23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v85.l, v16.h, v54.l, s40 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v15 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v14 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s27 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.l, v29.h, v33.l, s4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.l, v28.h, v34.l, s6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.l, v27.h, v35.l, s8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.l, v26.h, v36.l, s10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v68.l, v25.h, v37.l, s12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.l, v24.h, v38.l, s14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.l, v23.h, v39.l, s16 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.l, v22.h, v48.l, s18 -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v80.l, v21.h, v49.l, s20 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v81.l, v20.h, v50.l, s22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.l, v19.h, v51.l, s24 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.l, v54.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s40, v86, v118 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.l, v85.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v30 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v87, v87 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v96, v96 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.l, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v34.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.l, v36.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.l, v39.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.l, v50.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.l, v51.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v83.l, v18.h, v52.l, s26 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v84.l, v17.h, v53.l, s28 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v65.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v67.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v68.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v69.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v71.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.l, v80.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v81.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.l, v82.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v97, v97 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, s41 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.l, v37.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.l, v52.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.l, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.l, v84.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v119 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v128 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v129 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v130 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v131 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v132 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v133 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v134 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v135 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 16, v144 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 16, v145 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s63, v116, v86 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v86.l, v55.l, v32.l, s40 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v14.l, s42 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, v99 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 16, v146 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v146, 16, v147 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s42, v87, v118 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s43, v96, v119 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s45, v98, v129 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s56, v101, v132 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s59, v112, v135 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s60, v113, v144 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v86.l, v32.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.l, v86.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v34.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v97.l, v35.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v36.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v39.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v50.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v51.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s46, v99, v130 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s61, v114, v145 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s62, v115, v146 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v96.l, v65.l, v34.l, s43 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v98.l, v67.l, v36.l, s45 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v101.l, v70.l, v39.l, s56 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v112.l, v81.l, v50.l, s59 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v113.l, v82.l, v51.l, s60 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v55.l, s16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v118 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v33.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v37.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.l, v48.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v52.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0x8000, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v70.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v81.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v87.l, v64.l, v33.l, s42 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v99.l, v68.l, v37.l, s46 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v114.l, v83.l, v52.l, s61 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v115.l, v84.l, v53.l, s62 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v96.l, v34.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v98.l, v36.l, s5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v101.l, v39.l, s8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v101.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v112.l, v50.l, s11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v112.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v113.l, v51.l, s12 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v113.l -; 
GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.l, v38.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.l, v49.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s44, v97, v128 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v116.l, v85.l, v54.l, s63 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v87.l, v33.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v99.l, v37.l, s6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v114.l, v52.l, s13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v114.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v115.l, v53.l, s14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v115.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v39 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v35.h, v81.l, s26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v35.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s57, v102, v133 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v97.l, v66.l, v35.l, s44 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v116.l, v54.l, s15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v116.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v51 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v48.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v65.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v66.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s47, v100, v131 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s58, v103, v134 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v102.l, v71.l, v48.l, s57 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.l, v96.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v97.l, 
v35.l, s4 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.l, v97.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v52 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v53 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v38.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v49.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v64.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v71.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v100.l, v69.l, v38.l, s47 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v103.l, v80.l, v49.l, s58 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v87.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v102.l, v48.l, s9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v102.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v30.h, v65.l, s18 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v128 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v66.l, s19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v129 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v80.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0x8000, v85.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v38.l, s7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v103.l, v49.l, s10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v103.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v14.h, v64.l, s17 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v119 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v34.h, v71.l, s24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 16, v48 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v65 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v66 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v86.l, v13.h, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s25 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v49 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v37.h, v85.l, s41 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v64 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v71 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v96.l, v30.h, s3 -; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v30.h, v97.l, v32.l, s4 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v67.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v68.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v98.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v99.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v87.l, v38.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v102.l, v38.h, s9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v67.l, s20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v130 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s21 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v131 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0x8000, v84.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v70 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v67 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0x8000, v82.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v68 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v37.l, v84.l, s29 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v101.l, v34.l, s8 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v83.l -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v80 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v36.l, v82.l, s27 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.h, v115.l, v37.l, s14 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v69.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v36.h, v83.l, s28 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v103.l, v35.l, s10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v113.l, v36.l, s12 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.l, v100.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.h, v69.l, s22 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v81 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.h, v114.l, v48.l, s13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v132 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v69 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v33.h, s7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v39, v39 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v22 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v21 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v6 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v18 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v17 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 
s15, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 16, v30 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v2 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v17 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v83, v83 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v86, v86 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v96.h, v0.l, v16.l, s42 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s43, 0x8000, v96.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v68 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v36, v35 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0x8000, v35.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v31.l, v36.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v31.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v32, v32 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v14.h, v30.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.h, v30.h, v36.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v36, v37 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s44, 0x8000, v37.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v37.h, v36.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.l, v36.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v13.h, v29.h, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v29.h, v36.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v36, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v38.h, v36.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.l, v36.h, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v33.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v12.h, v28.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.h, v28.h, v36.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v36, v39 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v39.h, v36.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.h, v34.l, v36.h, s3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v34.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v48, v48 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v36.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v50.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v15.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v31 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.h, v98.l, v32.h, s5 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v51.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.h, v99.l, v33.l, s6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v31.l, v15.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v50.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v52, v53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v11.h, v27.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.h, v27.h, v36.h, s4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v36, v48 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v48.h, v36.h, s3 +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v35.l, v36.h, s4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v35.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v36.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.l, v50.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v52 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v51.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v32.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v50.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v10.h, v26.h, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.h, v26.h, v36.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v36, v49 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v49.h, v36.h, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v38.l, v36.h, s5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v38.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v9.h, v25.h, s5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.h, v25.h, v36.h, s6 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v36, v50 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v50.h, v36.h, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v48.l, v36.h, s6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v48.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v8.h, v24.h, s6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.h, v24.h, v36.h, s7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v36, v51 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v50.l, v51.h, v36.h, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v51.l, v50.l, v36.h, s7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v50.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v7.h, v23.h, s7 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.h, v23.h, v36.h, s8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s7, v36, v52 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v52.l, v52.h, v36.h, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.l, v52.l, v36.h, s8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v52.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v6.h, v22.h, s8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.h, v22.h, v36.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s8, v36, v53 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v53.l, v53.h, v36.h, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v69.h, v53.l, v36.h, s9 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v53.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v5.h, v21.h, s9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.h, v36.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s9, v36, v54 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v54.l, v54.h, v36.h, s9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.l, v54.l, v36.h, s10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v54.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v4.h, v20.h, s10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.h, v20.h, v36.h, s11 +; GFX11-TRUE16-NEXT: 
v_cmp_eq_u16_e64 s11, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s10, v36, v55 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v55.l, v55.h, v36.h, s10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v70.h, v55.l, v36.h, s11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v55.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v3.h, v19.h, s11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.h, v19.h, v36.h, s12 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s11, v36, v64 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v64.l, v64.h, v36.h, s11 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.l, v64.l, v36.h, s12 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v64.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v2.h, v18.h, s12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.h, v18.h, v36.h, s13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s12, v36, v65 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v65.l, v65.h, v36.h, s12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v71.h, v65.l, v36.h, s13 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v36.h, v65.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v1.h, v17.h, s13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.h, v17.h, v36.h, s14 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s13, v36, v66 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v66.l, v66.h, v36.h, s13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v80.l, v66.l, v36.h, s14 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v66.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v36.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v0.h, v16.h, s14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.h, v16.h, v36.h, s15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v68 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s14, v36, v67 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.h, v15.l, v68.l, s15 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v67.l, v67.h, v36.h, s14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, 
v67.l, v36.h, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v67.l +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v82.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v68.l, v82.h, s15 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s15, v82, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v68.l, v36.h, v82.h, s15 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v68.l, v82.h, s16 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v29 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.h, v14.h, v36.h, s15 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v68.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v14.l, v30.l, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v28 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s15, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v30.l, v14.h, s17 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v14.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v38, v53 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.h, v112.l, v39.l, s11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v51.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v29 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v50 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v15.h, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v117, v117 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v31.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v27 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s16, v14, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v82, v82 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v36.h, v14.h, s16 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v26 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v14.l, v14.h, s17 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v12 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v25 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.h, v13.h, v36.h, s16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v14.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.l, v29.l, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v24 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s16, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v29.l, v13.h, s18 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v13.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v82, v82 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v23 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s17, v13, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v29.h, s16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v82, v82 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v36.h, v13.h, s17 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v13.l, v13.h, s18 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v11 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v21 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.h, v12.h, v36.h, s17 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v13.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.l, v28.l, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v20 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s17, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v28.l, v12.h, s19 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v12.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v82, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v19 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s18, v12, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v28.h, s17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v82, v82 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v36.h, v12.h, s18 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l +; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v12.l, v12.h, s19 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v36.h, s18 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.l, v27.l, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s18, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v27.l, v11.h, s20 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v11.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v27.h, s18 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s19, v11, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v36.h, v11.h, s19 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v11.l, v11.h, s20 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.h, v10.h, v36.h, s19 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v26.l, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s19, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v26.l, v10.h, s21 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v10.h +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v26.h, s19 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s20, v10, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v36.h, v10.h, s20 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v10.l, v10.h, s21 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.h, v9.h, v36.h, s20 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v10.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v25.l, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s20, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v25.l, v9.h, s22 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v9.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v25.h, s20 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s21, v9, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v36.h, v9.h, s21 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v9.h, s22 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v24.h, v8.h, v36.h, s21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v24.l, s22 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s21, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v24.l, v8.h, s23 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v8.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v24.h, s21 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s22, v8, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v36.h, v8.h, s22 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v8.l, v8.h, s23 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v23.h, v7.h, v36.h, s22 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v23.l, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s22, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v23.l, v7.h, s24 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v23.h, s22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s23, v7, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v36.h, v7.h, s23 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s24 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v81, v81 +; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v22.h, v6.h, v36.h, s23 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v22.l, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s23, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v22.l, v6.h, s25 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v22.h, s23 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s24, v6, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v36.h, v6.h, s24 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s25 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v21.h, v5.h, v36.h, s24 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s24, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v21.l, v5.h, s26 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v5.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v21.h, s24 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s25, v5, v36 +; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v5.l, v36.h, v5.h, s25 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s26 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v20.h, v4.h, v36.h, s25 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v20.l, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s25, 0, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v20.l, v4.h, s27 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0x8000, v4.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v20.h, s25 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s26, v4, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v36.h, v4.h, s26 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v81, v81 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v18.h, v3.h, v36.h, s26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v83.h, v3.l, v19.l, s27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v81, v81 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s26, 0, v36 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: 
v_cndmask_b16 v13.l, v13.l, v29.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v52, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v51, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v52 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v29.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v28 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v52, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v29.l, v13.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v13.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v52, 16, v27 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v30.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v30.l, v13.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v51, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v29.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v28.l, v12.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v31.l, v12.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v11.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v51, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.h, v28.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v28, 16, v25 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v51, v50 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v37.l, v31.l, v10.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v25.l, v12.l, v10.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.l, v10.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v27, v26 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v12.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v9.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v12.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo -; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v26 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v10.l, v9.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v12.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v27 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v11.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v26 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v25, v24 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v22.l, v6.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v32.l, v10.l, v7.h, 
s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v10.l, v7.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v11 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v9.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v22 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v7.l, v6.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, 
v21 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v8.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v20 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v12, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v6.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 
s2, 0x8000, v6.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v35.l, v7.l, v4.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v18.l, v2.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.l, v0.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v12, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, 
v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v16 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v49.l, v2.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v48.l, v8.l, v1.h, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v39.l, v6.l, v0.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v38.l, v9.l, v2.h, s3 -; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v29 -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, v49 :: v_dual_mov_b32 v2, v48 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v39 :: v_dual_mov_b32 v4, v38 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v36 :: v_dual_mov_b32 v6, v35 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v7, v34 :: v_dual_mov_b32 v8, v33 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v32 :: v_dual_mov_b32 v10, v31 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v11, v30 :: v_dual_mov_b32 v12, v37 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v19.l, v83.h, s28 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v3, v3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0x8000, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.l +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s27, v83, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v82.h, v2.l, v18.l, s28 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v19.h, v1.l, v17.l, s40 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v32.h, v37.h, s44 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v36.h, v83.h, s27 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s40, 0x8000, v82.h +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v85, v85 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v18.h, s26 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v32.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v83.h, s41 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v87, v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v36.h, s28 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v31.h, v35.h, s42 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0x8000, v39.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v19.h +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s45, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v18.l, v82.h, s29 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 
0x8000, v38.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v34.h, v39.h, s42 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s44, v82, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v33.h, v38.h, s29 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v34.l, v1.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v36.h, v82.h, s44 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v33.l, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v50.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v35.l, v1.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v82.h, s40 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v51.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v54.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v49.l, v50.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s45 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v36.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v52.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v51.l, v51.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v53.h +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v17.l, v19.h, s27 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v38.l, v0.h, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v69.l, v52.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v55.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v48.l, v1.h, s5 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v19, v36 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v69.h, v53.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v70.h, v55.h, s4 +; 
GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v66.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.h, v36.h, v19.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v65.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v50.l, v2.h, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v70.l, v54.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v52.l, v0.h, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v16.h, v19.h, s28 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v53.l, v1.h, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v71.h, v65.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v54.l, v2.h, s9 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v67.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v36.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.h +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v64.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v65.l, v1.h, s12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v67.h, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v55.l, v3.h, s10 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v36.h, v16.l, v96.h, s41 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v71.l, v64.h, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v80.l, v66.h, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v17.l, s4 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v96, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v64.l, v0.h, s11 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v66.l, v16.l, s13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v67.l, v15.l, s14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v68.l, v30.h, s15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v16.l, v36.h, v96.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v16.l, v96.h, s43 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v17.h, v17.h, v36.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, 
v16.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v16.l, v17.h, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v32bf16: @@ -13011,753 +12822,697 @@ define <32 x bfloat> @v_minimumnum_v32bf16(<32 x bfloat> %x, <32 x bfloat> %y) { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v14 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v15 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v30 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v7 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v85, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: scratch_load_b32 v68, off, s32 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v15 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff0000, v14 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v30 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v29 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v35.l, v36.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v37.l, v36.l ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v13 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v13 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v12 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v14.h, v30.h, s1 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v11 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v9 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v8 -; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v24 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v6 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v5 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v69, 0xffff0000, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v71, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v86, 0xffff0000, v16 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v55, v55 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v85, v85 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.l, v30.h, v32.l, s2 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v29 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v28 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v27 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v38.l, v36.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v12 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v28 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff0000, v11 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v27 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v26 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v25 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v23 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v22 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff0000, v21 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v70, 0xffff0000, v20 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v80, 0xffff0000, v19 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v83, 0xffff0000, v1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v35, v35 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v37, v37 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v39, v39 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v49, v49 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v51, v51 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v53, v53 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v54, v54 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v65, v65 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v67, v67 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v69, v69 -; GFX12-TRUE16-NEXT: 
v_cmp_u_f32_e64 s23, v71, v71 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v86, v86 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.l, v0.h, v16.h, s29 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v86.l, v32.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v118.l, v55.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v82, 0xffff0000, v18 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v84, 0xffff0000, v17 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v36, v36 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v38, v38 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v48, v48 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v50, v50 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v52, v52 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v64, v64 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v66, v66 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v68, v68 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v70, v70 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v80, v80 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v83, v83 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v13.h, v29.h, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v12.h, v28.h, s5 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v11.h, v27.h, s7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v10.h, v26.h, s9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v9.h, v25.h, s11 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v8.h, v24.h, s13 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v7.h, v23.h, s15 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v6.h, v22.h, s17 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v5.h, v21.h, s19 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v4.h, v20.h, s21 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v3.h, v19.h, s23 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v85.l, v16.h, v54.l, s40 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v118 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v15 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v14 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, 
v82 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.l, v2.h, v18.h, s25 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.l, v1.h, v17.h, s27 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.l, v29.h, v33.l, s4 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.l, v28.h, v34.l, s6 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.l, v27.h, v35.l, s8 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.l, v26.h, v36.l, s10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v68.l, v25.h, v37.l, s12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.l, v24.h, v38.l, s14 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.l, v23.h, v39.l, s16 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.l, v22.h, v48.l, s18 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v80.l, v21.h, v49.l, s20 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v81.l, v20.h, v50.l, s22 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.l, v19.h, v51.l, s24 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v116.l, v54.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s40, v86, v118 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v86.l, v85.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v30 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v87, v87 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v96, v96 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v87.l, v33.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v96.l, v34.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v98.l, v36.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v101.l, v39.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v112.l, v50.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v113.l, v51.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v83.l, v18.h, v52.l, s26 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v84.l, v17.h, v53.l, s28 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v119.l, v64.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v128.l, v65.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v129.l, v66.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v67.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v131.l, v68.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v132.l, v69.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v133.l, v70.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 
v134.l, v71.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v135.l, v80.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v144.l, v81.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v145.l, v82.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v116, 16, v116 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v86 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v97, v97 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v30.l, s41 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v32.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v99.l, v37.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v114.l, v52.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v115.l, v53.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v87 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v96, 16, v96 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v98, 16, v98 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v101, 16, v101 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v112, 16, v112 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v113, 16, v113 -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v146.l, v83.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v147.l, v84.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v118, 16, v119 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v119, 16, v128 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v128, 16, v129 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v129, 16, v130 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v130, 16, v131 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v131, 16, v132 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v132, 16, v133 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v133, 16, v134 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v134, 16, v135 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v135, 16, v144 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v144, 16, v145 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s63, v116, v86 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v86.l, v55.l, v32.l, s40 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v117, 16, v13 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v30.l, v14.l, s42 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v55.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v99, 16, 
v99 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v114, 16, v114 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v115, 16, v115 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v145, 16, v146 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v146, 16, v147 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s42, v87, v118 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s43, v96, v119 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s45, v98, v129 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s56, v101, v132 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s59, v112, v135 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s60, v113, v144 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v86.l, v32.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v118.l, v86.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v34.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v97.l, v35.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v36.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v39.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v50.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v51.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s46, v99, v130 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s61, v114, v145 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s62, v115, v146 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v96.l, v65.l, v34.l, s43 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v98.l, v67.l, v36.l, s45 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v101.l, v70.l, v39.l, s56 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v112.l, v81.l, v50.l, s59 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v113.l, v82.l, v51.l, s60 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.h, v55.l, s16 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v55, 16, v118 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v33.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v37.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v102.l, v48.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v52.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0x8000, v53.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v97, 16, v97 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 
s23, 0x8000, v70.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v81.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v87.l, v64.l, v33.l, s42 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v99.l, v68.l, v37.l, s46 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v114.l, v83.l, v52.l, s61 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v115.l, v84.l, v53.l, s62 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v96.l, v34.l, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v98.l, v36.l, s5 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v101.l, v39.l, s8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v39.l, v101.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v112.l, v50.l, s11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v112.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v113.l, v51.l, s12 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v113.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v55 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v100.l, v38.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v103.l, v49.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v54.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v102, 16, v102 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s44, v97, v128 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v116.l, v85.l, v54.l, s63 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v87.l, v33.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v99.l, v37.l, s6 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v114.l, v52.l, s13 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v114.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v115.l, v53.l, s14 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v115.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v34.l, v70.l, s23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v70, 16, v39 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v35.h, v81.l, s26 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v35.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v100, 16, v100 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v103, 16, v103 -; 
GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s57, v102, v133 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v97.l, v66.l, v35.l, s44 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v116.l, v54.l, s15 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, v116.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v51 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v48.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v65.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v66.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s47, v100, v131 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s58, v103, v134 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v102.l, v71.l, v48.l, s57 -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v128.l, v96.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v97.l, v35.l, s4 -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v129.l, v97.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v54 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v52 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v53 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v38.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v49.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v64.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v71.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v100.l, v69.l, v38.l, s47 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v103.l, v80.l, v49.l, s58 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v119.l, v87.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v102.l, v48.l, s9 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v48.l, v102.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v30.h, v65.l, s18 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v65, 16, v128 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v32.l, v66.l, s19 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v66, 16, v129 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v80.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0x8000, v85.l -; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v38.l, s7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v103.l, v49.l, s10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v103.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v14.h, v64.l, s17 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v64, 16, v119 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v34.h, v71.l, s24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v71, 16, v48 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v65 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v66 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v86.l, v13.h, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v35.l, v80.l, s25 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v80, 16, v49 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v37.h, v85.l, s41 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v64 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v71 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v96.l, v30.h, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v97.l, v32.l, s4 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v67.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v68.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v130.l, v98.l -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v131.l, v99.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v87.l, v38.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v102.l, v38.h, s9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.h, v67.l, s20 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v67, 16, v130 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v33.l, v68.l, s21 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v68, 16, v131 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0x8000, v84.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v70 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v67 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0x8000, v82.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v68 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v37.l, v84.l, s29 -; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v34.h, v101.l, v34.l, s8 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v83.l -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s10, 0, v80 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v36.l, v82.l, s27 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.h, v115.l, v37.l, s14 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v69.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v36.h, v83.l, s28 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v103.l, v35.l, s10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v113.l, v36.l, s12 -; GFX12-TRUE16-NEXT: v_mov_b16_e64 v132.l, v100.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.h, v69.l, s22 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v81 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.h, v114.l, v48.l, s13 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v69, 16, v132 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v69 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v100.l, v33.h, s7 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v25 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v39, v39 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v39.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v10 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff0000, v9 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v8 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, v24 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v23 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v22 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v21 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v20 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff0000, 
v7 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff0000, v6 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff0000, v5 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v4 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v64, 0xffff0000, v3 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v19 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v18 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v17 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v81, 0xffff0000, v16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v65, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v67, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v15 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v83, 16, v30 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v84, 16, v2 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v86, 16, v0 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v85, 16, v17 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v87, 16, v16 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v83, v83 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v83.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s42, v86, v86 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v96.l, v36.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v96.h, v0.l, v16.l, s42 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s43, 0x8000, v96.h ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff0000, v31 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v15.h, v31.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v55, v55 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v50.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v15.l +; GFX12-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff0000, v68 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v31.h, v50.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v54, 16, v31 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v98.l, v32.h, s5 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v51.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v99.l, v33.l, s6 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v54, v54 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v31.l, v15.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v50.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v52, v53 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v51.l, v50.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v38, 16, v52 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v53, 16, v53 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v51.l -; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v52.l, v32.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v50.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v38, v53 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v112.l, v39.l, s11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v15.h, v51.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v31.l, v15.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v14.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v29 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v50 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v30.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v33.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s2 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v32.l, v15.h, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v117, v117 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v31.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v15.h, v68.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v116.l, v49.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v29.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v52, v52 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v53 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v51, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v31.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.l, v29.l, v13.l, s1 -; 
GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v52 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.h, v68.h, v36.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v36, v35 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0x8000, v35.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v30.l, v14.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v13.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v29.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v28 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v14.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v31.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v52 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v35.h, v36.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v28.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v31.l, v14.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v30.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v52, v51 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.h, v31.l, v36.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v31.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v32, v32 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v14.h, v30.h, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.h, v30.h, v36.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v36, v37 
+; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s44, 0x8000, v37.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v37.h, v36.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.h, v32.l, v36.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v32.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v33, v33 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.l, v28.l, v12.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v12.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.h, v30.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v29.l, v13.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v28.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v11 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v15.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v13.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v31.l, v12.h, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v52, 16, v27 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v30.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v13.h, v29.h, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.h, v29.h, v36.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v36, v38 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v30.l, v13.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v51, v50 -; 
GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v27.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v52, v52 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v53 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v29.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v38.h, v36.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.h, v33.l, v36.h, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v33.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v34, v34 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v28.l, v12.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v12.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.l, v27.l, v11.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.h, v29.l, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v12.h, v28.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.h, v28.h, v36.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v36, v39 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v31.l, v12.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v27.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v11.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v26 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v28.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v31.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v39.h, v36.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.h, v34.l, v36.h, s3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v34.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v48, v48 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v48.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v26.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v50, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v52 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v30.l, v11.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v51, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.h, v28.l, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v11.h, v27.h, s3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.h, v27.h, v36.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v36, v48 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v26.l, v10.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v50 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.l, v27.l, v11.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v10.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v25 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v11.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v26.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v51, 16, v51 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v48.h, v36.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v35.l, v36.h, s4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v35.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v49, v49 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v49.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v25.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v28, v28 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v26.l, v11.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v27.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v52 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v51, v50 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v10.h, v26.h, s4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.h, v26.h, v36.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s5, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s4, v36, v49 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v25.l, v9.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v37.l, v31.l, v10.h, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.h, v27.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v28 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.l, v12.l, v10.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v11.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v9.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v10.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v50, 16, v8 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.l, v26.l, v9.h, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v27 -; GFX12-TRUE16-NEXT: 
v_lshlrev_b32_e32 v27, 16, v28 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v49.h, v36.h, s4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v38.l, v36.h, s5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v38.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s5, v50, v50 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v50.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v25.l, v10.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v50, v50 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v24 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v27, v26 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v12.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v24.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v28 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v9.h, v25.h, s5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.h, v25.h, v36.h, s6 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s6, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s5, v36, v50 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v11.l, v9.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v9.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.h, v12.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v24.l, v8.l, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v7 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v26 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v50.h, 
v36.h, s5 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v48.l, v36.h, s6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v48.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s6, v51, v51 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v51.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s5, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v10.l, v9.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v8.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v12.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v23 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v8.h, v24.h, s6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.h, v24.h, v36.h, s7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s7, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s6, v36, v51 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v23.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v24, v24 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v11.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v27 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v31.l, v25.l, v8.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v26, v9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v50.l, v51.h, v36.h, s6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v51.l, v50.l, v36.h, s7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v50.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s7, v52, v52 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v52.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s6, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v23.l, v7.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v6 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.h, v11.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v24 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v12.l, v8.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v24.l, v9.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v25.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v22 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v8.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v26.l, v11.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v7.h, v23.h, s7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.h, v23.h, v36.h, s8 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s8, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s7, v36, v52 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v22.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v23, v23 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v11.l, v8.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v12.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v26 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v25, v24 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v52.l, v52.h, v36.h, s7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.l, v52.l, v36.h, s8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v52.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s8, v53, v53 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v53.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s7, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v22.l, v6.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v32.l, v10.l, v7.h, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.h, v12.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v23 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v9.l, v7.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v5 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v33.l, v11.l, v6.h, vcc_lo -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v22 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v6.h, v22.h, s8 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.h, v22.h, v36.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s9, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s8, v36, v53 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v10.l, v7.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v21 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v22.l, v10.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v12, v11 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v9.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v21.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v22 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v53.l, v53.h, v36.h, s8 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v69.h, v53.l, v36.h, s9 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v53.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s9, v54, v54 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v54.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s8, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v8.l, v6.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.h, v9.l, s1 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v21.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v5.h, v21.h, s9 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.h, v21.h, v36.h, s10 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s10, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s9, v36, v54 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v7.l, v6.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v21.l, v7.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v20 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v8.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v54.l, v54.h, v36.h, s9 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.l, v54.l, v36.h, s10 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v54.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s10, v55, v55 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v55.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; 
GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s9, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v20.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v21 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v34.l, v10.l, v5.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v4.h, v20.h, s10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.h, v20.h, v36.h, s11 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s11, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s10, v36, v55 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v20.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.h, v8.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v11 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v9.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v4.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v19 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v20.l, v8.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v55.l, v55.h, v36.h, s10 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v70.h, v55.l, v36.h, s11 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v55.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s11, v64, v64 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v64.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: 
v_cmp_eq_f32_e64 s10, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v19.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v10, v10 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v8.l, v5.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v9.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v20 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v12, v11 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v3.h, v19.h, s11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.h, v19.h, v36.h, s12 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s12, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s11, v36, v64 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v19.l, v3.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v9.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v18 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v6.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v64.l, v64.h, v36.h, s11 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.l, v64.l, v36.h, s12 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v64.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s12, v65, v65 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v65.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s11, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v18.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v35.l, v7.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v2.h, v18.h, s12 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.h, v18.h, v36.h, s13 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s13, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s12, v36, v65 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v18.l, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v11, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.l, v8.l, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v3.h, v6.l, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v65.l, v65.h, v36.h, s12 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v71.h, v65.l, v36.h, s13 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v65.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s13, v66, v66 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v66.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s12, 0, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v17 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v16 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v9.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l -; GFX12-TRUE16-NEXT: 
v_cmp_eq_u16_e32 vcc_lo, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v1.h, v17.h, s13 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.h, v17.h, v36.h, s14 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s14, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s13, v36, v66 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v17.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v10, v10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v16.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v11, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v66.l, v66.h, v36.h, s13 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v80.l, v66.l, v36.h, s14 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v66.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s14, v67, v67 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v67.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s13, 0, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v0.h, v16.h, s14 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.h, v16.h, v36.h, s15 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v68 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s14, v36, v67 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.h, v15.l, v68.l, s15 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s15, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v67.l, v67.h, v36.h, s14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v67.l, v36.h, s16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v67.l +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v82.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s14, 0, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v68.l, v82.h, s15 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s15, v82, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v68.l, v36.h, v82.h, s15 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s15, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v68.l, v82.h, s16 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s16, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v13 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v29 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v30.h, v14.h, v36.h, s15 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v68.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v14.l, v30.l, s16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v28 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s15, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v30.l, v14.h, s17 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v14.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v27 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s16, v14, v36 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v82, v82 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v36.h, v14.h, s16 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s16, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v26 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v14.l, v14.h, s17 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s17, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v12 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v25 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.h, v13.h, v36.h, s16 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v14.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v13.l, v29.l, s17 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v24 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s16, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v29.l, v13.h, s18 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v13.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v23 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s17, v13, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v17.l, v1.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v3.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v16.l, v0.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v12, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v17.l, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v5.l +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v14.l, v14.l, v29.h, s16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s24, v82, v82 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v36.h, v13.h, s17 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s17, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v22 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v13.l, v13.h, s18 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s18, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v11 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v21 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v28.h, v12.h, v36.h, s17 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v13.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v12.l, v28.l, s18 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v20 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s17, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v28.l, v12.h, s19 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v12.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v82, v82 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v82, 16, v19 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s18, v12, v36 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v4.l, v2.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v17 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, vcc_lo -; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v11 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.l, v13.l, v28.h, s17 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v82, v82 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v36.h, v12.h, s18 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s18, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v82.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v12.l, v12.h, s19 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s19, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v10 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v27.h, v11.h, v36.h, s18 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v12.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v11.l, v27.l, s19 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s18, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v27.l, v11.h, s20 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v11.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v2.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v17, v16 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v5 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v7.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.l, v12.l, v27.h, s18 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s19, v11, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v36.h, v11.h, s19 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s19, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v11.l, v11.h, s20 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s20, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v9 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v26.h, v10.h, v36.h, s19 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v11.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v10.l, v26.l, s20 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s19, 0, v36 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v26.l, v10.h, s21 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v10.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v11.l, v11.l, v26.h, s19 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s20, v10, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v36.h, v10.h, s20 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s20, 0x8000, v36.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v10.l, v10.h, s21 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s21, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v8 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v25.h, v9.h, v36.h, s20 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v10.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v9.l, v25.l, s21 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s20, 0, v36 +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v25.l, v9.h, s22 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v9.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v1.h, v4.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v16.l, v2.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v5.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v2.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v7.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.l, v10.l, v25.h, s20 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s21, v9, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v36.h, v9.h, s21 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s21, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v9.l, v9.h, s22 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s22, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v7 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v24.h, v8.h, v36.h, s21 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v9.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v8.l, v24.l, s22 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s21, 0, v36 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v24.l, v8.h, s23 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v8.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.l, v9.l, v24.h, s21 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s22, v8, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v36.h, v8.h, s22 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s22, 0x8000, v36.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v8.l, v8.h, s23 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s23, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v23.h, v7.h, v36.h, s22 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v7.l, v23.l, s23 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s22, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v23.l, v7.h, s24 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v7.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v12 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v7.l, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v8.l, v23.h, s22 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s23, v7, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v36.h, v7.h, s23 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s23, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v7.l, v7.h, s24 +; GFX12-TRUE16-NEXT: 
v_cmp_u_f32_e64 s24, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v5 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v22.h, v6.h, v36.h, s23 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v7.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v6.l, v22.l, s24 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s23, 0, v36 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v22.l, v6.h, s25 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v6.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v7.l, v22.h, s23 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s24, v6, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v36.h, v6.h, s24 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s24, 0x8000, v36.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v6.l, v6.h, s25 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s25, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v12 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v16 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v11 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s3, 0, v10 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v29.l, v5.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v21.h, v5.h, v36.h, s24 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v6.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v5.l, v21.l, s25 +; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v5.l, v36.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s24, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v21.l, v5.h, s26 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v5.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v49.l, v2.l, v1.l, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v48.l, v8.l, v1.h, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v39.l, v6.l, v0.h, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v38.l, v9.l, v2.h, s3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, v29 :: v_dual_mov_b32 v1, v49 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v2, v48 :: v_dual_mov_b32 v3, v39 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v38 :: v_dual_mov_b32 v5, v36 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v6, v35 :: v_dual_mov_b32 v7, v34 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v33 :: v_dual_mov_b32 v9, v32 -; GFX12-TRUE16-NEXT: v_dual_mov_b32 v10, v31 :: v_dual_mov_b32 v11, v30 -; GFX12-TRUE16-NEXT: v_mov_b32_e32 v12, v37 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v6.l, v21.h, s24 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s25, v5, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v36.h, v5.h, s25 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s25, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v5.l, v5.h, s26 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s26, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v20.h, v4.h, v36.h, s25 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v20.l, s26 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s25, 0, v36 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v20.l, v4.h, s27 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s27, 0x8000, v4.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v5.l, v20.h, s25 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s26, v4, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v36.h, v4.h, s26 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s26, 0x8000, v36.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v4.l, v4.h, s27 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v81, v81 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v81, 16, v18 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v18.h, v3.h, v36.h, s26 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v4.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v83.h, v3.l, v19.l, s27 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s29, v81, v81 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s26, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v19.l, v83.h, s28 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s28, v84, v84 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s40, v3, v3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s41, 0x8000, v83.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v19.l, v36.l +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s27, v83, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v82.h, v2.l, v18.l, s28 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 
0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v19.h, v1.l, v17.l, s40 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v32.h, v37.h, s44 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v36.h, v83.h, s27 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s40, 0x8000, v82.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s27, v85, v85 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.l, v18.h, s26 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v14.h, v32.l, v1.l, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v3.l, v83.h, s41 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s41, v87, v87 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v36.h, s28 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v3.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v31.h, v35.h, s42 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s42, 0x8000, v39.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s28, 0x8000, v19.h +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s45, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v18.l, v82.h, s29 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s29, 0x8000, v38.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.h, v31.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v34.h, v39.h, s42 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v48.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s44, v82, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v33.h, v38.h, s29 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v12.h, v34.l, v1.h, s2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v37.l, v48.h, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v36.h, v82.h, s44 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v13.h, v33.l, v0.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v49.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v50.h +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v11.h, v35.l, v1.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v82.h, s40 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v51.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v54.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v49.l, v50.h, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v0.l, s45 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v36.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v1.l +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v39.l, v49.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v52.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v51.l, v51.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v53.h +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v17.l, v19.h, s27 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v10.h, v38.l, v0.h, s4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v69.l, v52.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v55.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v9.h, v48.l, v1.h, s5 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v19, v36 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v69.h, v53.h, s2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v70.h, v55.h, s4 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v66.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.h, v36.h, v19.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v65.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v50.l, v2.h, s6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v70.l, v54.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v52.l, v0.h, s7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v16.h, v19.h, s28 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v53.l, v1.h, s8 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v71.h, v65.h, s1 +; 
GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v54.l, v2.h, s9 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v67.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.l, v17.l, v36.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.h +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v64.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v65.l, v1.h, s12 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v15.l, v67.h, s3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v55.l, v3.h, s10 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s4, 0, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v36.h, v16.l, v96.h, s41 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v71.l, v64.h, s0 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v80.l, v66.h, s2 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v2.l, vcc_lo +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v16.h, v17.l, s4 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v96, v36 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v64.l, v0.h, s11 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v66.l, v16.l, s13 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v67.l, v15.l, s14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v15.l, v68.l, v30.h, s15 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v16.l, v36.h, v96.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v36.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v16.l, v96.h, s43 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v17.h, v17.h, v36.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v36.h, v16.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v36 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v16.l, v17.h, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: 
v_minimumnum_v32bf16: @@ -14669,34 +14424,29 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 { ; GFX11-TRUE16-LABEL: v_minimumnum_bf16_no_ieee: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 
v1.h, v0.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_bf16_no_ieee: @@ -14732,40 +14482,34 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 { ; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, 
v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.h, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v0.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v1.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v1.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v1.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v1.h, s0 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, vcc_lo +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_bf16_no_ieee: @@ -15009,58 +14753,46 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> % ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 
| instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v3, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 
v2.l, v0.h, vcc_lo ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, 
v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v2bf16_no_ieee: @@ -15117,62 +14849,56 @@ define <2 x bfloat> @v_minimumnum_v2bf16_no_ieee(<2 x bfloat> %x, <2 x bfloat> % ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v4, v4 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v1.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v0.h, v1.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, 
v1.l, s0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v2.l, s1 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v0.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v6 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v4.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v3.l, v2.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.h, v4.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v1.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v7 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v3.l, s1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.h, v0.l, v1.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v1.l, v3.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v3, v4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.h, v3.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v0.l, v3.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v2.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v4.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v2.h, s1 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v2.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v5.l, v0.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, s0 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v2bf16_no_ieee: @@ -15521,66 +15247,67 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> % ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v8 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; 
GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v0.h, s1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v3bf16_no_ieee: @@ -15652,77 +15379,80 @@ define <3 x bfloat> @v_minimumnum_v3bf16_no_ieee(<3 x bfloat> %x, <3 x bfloat> % ; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v0 -; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v6, v6 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v5, v5 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v0.h, v2.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v9, v9 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v2.h, v4.l, s1 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v7, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, vcc_lo -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 
-; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v8 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v4.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v7, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v1.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s3, v9, v11 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v2.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v7, v7 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v4 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v5.l, v4.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v3.l, v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v2.l, v0.l, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v4.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v7.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s2 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v8.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v4.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l ; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v4 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v1.l, v3.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v5.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v5.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v5, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v5.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v3, v3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v0.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s0 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v9 -; GFX12-TRUE16-NEXT: 
v_cndmask_b16 v0.h, v6.l, v0.h, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v0, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s1 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v4.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.h, v4.h, s2 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v0.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v7.l, v1.l, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s1 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v3bf16_no_ieee: @@ -16183,85 +15913,83 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> % ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v10, v8 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v9, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v11, v12 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v7.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v13 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 -; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; 
GFX11-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2 -; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v7.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v6.h, v5.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v8, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX11-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v0, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3 +; GFX11-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v5.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; 
GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_minimumnum_v4bf16_no_ieee: @@ -16356,99 +16084,98 @@ define <4 x bfloat> @v_minimumnum_v4bf16_no_ieee(<4 x bfloat> %x, <4 x bfloat> % ; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v1 ; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v3 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 ; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v5, v5 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s1, v6, v6 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v0 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v8, v8 -; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v1.h, v3.h, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v0.h, v2.h, s1 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v3 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v2 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.h, v4.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s4, v10, v10 -; 
GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 -; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v11, v11 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v7 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.l, v2.h, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s2 -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s4 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v4.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v1.l, s3 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v2.l, v0.l, s0 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v10, v8 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v11.l, v1.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v12.l, v0.l -; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.l, v5.l, v4.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v15.l, v2.l -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v6.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v9, v10 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v12 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v8.l, v4.l, s1 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v12, 16, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v15 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v1.h, v3.h, vcc_lo +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v9, v9 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v2 ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v7.l, v6.l, s2 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v5.l -; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v14.l, v8.l -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s1, v11, v12 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v7.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.h, v3.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s3, v9, v9 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v5 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v6.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v13 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v14 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.l, v5.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.h, v5.l, s0 -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v3.l, v1.l, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v4.h, v4.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s0, v8, v8 +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v1 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v6 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.l, v2.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v1.l -; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.h, v7.l, s2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX12-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v0.h, v2.h, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, 
v1.l, s1 -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x8000, v3.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v7.h, v2.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v7 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s4, 0x8000, v7.h ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v2.l -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v7 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v5.l, v7.h, v6.h, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v3.l, s1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v5.l, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v5.l ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s1, 0, v10 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v8.l, v0.h, s0 -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.l, v2.h, s2 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v8.h, v1.l, v3.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v3.l, v8.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v8.h +; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s0, v8, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v6.h, v8.h, s0 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0x8000, v6.h +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v8.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_u_f32_e64 s2, v3, v3 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v3.l, v1.h, v7.h, s4 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.h, v0.h, v6.h, s0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v1.l +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v2.l, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v6.l +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s0, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v6.h, v2.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v0.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v2.h, s0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cmp_lt_f32_e64 s2, v0, v6 +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.h, v0.h, s2 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s2, 0x8000, v6.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v0.l, v0.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_u16_e64 s3, 0x8000, v5.h +; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff +; GFX12-TRUE16-NEXT: v_cndmask_b16 v2.l, v0.h, v6.h, s2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v4.h, v5.h, s3 +; GFX12-TRUE16-NEXT: v_cmp_eq_f32_e64 s2, 0, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.h, v4.l, v0.h, s1 ; GFX12-TRUE16-NEXT: 
s_wait_alu 0xfffd -; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v6.l, v0.l, vcc_lo +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v3.l, vcc_lo ; GFX12-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX12-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s1 +; GFX12-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, s2 ; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-FAKE16-LABEL: v_minimumnum_v4bf16_no_ieee: diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index 42bd2ff8797a1..9f539bd4cf0f8 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -813,7 +813,8 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc @@ -824,11 +825,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, 0, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, v2, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll index 184c80765430c..ddae1b296024e 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umax.ll @@ -1718,7 +1718,7 @@ define i16 @test_vector_reduce_umax_v3i16(<3 x i16> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_umax_v3i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX11-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h @@ -1751,7 +1751,7 @@ define i16 @test_vector_reduce_umax_v3i16(<3 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_pk_max_u16 v0, v0, v1 ; GFX12-SDAG-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v0.h From 2c2567da95c64ee1ed9104ee2539894242922b83 Mon Sep 17 00:00:00 2001 From: Eugene Epshteyn Date: Fri, 18 Jul 2025 12:58:09 -0400 Subject: [PATCH 366/813] [flang] Fixed a crash with undeclared variable in implicit-do loop (#149513) Fixed a crash in the following example: ``` subroutine sub() implicit none print *, (i, i = 1, 2) ! Problem: using undefined var in implied-do loop end subroutine sub ``` The error message was already generated, but the compiler crashed before it could display it. 
--- flang/lib/Semantics/check-do-forall.cpp | 4 +++- flang/test/Semantics/resolve40.f90 | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp index cc1d4bf58745a..e258df86a4b1c 100644 --- a/flang/lib/Semantics/check-do-forall.cpp +++ b/flang/lib/Semantics/check-do-forall.cpp @@ -1180,7 +1180,9 @@ void DoForallChecker::Leave(const parser::IoControlSpec &ioControlSpec) { void DoForallChecker::Leave(const parser::OutputImpliedDo &outputImpliedDo) { const auto &control{std::get(outputImpliedDo.t)}; const parser::Name &name{control.name.thing.thing}; - context_.CheckIndexVarRedefine(name.source, *name.symbol); + if (name.symbol) { + context_.CheckIndexVarRedefine(name.source, *name.symbol); + } } void DoForallChecker::Leave(const parser::StatVariable &statVariable) { diff --git a/flang/test/Semantics/resolve40.f90 b/flang/test/Semantics/resolve40.f90 index a91507aa62282..81bb5f989ec48 100644 --- a/flang/test/Semantics/resolve40.f90 +++ b/flang/test/Semantics/resolve40.f90 @@ -96,3 +96,10 @@ subroutine s12(x) !BECAUSE: 'x' is an INTENT(IN) dummy argument read(*,nml=nl) end + +subroutine s13() + implicit none + !ERROR: No explicit type declared for 'i' + !ERROR: No explicit type declared for 'i' + print *, (i, i = 1, 2) +end From 10518c76de091bf23e72a8761c1eff561ce6e074 Mon Sep 17 00:00:00 2001 From: Mohammadreza Ameri Mahabadian Date: Fri, 18 Jul 2025 17:59:39 +0100 Subject: [PATCH 367/813] =?UTF-8?q?[mlir][spirv]=20Add=20conversion=20pass?= =?UTF-8?q?=20to=20rewrite=20splat=20constant=20composite=E2=80=A6=20(#148?= =?UTF-8?q?910)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …s to replicated form This adds a new SPIR-V dialect-level conversion pass `ConversionToReplicatedConstantCompositePass`. 
This pass looks for splat composite `spirv.Constant` or `spirv.SpecConstantComposite` and rewrites them into `spirv.EXT.ConstantCompositeReplicate` or `spirv.EXT.SpecConstantCompositeReplicate`, respectively. --------- Signed-off-by: Mohammadreza Ameri Mahabadian --- .../mlir/Dialect/SPIRV/Transforms/Passes.td | 7 + .../Dialect/SPIRV/Transforms/CMakeLists.txt | 2 + ...nvertToReplicatedConstantCompositePass.cpp | 129 ++++++++ .../replicated-const-composites.mlir | 283 ++++++++++++++++++ 4 files changed, 421 insertions(+) create mode 100644 mlir/lib/Dialect/SPIRV/Transforms/ConvertToReplicatedConstantCompositePass.cpp create mode 100644 mlir/test/Dialect/SPIRV/Transforms/replicated-const-composites.mlir diff --git a/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td b/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td index 2d9befe78001d..2016bea43fc8a 100644 --- a/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SPIRV/Transforms/Passes.td @@ -77,4 +77,11 @@ def SPIRVWebGPUPreparePass : Pass<"spirv-webgpu-prepare", "spirv::ModuleOp"> { "and replacing with supported ones"; } +def SPIRVReplicatedConstantCompositePass + : Pass<"spirv-promote-to-replicated-constants", "spirv::ModuleOp"> { + let summary = "Convert splat composite constants and spec constants to " + "corresponding replicated constant composite ops defined by " + "SPV_EXT_replicated_composites"; +} + #endif // MLIR_DIALECT_SPIRV_TRANSFORMS_PASSES diff --git a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt index 68e0206e30a59..b947447dad46a 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/SPIRV/Transforms/CMakeLists.txt @@ -1,5 +1,6 @@ set(LLVM_OPTIONAL_SOURCES CanonicalizeGLPass.cpp + ConvertToReplicatedConstantCompositePass.cpp DecorateCompositeTypeLayoutPass.cpp LowerABIAttributesPass.cpp RewriteInsertsPass.cpp @@ -30,6 +31,7 @@ add_mlir_dialect_library(MLIRSPIRVConversion 
add_mlir_dialect_library(MLIRSPIRVTransforms CanonicalizeGLPass.cpp + ConvertToReplicatedConstantCompositePass.cpp DecorateCompositeTypeLayoutPass.cpp LowerABIAttributesPass.cpp RewriteInsertsPass.cpp diff --git a/mlir/lib/Dialect/SPIRV/Transforms/ConvertToReplicatedConstantCompositePass.cpp b/mlir/lib/Dialect/SPIRV/Transforms/ConvertToReplicatedConstantCompositePass.cpp new file mode 100644 index 0000000000000..dbbe23aa08b3c --- /dev/null +++ b/mlir/lib/Dialect/SPIRV/Transforms/ConvertToReplicatedConstantCompositePass.cpp @@ -0,0 +1,129 @@ +//===- ConvertToReplicatedConstantCompositePass.cpp -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to convert a splat composite spirv.Constant and +// spirv.SpecConstantComposite to spirv.EXT.ConstantCompositeReplicate and +// spirv.EXT.SpecConstantCompositeReplicate respectively. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" +#include "mlir/Dialect/SPIRV/Transforms/Passes.h" +#include "mlir/Transforms/WalkPatternRewriteDriver.h" + +namespace mlir::spirv { +#define GEN_PASS_DEF_SPIRVREPLICATEDCONSTANTCOMPOSITEPASS +#include "mlir/Dialect/SPIRV/Transforms/Passes.h.inc" + +namespace { + +static Type getArrayElemType(Attribute attr) { + if (auto typedAttr = dyn_cast(attr)) { + return typedAttr.getType(); + } + + if (auto arrayAttr = dyn_cast(attr)) { + return ArrayType::get(getArrayElemType(arrayAttr[0]), arrayAttr.size()); + } + + return nullptr; +} + +static std::pair +getSplatAttrAndNumElements(Attribute valueAttr, Type valueType) { + auto compositeType = dyn_cast_or_null(valueType); + if (!compositeType) + return {nullptr, 1}; + + if (auto splatAttr = dyn_cast(valueAttr)) { + return {splatAttr.getSplatValue(), splatAttr.size()}; + } + + if (auto arrayAttr = dyn_cast(valueAttr)) { + if (llvm::all_equal(arrayAttr)) { + Attribute attr = arrayAttr[0]; + uint32_t numElements = arrayAttr.size(); + + // Find the inner-most splat value for array of composites + auto [newAttr, newNumElements] = + getSplatAttrAndNumElements(attr, getArrayElemType(attr)); + if (newAttr) { + attr = newAttr; + numElements *= newNumElements; + } + return {attr, numElements}; + } + } + + return {nullptr, 1}; +} + +struct ConstantOpConversion final : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(spirv::ConstantOp op, + PatternRewriter &rewriter) const override { + auto [attr, numElements] = + getSplatAttrAndNumElements(op.getValue(), op.getType()); + if (!attr) + return rewriter.notifyMatchFailure(op, "composite is not splat"); + + if (numElements == 1) + return rewriter.notifyMatchFailure(op, + "composite has only one constituent"); + + rewriter.replaceOpWithNewOp( + op, op.getType(), attr); + return success(); + } +}; + +struct 
SpecConstantCompositeOpConversion final + : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(spirv::SpecConstantCompositeOp op, + PatternRewriter &rewriter) const override { + auto compositeType = dyn_cast_or_null(op.getType()); + if (!compositeType) + return rewriter.notifyMatchFailure(op, "not a composite constant"); + + ArrayAttr constituents = op.getConstituents(); + if (constituents.size() == 1) + return rewriter.notifyMatchFailure(op, + "composite has only one consituent"); + + if (!llvm::all_equal(constituents)) + return rewriter.notifyMatchFailure(op, "composite is not splat"); + + auto splatConstituent = dyn_cast(constituents[0]); + if (!splatConstituent) + return rewriter.notifyMatchFailure( + op, "expected flat symbol reference for splat constituent"); + + rewriter.replaceOpWithNewOp( + op, TypeAttr::get(op.getType()), op.getSymNameAttr(), splatConstituent); + + return success(); + } +}; + +struct ConvertToReplicatedConstantCompositePass final + : spirv::impl::SPIRVReplicatedConstantCompositePassBase< + ConvertToReplicatedConstantCompositePass> { + void runOnOperation() override { + MLIRContext *context = &getContext(); + RewritePatternSet patterns(context); + patterns.add( + context); + walkAndApplyPatterns(getOperation(), std::move(patterns)); + } +}; + +} // namespace +} // namespace mlir::spirv diff --git a/mlir/test/Dialect/SPIRV/Transforms/replicated-const-composites.mlir b/mlir/test/Dialect/SPIRV/Transforms/replicated-const-composites.mlir new file mode 100644 index 0000000000000..56e26eee83ff9 --- /dev/null +++ b/mlir/test/Dialect/SPIRV/Transforms/replicated-const-composites.mlir @@ -0,0 +1,283 @@ +// RUN: mlir-opt --spirv-promote-to-replicated-constants --split-input-file %s | FileCheck %s + +spirv.module Logical GLSL450 requires #spirv.vce { + spirv.func @splat_vector_of_i32() -> (vector<3xi32>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2 : i32] : vector<3xi32> + %0 = 
spirv.Constant dense<2> : vector<3xi32> + spirv.ReturnValue %0 : vector<3xi32> + } + + spirv.func @splat_array_of_i32() -> (!spirv.array<3 x i32>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [1 : i32] : !spirv.array<3 x i32> + %0 = spirv.Constant [1 : i32, 1 : i32, 1 : i32] : !spirv.array<3 x i32> + spirv.ReturnValue %0 : !spirv.array<3 x i32> + } + + spirv.func @splat_array_of_splat_array_of_i32() -> (!spirv.array<2 x !spirv.array<3 x i32>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [3 : i32] : !spirv.array<2 x !spirv.array<3 x i32>> + %0 = spirv.Constant [[3 : i32, 3 : i32, 3 : i32], [3 : i32, 3 : i32, 3 : i32]] : !spirv.array<2 x !spirv.array<3 x i32>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x i32>> + } + + spirv.func @splat_array_of_non_splat_array_of_i32() -> (!spirv.array<2 x !spirv.array<3 x i32>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate {{\[}}[1 : i32, 2 : i32, 3 : i32]] : !spirv.array<2 x !spirv.array<3 x i32>> + %0 = spirv.Constant [[1 : i32, 2 : i32, 3 : i32], [1 : i32, 2 : i32, 3 : i32]] : !spirv.array<2 x !spirv.array<3 x i32>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x i32>> + } + + spirv.func @splat_array_of_vectors_of_i32() -> (!spirv.array<2xvector<2xi32>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<[1, 2]> : vector<2xi32>] : !spirv.array<2 x vector<2xi32>> + %0 = spirv.Constant [dense<[1, 2]> : vector<2xi32>, dense<[1, 2]> : vector<2xi32>] : !spirv.array<2 x vector<2xi32>> + spirv.ReturnValue %0 : !spirv.array<2 x vector<2xi32>> + } + + spirv.func @splat_array_of_splat_vectors_of_i32() -> (!spirv.array<2 x vector<2xi32>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2 : i32] : !spirv.array<2 x vector<2xi32>> + %0 = spirv.Constant [dense<2> : vector<2xi32>, dense<2> : vector<2xi32>] : !spirv.array<2 x vector<2xi32>> + spirv.ReturnValue %0 : !spirv.array<2 x vector<2xi32>> + } + + 
spirv.func @splat_tensor_of_i32() -> (!spirv.array<2 x !spirv.array<3 x i32>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [3 : i32] : !spirv.array<2 x !spirv.array<3 x i32>> + %0 = spirv.Constant dense<3> : tensor<2x3xi32> : !spirv.array<2 x !spirv.array<3 x i32>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x i32>> + } + + spirv.func @splat_arm_tensor_of_i32() -> (!spirv.arm.tensor<2x3xi32>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2 : i32] : !spirv.arm.tensor<2x3xi32> + %0 = spirv.Constant dense<2> : !spirv.arm.tensor<2x3xi32> + spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xi32> + } + + spirv.func @array_of_splat_array_of_non_splat_vectors_of_i32() -> (!spirv.array<1 x !spirv.array<2 x vector<2xi32>>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<[1, 2]> : vector<2xi32>] : !spirv.array<1 x !spirv.array<2 x vector<2xi32>> + %0 = spirv.Constant [[dense<[1, 2]> : vector<2xi32>, dense<[1, 2]> : vector<2xi32>]] : !spirv.array<1 x !spirv.array<2 x vector<2xi32>>> + spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<2 x vector<2xi32>>> + } + + spirv.func @array_of_one_splat_array_of_vector_of_one_i32() -> !spirv.array<1 x !spirv.array<2 x vector<1xi32>>> "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<1> : vector<1xi32>] : !spirv.array<1 x !spirv.array<2 x vector<1xi32> + %cst = spirv.Constant [[dense<1> : vector<1xi32>], [dense<1> : vector<1xi32>]] : !spirv.array<1 x !spirv.array<2 x vector<1xi32>>> + spirv.ReturnValue %cst : !spirv.array<1 x !spirv.array<2 x vector<1xi32>>> + } + + spirv.func @splat_array_of_array_of_one_vector_of_one_i32() -> (!spirv.array<2 x !spirv.array<1 x vector<1xi32>>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<1> : vector<1xi32>] : !spirv.array<2 x !spirv.array<1 x vector<1xi32>>> + %0 = spirv.Constant [[dense<1> : vector<1xi32>], [dense<1> : vector<1xi32>]] : !spirv.array<2 x 
!spirv.array<1 x vector<1xi32>>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<1 x vector<1xi32>>> + } + + spirv.func @array_of_one_array_of_one_splat_vector_of_i32() -> (!spirv.array<1 x !spirv.array<1 x vector<2xi32>>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [1 : i32] : !spirv.array<1 x !spirv.array<1 x vector<2xi32>>> + %0 = spirv.Constant [[dense<1> : vector<2xi32>]] : !spirv.array<1 x !spirv.array<1 x vector<2xi32>>> + spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<1 x vector<2xi32>>> + } + + spirv.func @splat_array_of_splat_array_of_non_splat_array_of_i32() -> (!spirv.array<2 x !spirv.array<2 x !spirv.array<3 x i32>>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate {{\[}}[1 : i32, 2 : i32, 3 : i32]] : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x i32>>> + %0 = spirv.Constant [[[1 : i32, 2 : i32, 3 : i32], [1 : i32, 2 : i32, 3 : i32]], [[1 : i32, 2 : i32, 3 : i32], [1 : i32, 2 : i32, 3 : i32]]] : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x i32>>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x i32>>> + } + + spirv.func @splat_vector_of_f32() -> (vector<3xf32>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2.000000e+00 : f32] : vector<3xf32> + %0 = spirv.Constant dense<2.0> : vector<3xf32> + spirv.ReturnValue %0 : vector<3xf32> + } + + spirv.func @splat_array_of_f32() -> (!spirv.array<3 x f32>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [1.000000e+00 : f32] : !spirv.array<3 x f32> + %0 = spirv.Constant [1.0 : f32, 1.0 : f32, 1.0 : f32] : !spirv.array<3 x f32> + spirv.ReturnValue %0 : !spirv.array<3 x f32> + } + + spirv.func @splat_array_of_splat_array_of_f32() -> (!spirv.array<2 x !spirv.array<3 x f32>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [3.000000e+00 : f32] : !spirv.array<2 x !spirv.array<3 x f32>> + %0 = spirv.Constant [[3.0 : f32, 3.0 : f32, 3.0 : f32], [3.0 : f32, 3.0 : 
f32, 3.0 : f32]] : !spirv.array<2 x !spirv.array<3 x f32>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x f32>> + } + + spirv.func @splat_array_of_non_splat_array_of_f32() -> (!spirv.array<2 x !spirv.array<3 x f32>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate {{\[}}[1.000000e+00 : f32, 2.000000e+00 : f32, 3.000000e+00 : f32]] : !spirv.array<2 x !spirv.array<3 x f32>> + %0 = spirv.Constant [[1.0 : f32, 2.0 : f32, 3.0 : f32], [1.0 : f32, 2.0 : f32, 3.0 : f32]] : !spirv.array<2 x !spirv.array<3 x f32>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x f32>> + } + + spirv.func @splat_array_of_vectors_of_f32() -> (!spirv.array<2xvector<2xf32>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<[1.000000e+00, 2.000000e+00]> : vector<2xf32>] : !spirv.array<2 x vector<2xf32>> + %0 = spirv.Constant [dense<[1.0, 2.0]> : vector<2xf32>, dense<[1.0, 2.0]> : vector<2xf32>] : !spirv.array<2 x vector<2xf32>> + spirv.ReturnValue %0 : !spirv.array<2 x vector<2xf32>> + } + + spirv.func @splat_array_of_splat_vectors_of_f32() -> (!spirv.array<2 x vector<2xf32>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2.000000e+00 : f32] : !spirv.array<2 x vector<2xf32>> + %0 = spirv.Constant [dense<2.0> : vector<2xf32>, dense<2.0> : vector<2xf32>] : !spirv.array<2 x vector<2xf32>> + spirv.ReturnValue %0 : !spirv.array<2 x vector<2xf32>> + } + + spirv.func @splat_tensor_of_f32() -> (!spirv.array<2 x !spirv.array<3 x f32>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [3.000000e+00 : f32] : !spirv.array<2 x !spirv.array<3 x f32>> + %0 = spirv.Constant dense<3.0> : tensor<2x3xf32> : !spirv.array<2 x !spirv.array<3 x f32>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<3 x f32>> + } + + spirv.func @splat_arm_tensor_of_f32() -> (!spirv.arm.tensor<2x3xf32>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [2.000000e+00 : f32] : !spirv.arm.tensor<2x3xf32> + %0 
= spirv.Constant dense<2.0> : !spirv.arm.tensor<2x3xf32> + spirv.ReturnValue %0 : !spirv.arm.tensor<2x3xf32> + } + + spirv.func @array_of_splat_array_of_non_splat_vectors_of_f32() -> (!spirv.array<1 x !spirv.array<2 x vector<2xf32>>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<[1.000000e+00, 2.000000e+00]> : vector<2xf32>] : !spirv.array<1 x !spirv.array<2 x vector<2xf32>> + %0 = spirv.Constant [[dense<[1.0, 2.0]> : vector<2xf32>, dense<[1.0, 2.0]> : vector<2xf32>]] : !spirv.array<1 x !spirv.array<2 x vector<2xf32>>> + spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<2 x vector<2xf32>>> + } + + spirv.func @array_of_one_splat_array_of_vector_of_one_f32() -> !spirv.array<1 x !spirv.array<2 x vector<1xf32>>> "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<1.000000e+00> : vector<1xf32>] : !spirv.array<1 x !spirv.array<2 x vector<1xf32> + %cst = spirv.Constant [[dense<1.0> : vector<1xf32>], [dense<1.0> : vector<1xf32>]] : !spirv.array<1 x !spirv.array<2 x vector<1xf32>>> + spirv.ReturnValue %cst : !spirv.array<1 x !spirv.array<2 x vector<1xf32>>> + } + + spirv.func @splat_array_of_array_of_one_vector_of_one_f32() -> (!spirv.array<2 x !spirv.array<1 x vector<1xf32>>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [dense<1.000000e+00> : vector<1xf32>] : !spirv.array<2 x !spirv.array<1 x vector<1xf32>>> + %0 = spirv.Constant [[dense<1.0> : vector<1xf32>], [dense<1.0> : vector<1xf32>]] : !spirv.array<2 x !spirv.array<1 x vector<1xf32>>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<1 x vector<1xf32>>> + } + + spirv.func @array_of_one_array_of_one_splat_vector_of_f32() -> (!spirv.array<1 x !spirv.array<1 x vector<2xf32>>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate [1.000000e+00 : f32] : !spirv.array<1 x !spirv.array<1 x vector<2xf32>>> + %0 = spirv.Constant [[dense<1.0> : vector<2xf32>]] : !spirv.array<1 x !spirv.array<1 x vector<2xf32>>> + 
spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<1 x vector<2xf32>>> + } + + spirv.func @splat_array_of_splat_array_of_non_splat_array_of_f32() -> (!spirv.array<2 x !spirv.array<2 x !spirv.array<3 x f32>>>) "None" { + // CHECK: {{%.*}} = spirv.EXT.ConstantCompositeReplicate {{\[}}[1.000000e+00 : f32, 2.000000e+00 : f32, 3.000000e+00 : f32]] : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x f32>>> + %0 = spirv.Constant [[[1.0 : f32, 2.0 : f32, 3.0 : f32], [1.0 : f32, 2.0 : f32, 3.0 : f32]], [[1.0 : f32, 2.0 : f32, 3.0 : f32], [1.0 : f32, 2.0 : f32, 3.0 : f32]]] : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x f32>>> + spirv.ReturnValue %0 : !spirv.array<2 x !spirv.array<2 x !spirv.array<3 x f32>>> + } + + spirv.func @array_of_one_i32() -> (!spirv.array<1 x i32>) "None" { + // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate + %0 = spirv.Constant [1 : i32] : !spirv.array<1 x i32> + spirv.ReturnValue %0 : !spirv.array<1 x i32> + } + + spirv.func @arm_tensor_of_one_i32() -> (!spirv.arm.tensor<1xi32>) "None" { + // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate + %0 = spirv.Constant dense<1> : !spirv.arm.tensor<1xi32> + spirv.ReturnValue %0 : !spirv.arm.tensor<1xi32> + } + + spirv.func @non_splat_vector_of_i32() -> (vector<3xi32>) "None" { + // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate + %0 = spirv.Constant dense<[0, 1, 2]> : vector<3xi32> + spirv.ReturnValue %0 : vector<3xi32> + } + + spirv.func @non_splat_array_of_vectors_of_i32() -> (!spirv.array<2xvector<2xi32>>) "None" { + // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate + %0 = spirv.Constant [dense<[1, 2]> : vector<2xi32>, dense<[1, 3]> : vector<2xi32>] : !spirv.array<2 x vector<2xi32>> + spirv.ReturnValue %0 : !spirv.array<2 x vector<2xi32>> + } + + spirv.func @array_of_one_f32() -> (!spirv.array<1 x f32>) "None" { + // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate + %0 = spirv.Constant [1.0 : f32] : !spirv.array<1 x f32> + spirv.ReturnValue %0 : !spirv.array<1 x f32> + } + + spirv.func 
@arm_tensor_of_one_f32() -> (!spirv.arm.tensor<1xf32>) "None" { + // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate + %0 = spirv.Constant dense<1.0> : !spirv.arm.tensor<1xf32> + spirv.ReturnValue %0 : !spirv.arm.tensor<1xf32> + } + + spirv.func @non_splat_vector_of_f32() -> (vector<3xf32>) "None" { + // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate + %0 = spirv.Constant dense<[0.0, 1.0, 2.0]> : vector<3xf32> + spirv.ReturnValue %0 : vector<3xf32> + } + + spirv.func @non_splat_array_of_vectors_of_f32() -> (!spirv.array<2xvector<2xf32>>) "None" { + // CHECK-NOT: spirv.EXT.ConstantCompositeReplicate + %0 = spirv.Constant [dense<[1.0, 2.0]> : vector<2xf32>, dense<[1.0, 3.0]> : vector<2xf32>] : !spirv.array<2 x vector<2xf32>> + spirv.ReturnValue %0 : !spirv.array<2 x vector<2xf32>> + } + + spirv.func @array_of_one_array_of_one_non_splat_vector_of_i32() -> (!spirv.array<1 x !spirv.array<1 x vector<2xi32>>>) "None" { + // CHECK-NOT spirv.EXT.ConstantCompositeReplicate + %0 = spirv.Constant [[dense<[1, 2]> : vector<2xi32>]] : !spirv.array<1 x !spirv.array<1 x vector<2xi32>>> + spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<1 x vector<2xi32>>> + } + + spirv.func @array_of_one_array_of_one_vector_of_one_i32() -> (!spirv.array<1 x !spirv.array<1 x vector<1xi32>>>) "None" { + // CHECK-NOT spirv.EXT.ConstantCompositeReplicate + %0 = spirv.Constant [[dense<1> : vector<1xi32>]] : !spirv.array<1 x !spirv.array<1 x vector<1xi32>>> + spirv.ReturnValue %0 : !spirv.array<1 x !spirv.array<1 x vector<1xi32>>> + } +} + +// ----- + +spirv.module Logical GLSL450 requires #spirv.vce { + + spirv.SpecConstant @sc_i32_1 = 1 : i32 + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_array_of_i32 (@sc_i32_1) : !spirv.array<3 x i32> + spirv.SpecConstantComposite @scc_splat_array_of_i32 (@sc_i32_1, @sc_i32_1, @sc_i32_1) : !spirv.array<3 x i32> + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_struct_of_i32 (@sc_i32_1) : !spirv.struct<(i32, i32, i32)> + 
spirv.SpecConstantComposite @scc_splat_struct_of_i32 (@sc_i32_1, @sc_i32_1, @sc_i32_1) : !spirv.struct<(i32, i32, i32)> + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_vector_of_i32 (@sc_i32_1) : vector<3xi32> + spirv.SpecConstantComposite @scc_splat_vector_of_i32 (@sc_i32_1, @sc_i32_1, @sc_i32_1) : vector<3 x i32> + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_arm_tensor_of_i32 (@sc_i32_1) : !spirv.arm.tensor<3xi32> + spirv.SpecConstantComposite @scc_splat_arm_tensor_of_i32 (@sc_i32_1, @sc_i32_1, @sc_i32_1) : !spirv.arm.tensor<3xi32> + + spirv.SpecConstant @sc_f32_1 = 1.0 : f32 + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_array_of_f32 (@sc_f32_1) : !spirv.array<3 x f32> + spirv.SpecConstantComposite @scc_splat_array_of_f32 (@sc_f32_1, @sc_f32_1, @sc_f32_1) : !spirv.array<3 x f32> + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_struct_of_f32 (@sc_f32_1) : !spirv.struct<(f32, f32, f32)> + spirv.SpecConstantComposite @scc_splat_struct_of_f32 (@sc_f32_1, @sc_f32_1, @sc_f32_1) : !spirv.struct<(f32, f32, f32)> + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_vector_of_f32 (@sc_f32_1) : vector<3xf32> + spirv.SpecConstantComposite @scc_splat_vector_of_f32 (@sc_f32_1, @sc_f32_1, @sc_f32_1) : vector<3 x f32> + + // CHECK: spirv.EXT.SpecConstantCompositeReplicate @scc_splat_arm_tensor_of_f32 (@sc_f32_1) : !spirv.arm.tensor<3xf32> + spirv.SpecConstantComposite @scc_splat_arm_tensor_of_f32 (@sc_f32_1, @sc_f32_1, @sc_f32_1) : !spirv.arm.tensor<3xf32> + + spirv.SpecConstant @sc_i32_2 = 2 : i32 + + // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate + spirv.SpecConstantComposite @scc_array_of_one_i32 (@sc_i32_1) : !spirv.array<1 x i32> + + // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate + spirv.SpecConstantComposite @scc_arm_tensor_of_one_i32 (@sc_i32_1) : !spirv.arm.tensor<1xi32> + + // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate + spirv.SpecConstantComposite 
@scc_non_splat_vector_of_i32 (@sc_i32_1, @sc_i32_1, @sc_i32_2) : vector<3 x i32> + + // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate + spirv.SpecConstantComposite @scc_non_splat_arm_tensor_of_i32 (@sc_i32_2, @sc_i32_1, @sc_i32_1) : !spirv.arm.tensor<3xi32> + + spirv.SpecConstant @sc_f32_2 = 2.0 : f32 + + // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate + spirv.SpecConstantComposite @scc_array_of_one_f32 (@sc_f32_1) : !spirv.array<1 x f32> + + // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate + spirv.SpecConstantComposite @scc_arm_tensor_of_one_f32 (@sc_f32_1) : !spirv.arm.tensor<1xf32> + + // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate + spirv.SpecConstantComposite @scc_non_splat_vector_of_f32 (@sc_f32_1, @sc_f32_1, @sc_f32_2) : vector<3 x f32> + + // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate + spirv.SpecConstantComposite @scc_non_splat_arm_tensor_of_f32 (@sc_f32_2, @sc_f32_1, @sc_f32_1) : !spirv.arm.tensor<3xf32> + + // CHECK-NOT: spirv.EXT.SpecConstantCompositeReplicate + spirv.SpecConstantComposite @scc_struct_of_i32_and_f32 (@sc_i32_1, @sc_i32_1, @sc_f32_1) : !spirv.struct<(i32, i32, f32)> +} From 7e0ae019f854c99ae0d6a220aba7fcd5407f2494 Mon Sep 17 00:00:00 2001 From: Muhammad Bassiouni <60100307+bassiounix@users.noreply.github.com> Date: Fri, 18 Jul 2025 20:00:04 +0300 Subject: [PATCH 368/813] [libc][math] Refactor exp10f16 implementation to header-only in src/__support/math folder. 
(#148408) Part of #147386 in preparation for: https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450 --- libc/shared/math.h | 1 + libc/shared/math/exp10f16.h | 29 ++++ libc/src/__support/math/CMakeLists.txt | 34 +++++ .../__support/math/exp10_float16_constants.h | 43 ++++++ libc/src/__support/math/exp10f16.h | 141 ++++++++++++++++++ libc/src/__support/math/exp10f16_utils.h | 64 ++++++++ libc/src/math/generic/CMakeLists.txt | 21 +-- libc/src/math/generic/exp10f16.cpp | 122 +-------------- libc/src/math/generic/exp10m1f16.cpp | 2 +- libc/src/math/generic/expxf16.h | 56 +------ .../llvm-project-overlay/libc/BUILD.bazel | 38 ++++- 11 files changed, 357 insertions(+), 194 deletions(-) create mode 100644 libc/shared/math/exp10f16.h create mode 100644 libc/src/__support/math/exp10_float16_constants.h create mode 100644 libc/src/__support/math/exp10f16.h create mode 100644 libc/src/__support/math/exp10f16_utils.h diff --git a/libc/shared/math.h b/libc/shared/math.h index 2ae7c1d58ae10..26f69d6fa43ea 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -14,6 +14,7 @@ #include "math/exp.h" #include "math/exp10.h" #include "math/exp10f.h" +#include "math/exp10f16.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp10f16.h b/libc/shared/math/exp10f16.h new file mode 100644 index 0000000000000..8acdbdb7c70a1 --- /dev/null +++ b/libc/shared/math/exp10f16.h @@ -0,0 +1,29 @@ +//===-- Shared exp10f16 function --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP10F_H +#define LLVM_LIBC_SHARED_MATH_EXP10F_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "shared/libc_common.h" +#include "src/__support/math/exp10f16.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp10f16; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SHARED_MATH_EXP10F_H diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index ad36679409f89..77a47c65489dd 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -198,3 +198,37 @@ add_header_library( libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization ) + +add_header_library( + exp10_float16_constants + HDRS + exp10_float16_constants.h + DEPENDS + libc.src.__support.CPP.array +) + +add_header_library( + exp10f16_utils + HDRS + exp10f16_utils.h + DEPENDS + .expf16_utils + .exp10_float16_constants + libc.src.__support.FPUtil.fp_bits +) + +add_header_library( + exp10f16 + HDRS + exp10f16.h + DEPENDS + .exp10f16_utils + libc.src.__support.FPUtil.fp_bits + src.__support.FPUtil.FEnvImpl + src.__support.FPUtil.FPBits + src.__support.FPUtil.cast + src.__support.FPUtil.rounding_mode + src.__support.FPUtil.except_value_utils + src.__support.macros.optimization + src.__support.macros.properties.cpu_features +) diff --git a/libc/src/__support/math/exp10_float16_constants.h b/libc/src/__support/math/exp10_float16_constants.h new file mode 100644 index 0000000000000..f5928db740ee4 --- /dev/null +++ b/libc/src/__support/math/exp10_float16_constants.h @@ -0,0 +1,43 @@ +//===-- Constants for exp10f16 function -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, 
under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_FLOAT16_CONSTANTS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10_FLOAT16_CONSTANTS_H + +#include "include/llvm-libc-macros/float16-macros.h" +#include + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/CPP/array.h" + +namespace LIBC_NAMESPACE_DECL { + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > for i from 0 to 7 do printsingle(round(2^(i * 2^-3), SG, RN)); +static constexpr cpp::array EXP2_MID_BITS = { + 0x3f80'0000U, 0x3f8b'95c2U, 0x3f98'37f0U, 0x3fa5'fed7U, + 0x3fb5'04f3U, 0x3fc5'672aU, 0x3fd7'44fdU, 0x3fea'c0c7U, +}; + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > round(log2(10), SG, RN); +static constexpr float LOG2F_10 = 0x1.a934fp+1f; + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > round(log10(2), SG, RN); +static constexpr float LOG10F_2 = 0x1.344136p-2f; + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H diff --git a/libc/src/__support/math/exp10f16.h b/libc/src/__support/math/exp10f16.h new file mode 100644 index 0000000000000..0d8b125348844 --- /dev/null +++ b/libc/src/__support/math/exp10f16.h @@ -0,0 +1,141 @@ +//===-- Implementation header for exp10f16 ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "exp10f16_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/cpu_features.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT +static constexpr size_t N_EXP10F16_EXCEPTS = 5; +#else +static constexpr size_t N_EXP10F16_EXCEPTS = 8; +#endif + +static constexpr fputil::ExceptValues + EXP10F16_EXCEPTS = {{ + // x = 0x1.8f4p-2, exp10f16(x) = 0x1.3ap+1 (RZ) + {0x363dU, 0x40e8U, 1U, 0U, 1U}, + // x = 0x1.95cp-2, exp10f16(x) = 0x1.3ecp+1 (RZ) + {0x3657U, 0x40fbU, 1U, 0U, 0U}, + // x = -0x1.018p-4, exp10f16(x) = 0x1.bbp-1 (RZ) + {0xac06U, 0x3aecU, 1U, 0U, 0U}, + // x = -0x1.c28p+0, exp10f16(x) = 0x1.1ccp-6 (RZ) + {0xbf0aU, 0x2473U, 1U, 0U, 0U}, + // x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ) + {0xc387U, 0x09a5U, 1U, 0U, 0U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ) + {0x4030U, 0x57c1U, 1U, 0U, 1U}, + // x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ) + {0x406eU, 0x591fU, 1U, 0U, 1U}, + // x = 0x1.1b8p+2, exp10f16(x) = 0x1.a4p+14 (RZ) + {0x446eU, 0x7690U, 1U, 0U, 1U}, +#endif + }}; +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + +static constexpr float16 exp10f16(float16 x) { + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = 
x_u & 0x7fffU; + + // When |x| >= 5, or x is NaN. + if (LIBC_UNLIKELY(x_abs >= 0x4500U)) { + // exp10(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When x >= 5. + if (x_bits.is_pos()) { + // exp10(+inf) = +inf + if (x_bits.is_inf()) + return FPBits::inf().get_val(); + + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_UPWARD: + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW); + return FPBits::inf().get_val(); + default: + return FPBits::max_normal().get_val(); + } + } + + // When x <= -8. + if (x_u >= 0xc800U) { + // exp10(-inf) = +0 + if (x_bits.is_inf()) + return FPBits::zero().get_val(); + + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); + + if (fputil::fenv_is_round_up()) + return FPBits::min_subnormal().get_val(); + return FPBits::zero().get_val(); + } + } + + // When x is 1, 2, 3, or 4. These are hard-to-round cases with exact results. 
+ if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) { + switch (x_u) { + case 0x3c00U: // x = 1.0f16 + return fputil::cast(10.0); + case 0x4000U: // x = 2.0f16 + return fputil::cast(100.0); + case 0x4200U: // x = 3.0f16 + return fputil::cast(1'000.0); + case 0x4400U: // x = 4.0f16 + return fputil::cast(10'000.0); + } + } + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + // 10^x = 2^((hi + mid) * log2(10)) * 10^lo + auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x); + return fputil::cast(exp2_hi_mid * exp10_lo); +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_H diff --git a/libc/src/__support/math/exp10f16_utils.h b/libc/src/__support/math/exp10f16_utils.h new file mode 100644 index 0000000000000..bffb81ba606bb --- /dev/null +++ b/libc/src/__support/math/exp10f16_utils.h @@ -0,0 +1,64 @@ +//===-- Common utils for exp10f16 -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_UTILS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_UTILS_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "exp10_float16_constants.h" +#include "expf16_utils.h" +#include "src/__support/FPUtil/FPBits.h" + +namespace LIBC_NAMESPACE_DECL { + +LIBC_INLINE static constexpr ExpRangeReduction +exp10_range_reduction(float16 x) { + // For -8 < x < 5, to compute 10^x, we perform the following range reduction: + // find hi, mid, lo, such that: + // x = (hi + mid) * log2(10) + lo, in which + // hi is an integer, + // mid * 2^3 is an integer, + // -2^(-4) <= lo < 2^(-4). + // In particular, + // hi + mid = round(x * 2^3) * 2^(-3). + // Then, + // 10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo + // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid + // by adding hi to the exponent field of 2^mid. 10^lo is computed using a + // degree-4 minimax polynomial generated by Sollya. 
+ + float xf = x; + float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f)); + int x_hi_mid = static_cast(kf); + unsigned x_hi = static_cast(x_hi_mid) >> 3; + unsigned x_mid = static_cast(x_hi_mid) & 0x7; + // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x + float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf); + + uint32_t exp2_hi_mid_bits = + EXP2_MID_BITS[x_mid] + + static_cast(x_hi << fputil::FPBits::FRACTION_LEN); + float exp2_hi_mid = fputil::FPBits(exp2_hi_mid_bits).get_val(); + // Degree-4 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]); + // > 1 + x * P; + float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f, + 0x1.04b434p+1f, 0x1.2bcf9ep+0f); + return {exp2_hi_mid, exp10_lo}; +} + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F16_UTILS_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 99db743315d43..fb253a4502700 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1477,20 +1477,8 @@ add_entrypoint_object( HDRS ../exp10f16.h DEPENDS - .expxf16 - libc.hdr.errno_macros - libc.hdr.fenv_macros - libc.src.__support.CPP.array - libc.src.__support.FPUtil.cast - libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.macros.optimization - libc.src.__support.macros.properties.cpu_features + libc.src.__support.math.exp10f16 + libc.src.errno.errno ) add_entrypoint_object( @@ -1519,7 +1507,6 @@ add_entrypoint_object( HDRS ../exp10m1f16.h DEPENDS - .expxf16 libc.hdr.errno_macros 
libc.hdr.fenv_macros libc.src.__support.FPUtil.cast @@ -1531,6 +1518,7 @@ add_entrypoint_object( libc.src.__support.FPUtil.rounding_mode libc.src.__support.macros.optimization libc.src.__support.macros.properties.cpu_features + libc.src.__support.math.exp10f16_utils ) add_entrypoint_object( @@ -5023,10 +5011,11 @@ add_header_library( HDRS expxf16.h DEPENDS - libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer libc.src.__support.macros.attributes libc.src.__support.math.expf16_utils + libc.src.__support.math.exp10_float16_constants ) diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp index 31abf3b4f89b2..cb3c8599c9231 100644 --- a/libc/src/math/generic/exp10f16.cpp +++ b/libc/src/math/generic/exp10f16.cpp @@ -7,128 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/exp10f16.h" -#include "expxf16.h" -#include "hdr/errno_macros.h" -#include "hdr/fenv_macros.h" -#include "src/__support/CPP/array.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/cast.h" -#include "src/__support/FPUtil/except_value_utils.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/nearest_integer.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" -#include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/exp10f16.h" namespace LIBC_NAMESPACE_DECL { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT -static constexpr size_t N_EXP10F16_EXCEPTS = 5; -#else -static constexpr size_t N_EXP10F16_EXCEPTS = 8; -#endif - -static constexpr fputil::ExceptValues - 
EXP10F16_EXCEPTS = {{ - // x = 0x1.8f4p-2, exp10f16(x) = 0x1.3ap+1 (RZ) - {0x363dU, 0x40e8U, 1U, 0U, 1U}, - // x = 0x1.95cp-2, exp10f16(x) = 0x1.3ecp+1 (RZ) - {0x3657U, 0x40fbU, 1U, 0U, 0U}, - // x = -0x1.018p-4, exp10f16(x) = 0x1.bbp-1 (RZ) - {0xac06U, 0x3aecU, 1U, 0U, 0U}, - // x = -0x1.c28p+0, exp10f16(x) = 0x1.1ccp-6 (RZ) - {0xbf0aU, 0x2473U, 1U, 0U, 0U}, - // x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ) - {0xc387U, 0x09a5U, 1U, 0U, 0U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ) - {0x4030U, 0x57c1U, 1U, 0U, 1U}, - // x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ) - {0x406eU, 0x591fU, 1U, 0U, 1U}, - // x = 0x1.1b8p+2, exp10f16(x) = 0x1.a4p+14 (RZ) - {0x446eU, 0x7690U, 1U, 0U, 1U}, -#endif - }}; -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - -LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) { - using FPBits = fputil::FPBits; - FPBits x_bits(x); - - uint16_t x_u = x_bits.uintval(); - uint16_t x_abs = x_u & 0x7fffU; - - // When |x| >= 5, or x is NaN. - if (LIBC_UNLIKELY(x_abs >= 0x4500U)) { - // exp10(NaN) = NaN - if (x_bits.is_nan()) { - if (x_bits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - return x; - } - - // When x >= 5. - if (x_bits.is_pos()) { - // exp10(+inf) = +inf - if (x_bits.is_inf()) - return FPBits::inf().get_val(); - - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_UPWARD: - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW); - return FPBits::inf().get_val(); - default: - return FPBits::max_normal().get_val(); - } - } - - // When x <= -8. 
- if (x_u >= 0xc800U) { - // exp10(-inf) = +0 - if (x_bits.is_inf()) - return FPBits::zero().get_val(); - - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); - - if (fputil::fenv_is_round_up()) - return FPBits::min_subnormal().get_val(); - return FPBits::zero().get_val(); - } - } - - // When x is 1, 2, 3, or 4. These are hard-to-round cases with exact results. - if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) { - switch (x_u) { - case 0x3c00U: // x = 1.0f16 - return fputil::cast(10.0); - case 0x4000U: // x = 2.0f16 - return fputil::cast(100.0); - case 0x4200U: // x = 3.0f16 - return fputil::cast(1'000.0); - case 0x4400U: // x = 4.0f16 - return fputil::cast(10'000.0); - } - } - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - // 10^x = 2^((hi + mid) * log2(10)) * 10^lo - auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x); - return fputil::cast(exp2_hi_mid * exp10_lo); -} +LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) { return math::exp10f16(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp index 545c479694811..6c2fdbea418df 100644 --- a/libc/src/math/generic/exp10m1f16.cpp +++ b/libc/src/math/generic/exp10m1f16.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/exp10m1f16.h" -#include "expxf16.h" #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/FEnvImpl.h" @@ -21,6 +20,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" #include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/exp10f16_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/expxf16.h 
b/libc/src/math/generic/expxf16.h index 05ac95d586823..b17b14fa2d756 100644 --- a/libc/src/math/generic/expxf16.h +++ b/libc/src/math/generic/expxf16.h @@ -17,18 +17,11 @@ #include "src/__support/macros/config.h" #include +#include "src/__support/math/exp10_float16_constants.h" #include "src/__support/math/expf16_utils.h" namespace LIBC_NAMESPACE_DECL { -// Generated by Sollya with the following commands: -// > display = hexadecimal; -// > for i from 0 to 7 do printsingle(round(2^(i * 2^-3), SG, RN)); -constexpr cpp::array EXP2_MID_BITS = { - 0x3f80'0000U, 0x3f8b'95c2U, 0x3f98'37f0U, 0x3fa5'fed7U, - 0x3fb5'04f3U, 0x3fc5'672aU, 0x3fd7'44fdU, 0x3fea'c0c7U, -}; - LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) { // For -25 < x < 16, to compute 2^x, we perform the following range reduction: // find hi, mid, lo, such that: @@ -66,53 +59,6 @@ LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) { return {exp2_hi_mid, exp2_lo}; } -// Generated by Sollya with the following commands: -// > display = hexadecimal; -// > round(log2(10), SG, RN); -static constexpr float LOG2F_10 = 0x1.a934fp+1f; - -// Generated by Sollya with the following commands: -// > display = hexadecimal; -// > round(log10(2), SG, RN); -static constexpr float LOG10F_2 = 0x1.344136p-2f; - -LIBC_INLINE ExpRangeReduction exp10_range_reduction(float16 x) { - // For -8 < x < 5, to compute 10^x, we perform the following range reduction: - // find hi, mid, lo, such that: - // x = (hi + mid) * log2(10) + lo, in which - // hi is an integer, - // mid * 2^3 is an integer, - // -2^(-4) <= lo < 2^(-4). - // In particular, - // hi + mid = round(x * 2^3) * 2^(-3). - // Then, - // 10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo - // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid - // by adding hi to the exponent field of 2^mid. 10^lo is computed using a - // degree-4 minimax polynomial generated by Sollya. 
- - float xf = x; - float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f)); - int x_hi_mid = static_cast(kf); - unsigned x_hi = static_cast(x_hi_mid) >> 3; - unsigned x_mid = static_cast(x_hi_mid) & 0x7; - // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x - float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf); - - uint32_t exp2_hi_mid_bits = - EXP2_MID_BITS[x_mid] + - static_cast(x_hi << fputil::FPBits::FRACTION_LEN); - float exp2_hi_mid = fputil::FPBits(exp2_hi_mid_bits).get_val(); - // Degree-4 minimax polynomial generated by Sollya with the following - // commands: - // > display = hexadecimal; - // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]); - // > 1 + x * P; - float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f, - 0x1.04b434p+1f, 0x1.2bcf9ep+0f); - return {exp2_hi_mid, exp10_lo}; -} - // Generated by Sollya with the following commands: // > display = hexadecimal; // > round(log2(exp(1)), SG, RN); diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index e3d807a46fe6a..f0b45a99aae40 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2073,6 +2073,7 @@ libc_support_library( ":__support_fputil_fp_bits", ":__support_fputil_nearest_integer", ":__support_math_expf16_utils", + ":__support_math_exp10_float16_constants", ], ) @@ -2276,6 +2277,38 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_math_exp10_float16_constants", + hdrs = ["src/__support/math/exp10_float16_constants.h"], + deps = [ + ":__support_cpp_array", + ], +) + +libc_support_library( + name = "__support_math_exp10f16_utils", + hdrs = ["src/__support/math/exp10f16_utils.h"], + deps = [ + ":__support_math_exp10_float16_constants", + ":__support_math_expf16_utils", + ":__support_fputil_fp_bits", + ], +) + +libc_support_library( + name = "__support_math_exp10f16", 
+ hdrs = ["src/__support/math/exp10f16.h"], + deps = [ + ":__support_math_exp10f16_utils", + ":__support_fputil_fp_bits", + ":__support_fputil_cast", + ":__support_fputil_rounding_mode", + ":__support_fputil_except_value_utils", + ":__support_macros_optimization", + ":__support_macros_properties_cpu_features", + ], +) + ############################### complex targets ################################ libc_function( @@ -2896,14 +2929,15 @@ libc_math_function( libc_math_function( name = "exp10f16", additional_deps = [ - ":expxf16", + ":__support_math_exp10f16", + ":errno", ], ) libc_math_function( name = "exp10m1f16", additional_deps = [ - ":expxf16", + ":__support_math_exp10f16_utils", ], ) From e11d28faee10dfb5ae6b8aaadadfd2ea1a2a446a Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 18 Jul 2025 13:05:08 -0400 Subject: [PATCH 369/813] [AMDGPU] Add support for `v_permlane16_swap_b32` on gfx1250 (#149518) Co-authored-by: Mekhanoshin, Stanislav --- .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 47 ++++++ llvm/lib/Target/AMDGPU/VOP1Instructions.td | 8 + llvm/lib/Target/AMDGPU/VOPInstructions.td | 11 +- .../AMDGPU/llvm.amdgcn.permlane16.swap.ll | 153 +++++++++++++++++- llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 6 + llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 6 + .../gfx1250_asm_vop3_from_vop1-fake16.s | 21 +++ .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 21 +++ 8 files changed, 270 insertions(+), 3 deletions(-) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index d42e51d04ab9d..4c3f308a6cf75 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -5,6 +5,7 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable typedef unsigned int uint; +typedef unsigned int __attribute__((ext_vector_type(2))) uint2; typedef half __attribute__((ext_vector_type(2))) half2; // CHECK-LABEL: @test_setprio_inc_wg( @@ -368,6 +369,52 @@ void 
test_cvt_pk_f16_bf8(global half2* out, short a) out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a); } +// CHECK-LABEL: @test_permlane16_swap( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[OLD_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[OLD_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OLD_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[OLD:%.*]], ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP0]], i32 [[TMP1]], i1 false, i1 false) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i32, i32 } [[TMP2]], 0 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP2]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i64 0 +// CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP4]], i64 1 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP6]], ptr addrspace(1) [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP8]], i32 [[TMP9]], i1 true, i1 false) +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i32, i32 } [[TMP10]], 0 +// CHECK-NEXT: 
[[TMP12:%.*]] = extractvalue { i32, i32 } [[TMP10]], 1 +// CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP11]], i64 0 +// CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP12]], i64 1 +// CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP14]], ptr addrspace(1) [[TMP15]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[OLD_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 [[TMP16]], i32 [[TMP17]], i1 false, i1 true) +// CHECK-NEXT: [[TMP19:%.*]] = extractvalue { i32, i32 } [[TMP18]], 0 +// CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i32, i32 } [[TMP18]], 1 +// CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> poison, i32 [[TMP19]], i64 0 +// CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP21]], i32 [[TMP20]], i64 1 +// CHECK-NEXT: [[TMP23:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i32> [[TMP22]], ptr addrspace(1) [[TMP23]], align 8 +// CHECK-NEXT: ret void +// +void test_permlane16_swap(global uint2* out, uint old, uint src) { + *out = __builtin_amdgcn_permlane16_swap(old, src, false, false); + *out = __builtin_amdgcn_permlane16_swap(old, src, true, false); + *out = __builtin_amdgcn_permlane16_swap(old, src, false, true); +} + // CHECK-LABEL: @test_cvt_f32_fp8_e5m3( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 3ee90857b34b8..80eb5d8b7d571 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -1080,6 +1080,13 @@ multiclass VOP1_Real_FULL_t16_and_fake16_gfx1250< VOP1_Real_FULL_with_name; } +multiclass VOP1_Real_OpSelIsDPP_gfx1250 op> : 
VOP1_Real_e32 { + defvar ps = !cast(NAME#"_e64"); + def _e64_gfx1250 : + VOP3_Real_Gen, + VOP3OpSelIsDPP_gfx12<{0, 1, 1, op{6-0}}, ps.Pfl>; +} + defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name; defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name; @@ -1147,6 +1154,7 @@ defm V_MOV_B64 : VOP1_Real_FULL ; defm V_TANH_F32 : VOP1_Real_FULL; defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>; +defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_PRNG_B32 : VOP1_Real_FULL; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 2b91ea7386be4..a25ebdf3e5f6d 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -331,10 +331,19 @@ class VOP3OpSel_gfx9 op, VOPProfile P> : VOP3e_vi { // Special case for v_permlane16_swap_b32/v_permlane32_swap_b32 // op_sel[0]/op_sel[1] are treated as bound_ctrl and fi dpp operands. 
-class VOP3OpSelIsDPP_gfx9 op, VOPProfile P> : VOP3e_vi { +class VOP3OpSelIsDPP_base { bits<1> fi; bits<1> bound_ctrl; +} + +class VOP3OpSelIsDPP_gfx9 op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_vi { + // OPSEL[0] specifies FI + let Inst{11} = fi; + // OPSEL[1] specifies BOUND_CTRL + let Inst{12} = bound_ctrl; +} +class VOP3OpSelIsDPP_gfx12 op, VOPProfile P> : VOP3OpSelIsDPP_base, VOP3e_gfx11_gfx12 { // OPSEL[0] specifies FI let Inst{11} = fi; // OPSEL[1] specifies BOUND_CTRL diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll index 814086685880d..ed6a02b62ae9a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s ; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR-SDAG %s ; RUN: not llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -filetype=null %s 2>&1 | FileCheck -check-prefix=ERR-GISEL %s @@ -17,6 +19,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv(i32 %vdst_old, i32 %src0_old) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: 
s_setpc_b64 s[30:31] +; GFX950-LABEL: v_permlane16_swap_b32_vv: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_permlane16_swap_b32_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false) ret { i32, i32 } %v } @@ -29,6 +43,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) { ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-LABEL: v_permlane16_swap_b32_vi: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v1, 1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_permlane16_swap_b32_vi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false) ret { i32, i32 } %v } @@ -41,6 +71,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) { ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-LABEL: v_permlane16_swap_b32_vl: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v1, 0xc1d1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_permlane16_swap_b32_vl: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0xc1d1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false) ret { i32, i32 } %v } @@ -54,6 +100,23 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) { ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-LABEL: v_permlane16_swap_b32_iv: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: v_mov_b32_e32 v0, 1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_permlane16_swap_b32_iv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false) ret { i32, i32 } %v } @@ -67,6 +130,23 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-LABEL: v_permlane16_swap_b32_ss: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: v_mov_b32_e32 v1, s1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_permlane16_swap_b32_ss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false) ret { i32, i32 } %v } @@ -80,6 +160,23 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-LABEL: v_permlane16_swap_b32_sv: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v1, v0 +; GFX950-NEXT: v_mov_b32_e32 v0, s0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_permlane16_swap_b32_sv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false) ret { i32, i32 } %v } @@ -92,6 +189,22 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-LABEL: v_permlane16_swap_b32_vs: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_mov_b32_e32 v1, s0 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_permlane16_swap_b32_e32 v0, v1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_permlane16_swap_b32_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v1, s0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane16_swap_b32_e32 v0, 
v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false) ret { i32, i32 } %v } @@ -102,6 +215,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_fi(i32 %vdst_old, i32 %src0_old) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-LABEL: v_permlane16_swap_b32_vv_fi: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_permlane16_swap_b32_vv_fi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 fi:1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 true, i1 false) ret { i32, i32 } %v } @@ -112,6 +237,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_bc(i32 %vdst_old, i32 %src0_old) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-LABEL: v_permlane16_swap_b32_vv_bc: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_permlane16_swap_b32_vv_bc: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 true) ret { i32, i32 } %v } @@ -122,6 +259,18 @@ define { i32, i32 } @v_permlane16_swap_b32_vv_fi_bc(i32 %vdst_old, i32 %src0_old ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: 
v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1 ; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX950-LABEL: v_permlane16_swap_b32_vv_fi_bc: +; GFX950: ; %bb.0: +; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX950-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1 +; GFX950-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: v_permlane16_swap_b32_vv_fi_bc: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane16_swap_b32_e64 v0, v1 bound_ctrl:1 fi:1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 true, i1 true) ret { i32, i32 } %v } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index 5f310a9954ad0..f2cf3d58fb0cf 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -627,3 +627,9 @@ v_cvt_f32_fp8_e32 v1, 3 v_cvt_f32_fp8_e32 v1, v3 // GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] + +v_permlane16_swap_b32 v1, v2 +// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] + +v_permlane16_swap_b32_e32 v1, v2 +// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index aa2e028f661e1..b1c4dc62edd6d 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -663,3 +663,9 @@ v_cvt_f32_fp8_e32 v1, 3 v_cvt_f32_fp8_e32 v1, v3 // GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] + +v_permlane16_swap_b32 v1, v2 +// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] + +v_permlane16_swap_b32_e32 v1, v2 +// GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s 
b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 9c6a9127d82e4..6b45930a53d73 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -720,3 +720,24 @@ v_cvt_pk_f16_fp8 v1, v150 op_sel:[1] v_cvt_pk_f16_fp8 v1, s2 op_sel:[1] // GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00] + +v_permlane16_swap_b32_e64 v1, v2 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32 v1, v2 bound_ctrl:0 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32 v1, v2 fi:0 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32 v1, v2 bound_ctrl:1 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32 v1, v2 fi:1 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32 v1, v2 bound_ctrl:1 fi:1 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 2f57d1c331c42..ad00832f7543d 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -750,3 +750,24 @@ v_cvt_pk_f16_fp8 v1, v150 op_sel:[1] v_cvt_pk_f16_fp8 v1, s2 op_sel:[1] // GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00] + +v_permlane16_swap_b32_e64 v1, v2 +// GFX1250: 
v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32 v1, v2 bound_ctrl:0 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32 v1, v2 fi:0 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32 v1, v2 bound_ctrl:1 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32 v1, v2 fi:1 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32 v1, v2 bound_ctrl:1 fi:1 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00] + +v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 +// GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00] From 2c50e4cac2c50dbbc9eb3ed78bc0178bfa26d23f Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 18 Jul 2025 13:08:50 -0400 Subject: [PATCH 370/813] [AMDGPU] Add support for `v_sat_pk4_i4_[i8,u8]` on gfx1250 (#149528) Co-authored-by: Mekhanoshin, Stanislav Co-authored-by: Foad, Jay --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 25 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 1 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 5 + .../CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll | 305 ++++++++++++++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s | 51 +++ llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 30 ++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s | 16 + llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s | 24 ++ .../MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s | 16 + llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s | 24 ++ 
.../gfx1250_asm_vop3_from_vop1-fake16.s | 24 ++ .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 30 ++ .../gfx1250_asm_vop3_from_vop1_dpp16-fake16.s | 16 + .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s | 24 ++ .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 16 + .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 24 ++ .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 42 +++ .../AMDGPU/gfx1250_dasm_vop1_dpp16.txt | 22 ++ .../AMDGPU/gfx1250_dasm_vop1_dpp8.txt | 22 ++ .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 56 ++++ .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 24 ++ .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 24 ++ 25 files changed, 831 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index a916af7e0c2df..d4fef5d46af73 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -684,6 +684,8 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts") TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_i4_i8, "UsUi", "nc", "gfx1250-insts") +TARGET_BUILTIN(__builtin_amdgcn_sat_pk4_u4_u8, "UsUi", "nc", "gfx1250-insts") // GFX1250 WMMA builtins TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index 4c3f308a6cf75..a21862c4a9395 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -5,6 +5,7 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable typedef unsigned int uint; +typedef unsigned short int ushort; 
typedef unsigned int __attribute__((ext_vector_type(2))) uint2; typedef half __attribute__((ext_vector_type(2))) half2; @@ -369,6 +370,30 @@ void test_cvt_pk_f16_bf8(global half2* out, short a) out[0] = __builtin_amdgcn_cvt_pk_f16_bf8(a); } +// CHECK-LABEL: @test_sat_pk4_i4_i8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr +// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP1]], ptr [[TMP2]], align 2 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i16 [[TMP4]], ptr [[TMP5]], align 2 +// CHECK-NEXT: ret void +// +void test_sat_pk4_i4_i8(ushort *out, uint src) +{ + *out = __builtin_amdgcn_sat_pk4_i4_i8(src); + *out = __builtin_amdgcn_sat_pk4_u4_u8(src); +} + // CHECK-LABEL: @test_permlane16_swap( // CHECK-NEXT: entry: // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index d8fda0e2bcfa3..ecda6c4efefe3 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3572,6 +3572,12 @@ def int_amdgcn_cvt_f16_bf8 : ClangBuiltin<"__builtin_amdgcn_cvt_f16_bf8">, 
[llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; +def int_amdgcn_sat_pk4_i4_i8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_i4_i8">, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; + +def int_amdgcn_sat_pk4_u4_u8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_u4_u8">, + DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; + //===----------------------------------------------------------------------===// // Special Intrinsics for backend internal use only. No frontend // should emit calls to these. diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index cbbb57c6f8122..bf2f37bddb9ed 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4558,6 +4558,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_cvt_pk_u16: case Intrinsic::amdgcn_cvt_pk_f16_fp8: case Intrinsic::amdgcn_cvt_pk_f16_bf8: + case Intrinsic::amdgcn_sat_pk4_i4_i8: + case Intrinsic::amdgcn_sat_pk4_u4_u8: case Intrinsic::amdgcn_fmed3: case Intrinsic::amdgcn_cubeid: case Intrinsic::amdgcn_cubema: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index ab7d34002e9f1..9e1951e2946c4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2850,6 +2850,7 @@ def VOP_F16_I16 : VOPProfile <[f16, i16, untyped, untyped]>; def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>; def VOP_I16_I16 : VOPProfile <[i16, i16, untyped, untyped]>; def VOP_BF16_BF16 : VOPProfile<[bf16, bf16, untyped, untyped]>; +def VOP1_I16_I32 : VOPProfile<[i16, i32, untyped, untyped]>; def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>; def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>; diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 
80eb5d8b7d571..f621f8581f778 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -803,6 +803,9 @@ let SubtargetPredicate = isGFX1250Plus in { def : Cvt_F_F8_Pat_ByteSel; def : Cvt_F_F8_Pat_ByteSel; } + + defm V_SAT_PK4_I4_I8 : VOP1Inst_t16<"v_sat_pk4_i4_i8", VOP1_I16_I32, int_amdgcn_sat_pk4_i4_i8>; + defm V_SAT_PK4_U4_U8 : VOP1Inst_t16<"v_sat_pk4_u4_u8", VOP1_I16_I32, int_amdgcn_sat_pk4_u4_u8>; } // End SubtargetPredicate = isGFX1250Plus let SubtargetPredicate = isGFX10Plus in { @@ -1158,6 +1161,8 @@ defm V_PERMLANE16_SWAP_B32 : VOP1_Real_OpSelIsDPP_gfx1250<0x049>; defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>; defm V_PRNG_B32 : VOP1_Real_FULL; defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">; +defm V_SAT_PK4_I4_I8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x073>; +defm V_SAT_PK4_U4_U8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x074>; defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>; defm V_CVT_PK_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x076>; defm V_CVT_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x077>; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll new file mode 100644 index 0000000000000..3a5507063b834 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll @@ -0,0 +1,305 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GISEL-REAL16 %s +; RUN: llc -global-isel=1 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GISEL-FAKE16 %s + +declare i16 @llvm.amdgcn.sat.pk4.i4.i8(i32) #0 +declare i16 @llvm.amdgcn.sat.pk4.u4.u8(i32) #0 + +define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 { +; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_v: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_clause 0x1 +; SDAG-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_v: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_clause 0x1 +; SDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s2 +; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm +; +; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_v: +; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_clause 0x1 +; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2 +; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-REAL16-NEXT: s_endpgm +; +; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_v: +; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_clause 0x1 +; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, s2 +; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-FAKE16-NEXT: s_endpgm + %cvt = call i16 
@llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0 + store i16 %cvt, ptr %out, align 2 + ret void +} + +define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 { +; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_s: +; SDAG-REAL16: ; %bb.1: +; SDAG-REAL16-NEXT: s_load_b32 s8, s[4:5], 0x0 +; SDAG-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-REAL16-NEXT: s_branch .LBB1_0 +; SDAG-REAL16-NEXT: .p2align 8 +; SDAG-REAL16-NEXT: ; %bb.2: +; SDAG-REAL16-NEXT: .LBB1_0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s8 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_s: +; SDAG-FAKE16: ; %bb.1: +; SDAG-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x0 +; SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-FAKE16-NEXT: s_branch .LBB1_0 +; SDAG-FAKE16-NEXT: .p2align 8 +; SDAG-FAKE16-NEXT: ; %bb.2: +; SDAG-FAKE16-NEXT: .LBB1_0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, s8 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm +; +; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_s: +; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_clause 0x1 +; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, s2 +; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-REAL16-NEXT: s_endpgm +; +; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_s: +; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_clause 0x1 +; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; 
GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, s2 +; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-FAKE16-NEXT: s_endpgm + %cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 %src) #0 + store i16 %cvt, ptr %out, align 2 + ret void +} + +define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 { +; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_i: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_i: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v1, 0x64 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm +; +; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_i: +; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GISEL-REAL16-NEXT: v_sat_pk4_i4_i8_e32 v0.l, 0x64 +; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-REAL16-NEXT: s_endpgm +; +; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_i: +; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GISEL-FAKE16-NEXT: v_sat_pk4_i4_i8_e32 v0, 0x64 +; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-FAKE16-NEXT: s_endpgm + %cvt = call i16 @llvm.amdgcn.sat.pk4.i4.i8(i32 100) #0 + store i16 %cvt, ptr %out, align 2 + ret void +} + +define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 { +; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_v: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_clause 0x1 +; SDAG-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; 
SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_v: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_clause 0x1 +; SDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s2 +; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm +; +; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_v: +; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_clause 0x1 +; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2 +; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-REAL16-NEXT: s_endpgm +; +; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_v: +; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_clause 0x1 +; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, s2 +; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-FAKE16-NEXT: s_endpgm + %cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0 + store i16 %cvt, ptr %out, align 2 + ret void +} + +define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 { +; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_s: +; SDAG-REAL16: ; %bb.1: +; SDAG-REAL16-NEXT: s_load_b32 s8, s[4:5], 0x0 +; SDAG-REAL16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-REAL16-NEXT: s_branch .LBB4_0 +; SDAG-REAL16-NEXT: .p2align 8 +; SDAG-REAL16-NEXT: ; %bb.2: +; SDAG-REAL16-NEXT: .LBB4_0: +; 
SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s8 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_s: +; SDAG-FAKE16: ; %bb.1: +; SDAG-FAKE16-NEXT: s_load_b32 s8, s[4:5], 0x0 +; SDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; SDAG-FAKE16-NEXT: s_branch .LBB4_0 +; SDAG-FAKE16-NEXT: .p2align 8 +; SDAG-FAKE16-NEXT: ; %bb.2: +; SDAG-FAKE16-NEXT: .LBB4_0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, s8 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm +; +; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_s: +; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_clause 0x1 +; GISEL-REAL16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, s2 +; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-REAL16-NEXT: s_endpgm +; +; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_s: +; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_clause 0x1 +; GISEL-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x0 +; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 +; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, s2 +; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-FAKE16-NEXT: s_endpgm + %cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 %src) #0 + store i16 %cvt, ptr %out, align 2 + ret void +} + +define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 { +; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_i: +; SDAG-REAL16: ; %bb.0: +; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 
0x64 +; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0 +; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; SDAG-REAL16-NEXT: s_endpgm +; +; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_i: +; SDAG-FAKE16: ; %bb.0: +; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; SDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; SDAG-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v1, 0x64 +; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-FAKE16-NEXT: flat_store_b16 v0, v1, s[0:1] +; SDAG-FAKE16-NEXT: s_endpgm +; +; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_i: +; GISEL-REAL16: ; %bb.0: +; GISEL-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GISEL-REAL16-NEXT: v_sat_pk4_u4_u8_e32 v0.l, 0x64 +; GISEL-REAL16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-REAL16-NEXT: s_wait_kmcnt 0x0 +; GISEL-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-REAL16-NEXT: s_endpgm +; +; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_i: +; GISEL-FAKE16: ; %bb.0: +; GISEL-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GISEL-FAKE16-NEXT: v_sat_pk4_u4_u8_e32 v0, 0x64 +; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-FAKE16-NEXT: flat_store_b16 v1, v0, s[0:1] +; GISEL-FAKE16-NEXT: s_endpgm + %cvt = call i16 @llvm.amdgcn.sat.pk4.u4.u8(i32 100) #0 + store i16 %cvt, ptr %out, align 2 + ret void +} + +attributes #0 = { nounwind memory(none) } +attributes #1 = { nounwind } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s index f2cf3d58fb0cf..811c6ebfe0161 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s @@ -628,6 +628,57 @@ v_cvt_f32_fp8_e32 v1, 3 v_cvt_f32_fp8_e32 v1, v3 // GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] +v_cvt_pk_f32_bf8_e32 v[2:3], s3 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[4:5], s5 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], s5 ; encoding: [0x05,0xde,0x08,0x7e] + 
+v_cvt_pk_f32_bf8_e32 v[2:3], 3 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[4:5], 3 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], 3 ; encoding: [0x83,0xde,0x08,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], v3 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[4:5], v3 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], v3 ; encoding: [0x03,0xdf,0x08,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], s3 +// GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], 3 +// GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], v3 +// GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e] + +v_sat_pk4_i4_i8 v1, v2 +// GFX1250: v_sat_pk4_i4_i8_e32 v1, v2 ; encoding: [0x02,0xe7,0x02,0x7e] + +v_sat_pk4_i4_i8 v1, s2 +// GFX1250: v_sat_pk4_i4_i8_e32 v1, s2 ; encoding: [0x02,0xe6,0x02,0x7e] + +v_sat_pk4_i4_i8 v1, 2 +// GFX1250: v_sat_pk4_i4_i8_e32 v1, 2 ; encoding: [0x82,0xe6,0x02,0x7e] + +v_sat_pk4_i4_i8 v1, 0x1234 +// GFX1250: v_sat_pk4_i4_i8_e32 v1, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00] + +v_sat_pk4_u4_u8 v1, v2 +// GFX1250: v_sat_pk4_u4_u8_e32 v1, v2 ; encoding: [0x02,0xe9,0x02,0x7e] + +v_sat_pk4_u4_u8 v1, s2 +// GFX1250: v_sat_pk4_u4_u8_e32 v1, s2 ; encoding: [0x02,0xe8,0x02,0x7e] + +v_sat_pk4_u4_u8 v1, 2 +// GFX1250: v_sat_pk4_u4_u8_e32 v1, 2 ; encoding: [0x82,0xe8,0x02,0x7e] + +v_sat_pk4_u4_u8 v1, 0x1234 +// GFX1250: v_sat_pk4_u4_u8_e32 v1, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00] + v_permlane16_swap_b32 v1, v2 // GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index b1c4dc62edd6d..3ddbc365224db 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -664,6 +664,36 @@ 
v_cvt_f32_fp8_e32 v1, 3 v_cvt_f32_fp8_e32 v1, v3 // GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] +v_sat_pk4_i4_i8 v1, v2 +// GFX1250: v_sat_pk4_i4_i8_e32 v1, v2 ; encoding: [0x02,0xe7,0x02,0x7e] + +v_sat_pk4_i4_i8 v1, s2 +// GFX1250: v_sat_pk4_i4_i8_e32 v1, s2 ; encoding: [0x02,0xe6,0x02,0x7e] + +v_sat_pk4_i4_i8 v1, 2 +// GFX1250: v_sat_pk4_i4_i8_e32 v1, 2 ; encoding: [0x82,0xe6,0x02,0x7e] + +v_sat_pk4_i4_i8 v1, 0x1234 +// GFX1250: v_sat_pk4_i4_i8_e32 v1, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00] + +v_sat_pk4_i4_i8 v1.h, v2 +// GFX1250: v_sat_pk4_i4_i8_e32 v1.h, v2 ; encoding: [0x02,0xe7,0x02,0x7f] + +v_sat_pk4_u4_u8 v1, v2 +// GFX1250: v_sat_pk4_u4_u8_e32 v1, v2 ; encoding: [0x02,0xe9,0x02,0x7e] + +v_sat_pk4_u4_u8 v1, s2 +// GFX1250: v_sat_pk4_u4_u8_e32 v1, s2 ; encoding: [0x02,0xe8,0x02,0x7e] + +v_sat_pk4_u4_u8 v1, 2 +// GFX1250: v_sat_pk4_u4_u8_e32 v1, 2 ; encoding: [0x82,0xe8,0x02,0x7e] + +v_sat_pk4_u4_u8 v1, 0x1234 +// GFX1250: v_sat_pk4_u4_u8_e32 v1, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00] + +v_sat_pk4_u4_u8 v1.h, v2 +// GFX1250: v_sat_pk4_u4_u8_e32 v1.h, v2 ; encoding: [0x02,0xe9,0x02,0x7f] + v_permlane16_swap_b32 v1, v2 // GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s index e1cd2e3043693..7386df87f8dab 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s @@ -693,3 +693,19 @@ v_cvt_pk_f16_bf8 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 v_cvt_pk_f16_fp8 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 // GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xea,0x02,0x7e,0x02,0xe4,0x04,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v1, v2 
quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s index c1d3238b65cbd..0a46f2f074e10 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s @@ -749,3 +749,27 @@ v_cvt_pk_f16_fp8 v1, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 v_cvt_pk_f16_fp8 v1, v2.h quad_perm:[0,1,2,3] // GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2.h quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff] +// GFX12-ERR: 
:[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_i4_i8_dpp v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7f,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 +// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_u4_u8_dpp v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7f,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s index 100e9f92ff58b..e2763090a8d15 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8-fake16.s @@ -169,3 +169,19 @@ v_cvt_pk_f16_bf8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_pk_f16_fp8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] 
fi:1 ; encoding: [0xea,0xea,0x02,0x7e,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s index 2ae103545443c..359aadc49ccc4 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp8.s @@ -225,3 +225,27 @@ v_cvt_pk_f16_fp8 v1, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_pk_f16_fp8 v1, v2.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_pk_f16_fp8_dpp v1, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: 
v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_i4_i8_dpp v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7f,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_u4_u8_dpp v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7f,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index 6b45930a53d73..aa4e49d85f1ff 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -721,6 +721,30 @@ v_cvt_pk_f16_fp8 v1, v150 op_sel:[1] v_cvt_pk_f16_fp8 v1, s2 op_sel:[1] // GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00] +v_sat_pk4_i4_i8 v150, v2 +// GFX1250: v_sat_pk4_i4_i8_e64 v150, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00] + +v_sat_pk4_i4_i8 v150, s2 +// GFX1250: v_sat_pk4_i4_i8_e64 v150, s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00] + +v_sat_pk4_i4_i8 v150, 2 +// GFX1250: v_sat_pk4_i4_i8_e64 v150, 2 ; encoding: 
[0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00] + +v_sat_pk4_i4_i8 v150, 0x1234 +// GFX1250: v_sat_pk4_i4_i8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] + +v_sat_pk4_u4_u8 v150, v2 +// GFX1250: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00] + +v_sat_pk4_u4_u8 v150, s2 +// GFX1250: v_sat_pk4_u4_u8_e64 v150, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00] + +v_sat_pk4_u4_u8 v150, 2 +// GFX1250: v_sat_pk4_u4_u8_e64 v150, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00] + +v_sat_pk4_u4_u8 v150, 0x1234 +// GFX1250: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] + v_permlane16_swap_b32_e64 v1, v2 // GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index ad00832f7543d..8f0c43de07077 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -751,6 +751,36 @@ v_cvt_pk_f16_fp8 v1, v150 op_sel:[1] v_cvt_pk_f16_fp8 v1, s2 op_sel:[1] // GFX1250: v_cvt_pk_f16_fp8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x00,0x00,0x00] +v_sat_pk4_i4_i8 v150, v2 +// GFX1250: v_sat_pk4_i4_i8_e64 v150, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00] + +v_sat_pk4_i4_i8 v150, s2 +// GFX1250: v_sat_pk4_i4_i8_e64 v150, s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00] + +v_sat_pk4_i4_i8 v150, 2 +// GFX1250: v_sat_pk4_i4_i8_e64 v150, 2 ; encoding: [0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00] + +v_sat_pk4_i4_i8 v150, 0x1234 +// GFX1250: v_sat_pk4_i4_i8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] + +v_sat_pk4_i4_i8 v150.h, v2 +// GFX1250: v_sat_pk4_i4_i8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf3,0xd5,0x02,0x01,0x00,0x00] + +v_sat_pk4_u4_u8 v150, v2 
+// GFX1250: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00] + +v_sat_pk4_u4_u8 v150, s2 +// GFX1250: v_sat_pk4_u4_u8_e64 v150, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00] + +v_sat_pk4_u4_u8 v150, 2 +// GFX1250: v_sat_pk4_u4_u8_e64 v150, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00] + +v_sat_pk4_u4_u8 v150, 0x1234 +// GFX1250: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] + +v_sat_pk4_u4_u8 v150.h, v2 +// GFX1250: v_sat_pk4_u4_u8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf4,0xd5,0x02,0x01,0x00,0x00] + v_permlane16_swap_b32_e64 v1, v2 // GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s index 29bb842b529b7..b21fca654590a 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16-fake16.s @@ -717,3 +717,19 @@ v_cvt_pk_f16_fp8 v1, v128 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 v_cvt_pk_f16_fp8 v1, v2 op_sel:[1] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 // GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v2 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x02,0xe4,0x04,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v150, v2 row_share:1 fi:1 +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: 
[0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v150, v2 row_share:1 fi:1 +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s index 7df92751c38d1..f14705fa9143c 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp16.s @@ -765,3 +765,27 @@ v_cvt_pk_f16_fp8 v1, v128.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf fi:1 v_cvt_pk_f16_fp8 v1, v128.h quad_perm:[0,1,2,3] // GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v150, v2 row_share:1 fi:1 +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: 
error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v150.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150.h, v2 op_sel:[0,1] quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x40,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v150, v2 row_share:1 fi:1 +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v150.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150.h, v2 op_sel:[0,1] quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x40,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index d235aeb9f3e62..b2c2943e2a182 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -253,3 +253,19 @@ v_cvt_pk_f16_fp8 v1, v128 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_pk_f16_fp8 v1, v2 op_sel:[1] dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v2 op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + 
+v_sat_pk4_i4_i8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index f25e2a5882436..e3c7c0f8cbc81 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -301,3 +301,27 @@ v_cvt_pk_f16_fp8 v1, v128.l dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cvt_pk_f16_fp8 v1, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not 
supported on this GPU + +v_sat_pk4_i4_i8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_i4_i8 v150.h, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_i4_i8_e64_dpp v150.h, v2 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x40,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_sat_pk4_u4_u8 v150.h, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_sat_pk4_u4_u8_e64_dpp v150.h, v2 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x40,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index aa968b2bb2bee..5b905820844af 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -809,3 +809,45 @@ 0x03,0xd9,0x02,0x7e # GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] + +0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.l, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00] +# GFX1250-FAKE16: 
v_sat_pk4_i4_i8_e32 v1, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00] + +0x82,0xe6,0x02,0x7e +# GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.l, 2 ; encoding: [0x82,0xe6,0x02,0x7e] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e32 v1, 2 ; encoding: [0x82,0xe6,0x02,0x7e] + +0x02,0xe6,0x02,0x7e +# GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.l, s2 ; encoding: [0x02,0xe6,0x02,0x7e] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e32 v1, s2 ; encoding: [0x02,0xe6,0x02,0x7e] + +0x02,0xe7,0x02,0x7e +# GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.l, v2 ; encoding: [0x02,0xe7,0x02,0x7e] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e32 v1, v2 ; encoding: [0x02,0xe7,0x02,0x7e] + +0x02,0xe7,0x02,0x7f +# GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.h, v2 ; encoding: [0x02,0xe7,0x02,0x7f] + +0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00] + +0x82,0xe8,0x02,0x7e +# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, 2 ; encoding: [0x82,0xe8,0x02,0x7e] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, 2 ; encoding: [0x82,0xe8,0x02,0x7e] + +0x02,0xe8,0x02,0x7e +# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, s2 ; encoding: [0x02,0xe8,0x02,0x7e] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, s2 ; encoding: [0x02,0xe8,0x02,0x7e] + +0x02,0xe9,0x02,0x7e +# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, v2 ; encoding: [0x02,0xe9,0x02,0x7e] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, v2 ; encoding: [0x02,0xe9,0x02,0x7e] + +0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00] + +0x02,0xe9,0x02,0x7f +# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.h, v2 ; encoding: [0x02,0xe9,0x02,0x7f] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt 
b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt index 913a2a916ff62..c12ecb8d868aa 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp16.txt @@ -708,3 +708,25 @@ 0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff # GFX1250-REAL16: v_cvt_pk_f16_fp8_dpp v1, v2.h quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff] # GFX1250-FAKE16: v_cvt_pk_f16_fp8_dpp v1, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xea,0x02,0x7e,0x82,0xe4,0x00,0xff] + +0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff +# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x00,0xff] + +0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff +# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe6,0x02,0x7e,0x02,0x39,0x04,0xff] + +0xfa,0xe6,0x02,0x7f,0x02,0x39,0x00,0xff +# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe6,0x02,0x7f,0x02,0x39,0x00,0xff] + +0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff +# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x00,0xff] + +0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff +# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.l, v2 quad_perm:[1,2,3,0] 
row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_dpp v1, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0xfa,0xe8,0x02,0x7e,0x02,0x39,0x04,0xff] + +0xfa,0xe8,0x02,0x7f,0x02,0x39,0x00,0xff +# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.h, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xe8,0x02,0x7f,0x02,0x39,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt index 4afe44e241bf3..d3706f975e914 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1_dpp8.txt @@ -208,3 +208,25 @@ 0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05 # GFX1250-REAL16: v_cvt_pk_f16_fp8_dpp v1, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05] # GFX1250-FAKE16: v_cvt_pk_f16_fp8_dpp v1, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xea,0x02,0x7e,0x82,0x77,0x39,0x05] + +0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05] + +0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe6,0x02,0x7e,0x02,0x77,0x39,0x05] + +0xe9,0xe6,0x02,0x7f,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_i4_i8_dpp v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe6,0x02,0x7f,0x02,0x77,0x39,0x05] + +0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05] + +0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0xe8,0x02,0x7e,0x02,0x77,0x39,0x05] + +0xe9,0xe8,0x02,0x7f,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_u4_u8_dpp v1.h, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xe8,0x02,0x7f,0x02,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 1cf3b8807d044..1719592c3dccd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -924,3 +924,59 @@ 0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00 # GFX1250-REAL16: v_cvt_pk_f16_fp8 v1, v2.h op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00] # GFX1250-FAKE16: v_cvt_pk_f16_fp8 v1, v2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0x02,0x01,0x00,0x00] + +0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf3,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] + +0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64 v150.l, 2 ; encoding: [0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64 v150, 2 ; encoding: [0x96,0x00,0xf3,0xd5,0x82,0x00,0x00,0x00] + +0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64 v150.l, s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64 v150, 
s2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x00,0x00,0x00] + +0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64 v150.l, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64 v150, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00] + +0x96,0x40,0xf3,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf3,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64 v150, v2 ; encoding: [0x96,0x00,0xf3,0xd5,0x02,0x01,0x00,0x00] + +0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] + +0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, 2 ; encoding: [0x96,0x00,0xf4,0xd5,0x82,0x00,0x00,0x00] + +0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, s2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x00,0x00,0x00] + +0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00] + +0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00 +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] + +0x96,0x40,0xf4,0xd5,0x02,0x01,0x00,0x00 +# 
GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf4,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00] + +0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_permlane16_swap_b32_e64 v1, v2 ; encoding: [0x01,0x00,0xc9,0xd5,0x02,0x01,0x00,0x00] + +0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 ; encoding: [0x01,0x10,0xc9,0xd5,0x02,0x01,0x00,0x00] + +0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_permlane16_swap_b32_e64 v1, v2 bound_ctrl:1 fi:1 ; encoding: [0x01,0x18,0xc9,0xd5,0x02,0x01,0x00,0x00] + +0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_permlane16_swap_b32_e64 v1, v2 fi:1 ; encoding: [0x01,0x08,0xc9,0xd5,0x02,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index 83a647ad7c658..34d2104a660d8 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -737,3 +737,27 @@ 0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff # GFX1250-REAL16: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff] # GFX1250-FAKE16: v_cvt_pk_f16_fp8_e64_dpp v1, v128 op_sel:[1,0] quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x08,0xf5,0xd5,0xfa,0x00,0x00,0x00,0x80,0xe4,0x00,0xff] + +0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] + +0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] + +0x96,0x40,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.h, v2 op_sel:[0,1] quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x40,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf3,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] + +0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] + +0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 row_share:1 row_mask:0xf bank_mask:0xf fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x51,0x05,0xff] + +0x96,0x40,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.h, v2 op_sel:[0,1] quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x40,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] +# GFX1250-FAKE16: 
v_sat_pk4_u4_u8_e64_dpp v150, v2 quad_perm:[1,2,3,0] row_mask:0xf bank_mask:0xf ; encoding: [0x96,0x00,0xf4,0xd5,0xfa,0x00,0x00,0x00,0x02,0x39,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index ef5ede4d1d453..867fee512b424 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -289,3 +289,27 @@ 0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 # GFX1250-REAL16: v_cvt_pk_f16_fp8_e64_dpp v1, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_cvt_pk_f16_fp8_e64_dpp v1, v128 op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x08,0xf5,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + +0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] + +0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf3,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] + +0x96,0x40,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_i4_i8_e64_dpp v150.h, v2 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x40,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sat_pk4_i4_i8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x96,0x00,0xf3,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] + +0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] + +0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x00,0xf4,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] + +0x96,0x40,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05 +# GFX1250-REAL16: v_sat_pk4_u4_u8_e64_dpp v150.h, v2 op_sel:[0,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x40,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf4,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] From ff225b5d88647448be8bbba54aaac3977a5485b5 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 18 Jul 2025 09:51:08 -0700 Subject: [PATCH 371/813] [SLP][NFC]Add a run line for the test, NFC --- .../RISCV/reordered-buildvector-scalars.ll | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll index f7e629f7212e9..d4e323819402c 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S --passes=slp-vectorizer 
-mtriple=riscv64-unknown-linux-gnu -mcpu=sifive-x280 < %s | FileCheck %s +; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mcpu=sifive-x280 < %s -slp-threshold=-3 | FileCheck %s --check-prefix=THRESH %struct.ImageParameters = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr, i32, ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [9 x [16 x [16 x i16]]], [5 x [16 x [16 x i16]]], [9 x [8 x [8 x i16]]], [2 x [4 x [16 x [16 x i16]]]], [16 x [16 x i16]], [16 x [16 x i32]], ptr, ptr, ptr, ptr, ptr, [1200 x %struct.syntaxelement], ptr, ptr, i32, i32, i32, i32, [4 x [4 x i32]], i32, i32, i32, i32, i32, double, i32, i32, i32, i32, ptr, ptr, ptr, ptr, [15 x i16], i32, i32, i32, i32, i32, i32, i32, i32, [6 x [15 x i32]], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [1 x i32], i32, i32, [2 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, ptr, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, ptr, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [2 x i32], i32, i32, i32 } %struct.syntaxelement = type { i32, i32, i32, i32, i32, i32, i32, i32, ptr, ptr } @@ -94,6 +95,89 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72 ; CHECK-NEXT: store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2 ; CHECK-NEXT: ret i32 0 ; +; THRESH-LABEL: define fastcc i32 @test( +; THRESH-SAME: i32 [[TMP0:%.*]], i32 [[ADD111_I_I:%.*]], <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { +; THRESH-NEXT: [[ENTRY:.*:]] +; THRESH-NEXT: [[LOOPARRAY_SROA_24_0_I_I3:%.*]] = ashr i32 [[TMP0]], 1 +; THRESH-NEXT: [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1 +; THRESH-NEXT: [[ADD1392_I:%.*]] 
= add i32 [[TMP0]], 1 +; THRESH-NEXT: [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1 +; THRESH-NEXT: [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]] +; THRESH-NEXT: [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1 +; THRESH-NEXT: [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16 +; THRESH-NEXT: [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2 +; THRESH-NEXT: [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2 +; THRESH-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16 +; THRESH-NEXT: [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1 +; THRESH-NEXT: [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]] +; THRESH-NEXT: [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16 +; THRESH-NEXT: [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1 +; THRESH-NEXT: [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]] +; THRESH-NEXT: [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16 +; THRESH-NEXT: [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1 +; THRESH-NEXT: [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1 +; THRESH-NEXT: [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1 +; THRESH-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4 +; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8 +; THRESH-NEXT: [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]] +; THRESH-NEXT: [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1 +; THRESH-NEXT: [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4 +; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8 +; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4 +; THRESH-NEXT: [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1 +; THRESH-NEXT: [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1 +; THRESH-NEXT: 
[[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8 +; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4 +; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8 +; THRESH-NEXT: [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1 +; THRESH-NEXT: [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]] +; THRESH-NEXT: [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1 +; THRESH-NEXT: [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4 +; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8 +; THRESH-NEXT: [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1 +; THRESH-NEXT: [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1 +; THRESH-NEXT: [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8 +; THRESH-NEXT: [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1 +; THRESH-NEXT: [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]] +; THRESH-NEXT: [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8 +; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> +; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0 +; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1) +; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1) +; THRESH-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> +; THRESH-NEXT: store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4 +; 
THRESH-NEXT: [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1 +; THRESH-NEXT: [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]] +; THRESH-NEXT: [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16 +; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2 +; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2 +; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2 +; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2 +; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2 +; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> +; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0 +; THRESH-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1) +; THRESH-NEXT: [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1) +; THRESH-NEXT: [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16> +; THRESH-NEXT: [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1 +; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4 +; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8 +; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4 +; THRESH-NEXT: store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2 +; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2 +; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2 +; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr 
@images, i64 8186), align 2 +; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2 +; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2 +; THRESH-NEXT: store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2 +; THRESH-NEXT: ret i32 0 +; entry: %LoopArray.sroa.24.0.i.i3 = ashr i32 %0, 1 %shr143.5.i.i9 = ashr i32 %0, 1 From 7fd91bb6e89be39a130e04058a01d41ae5d600cb Mon Sep 17 00:00:00 2001 From: Jaden Angella Date: Fri, 18 Jul 2025 10:15:05 -0700 Subject: [PATCH 372/813] [mlir][EmitC]Expand the MemRefToEmitC pass - Adding scalars (#148055) This aims to expand the the MemRefToEmitC pass so that it can accept global scalars. From: ``` memref.global "private" constant @__constant_xi32 : memref = dense<-1> func.func @globals() { memref.get_global @__constant_xi32 : memref } ``` To: ``` emitc.global static const @__constant_xi32 : i32 = -1 emitc.func @globals() { %0 = get_global @__constant_xi32 : !emitc.lvalue %1 = apply "&"(%0) : (!emitc.lvalue) -> !emitc.ptr return } ``` --- .../MemRefToEmitC/MemRefToEmitC.cpp | 36 +++++++++++++++++-- .../MemRefToEmitC/memref-to-emitc.mlir | 5 +++ 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp index db244d1d1cac8..0b7ffa40ec09d 100644 --- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp +++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp @@ -16,7 +16,9 @@ #include "mlir/Dialect/EmitC/IR/EmitC.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/IR/TypeRange.h" #include "mlir/Transforms/DialectConversion.h" using namespace mlir; @@ -77,13 +79,23 @@ struct ConvertAlloca final : public OpConversionPattern { } }; +Type convertMemRefType(MemRefType opTy, 
const TypeConverter *typeConverter) { + Type resultTy; + if (opTy.getRank() == 0) { + resultTy = typeConverter->convertType(mlir::getElementTypeOrSelf(opTy)); + } else { + resultTy = typeConverter->convertType(opTy); + } + return resultTy; +} + struct ConvertGlobal final : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult matchAndRewrite(memref::GlobalOp op, OpAdaptor operands, ConversionPatternRewriter &rewriter) const override { - + MemRefType opTy = op.getType(); if (!op.getType().hasStaticShape()) { return rewriter.notifyMatchFailure( op.getLoc(), "cannot transform global with dynamic shape"); @@ -95,7 +107,9 @@ struct ConvertGlobal final : public OpConversionPattern { op.getLoc(), "global variable with alignment requirement is " "currently not supported"); } - auto resultTy = getTypeConverter()->convertType(op.getType()); + + Type resultTy = convertMemRefType(opTy, getTypeConverter()); + if (!resultTy) { return rewriter.notifyMatchFailure(op.getLoc(), "cannot convert result type"); @@ -114,6 +128,10 @@ struct ConvertGlobal final : public OpConversionPattern { bool externSpecifier = !staticSpecifier; Attribute initialValue = operands.getInitialValueAttr(); + if (opTy.getRank() == 0) { + auto elementsAttr = llvm::cast(*op.getInitialValue()); + initialValue = elementsAttr.getSplatValue(); + } if (isa_and_present(initialValue)) initialValue = {}; @@ -132,11 +150,23 @@ struct ConvertGetGlobal final matchAndRewrite(memref::GetGlobalOp op, OpAdaptor operands, ConversionPatternRewriter &rewriter) const override { - auto resultTy = getTypeConverter()->convertType(op.getType()); + MemRefType opTy = op.getType(); + Type resultTy = convertMemRefType(opTy, getTypeConverter()); + if (!resultTy) { return rewriter.notifyMatchFailure(op.getLoc(), "cannot convert result type"); } + + if (opTy.getRank() == 0) { + emitc::LValueType lvalueType = emitc::LValueType::get(resultTy); + emitc::GetGlobalOp globalLValue = rewriter.create( + 
op.getLoc(), lvalueType, operands.getNameAttr()); + emitc::PointerType pointerType = emitc::PointerType::get(resultTy); + rewriter.replaceOpWithNewOp( + op, pointerType, rewriter.getStringAttr("&"), globalLValue); + return success(); + } rewriter.replaceOpWithNewOp(op, resultTy, operands.getNameAttr()); return success(); diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir index d37fd1de90add..2b4eda37903d4 100644 --- a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir +++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc.mlir @@ -41,6 +41,8 @@ func.func @memref_load(%buff : memref<4x8xf32>, %i: index, %j: index) -> f32 { module @globals { memref.global "private" constant @internal_global : memref<3x7xf32> = dense<4.0> // CHECK-NEXT: emitc.global static const @internal_global : !emitc.array<3x7xf32> = dense<4.000000e+00> + memref.global "private" constant @__constant_xi32 : memref = dense<-1> + // CHECK-NEXT: emitc.global static const @__constant_xi32 : i32 = -1 memref.global @public_global : memref<3x7xf32> // CHECK-NEXT: emitc.global extern @public_global : !emitc.array<3x7xf32> memref.global @uninitialized_global : memref<3x7xf32> = uninitialized @@ -50,6 +52,9 @@ module @globals { func.func @use_global() { // CHECK-NEXT: emitc.get_global @public_global : !emitc.array<3x7xf32> %0 = memref.get_global @public_global : memref<3x7xf32> + // CHECK-NEXT: emitc.get_global @__constant_xi32 : !emitc.lvalue + // CHECK-NEXT: emitc.apply "&"(%1) : (!emitc.lvalue) -> !emitc.ptr + %1 = memref.get_global @__constant_xi32 : memref return } } From 87c2adbb589d4cd0b6dfb374fce24d29c6bafac0 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Jul 2025 10:33:59 -0700 Subject: [PATCH 373/813] [RISCV][IA] Precommit tests for deinterleaveN of masked.load --- .../RISCV/rvv/vector-deinterleave-load.ll | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git 
a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll index 9af92aa995f1f..578b67e284c5c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll @@ -538,3 +538,164 @@ define { , , , , , , , , , , } %res6, %t7, 7 ret { , , , , , , , } %res7 } + +define {, } @masked_load_factor2(ptr %p) { +; CHECK-LABEL: masked_load_factor2: +; CHECK: # %bb.0: +; CHECK-NEXT: vl4r.v v12, (a0) +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v12, 0 +; CHECK-NEXT: vnsrl.wi v10, v12, 8 +; CHECK-NEXT: ret + %vec = call @llvm.masked.load(ptr %p, i32 4, splat (i1 true), poison) + %deinterleaved.results = call {, } @llvm.vector.deinterleave2.nxv32i8( %vec) + ret {, } %deinterleaved.results +} + +define {, , , } @masked_loat_factor4(ptr %p) { +; CHECK-LABEL: masked_loat_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 2 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb +; CHECK-NEXT: vl4r.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg4e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %vec = call @llvm.masked.load(ptr %p, i32 4, splat (i1 true), poison) + %deinterleaved.results = call {, , , } @llvm.vector.deinterleave4.nxv32i8( %vec) + ret {, , , } %deinterleaved.results +} + +define {, , , } @masked_loat_factor4_mask(ptr %p, %mask) { +; CHECK-LABEL: masked_loat_factor4_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: 
.cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: add a3, a1, a2 +; CHECK-NEXT: vmv.v.v v9, v8 +; CHECK-NEXT: srli a4, a2, 2 +; CHECK-NEXT: vmv.v.v v10, v8 +; CHECK-NEXT: srli a5, a2, 3 +; CHECK-NEXT: vmv.v.v v11, v8 +; CHECK-NEXT: vsseg4e8.v v8, (a1) +; CHECK-NEXT: vl1r.v v8, (a1) +; CHECK-NEXT: add a1, a4, a5 +; CHECK-NEXT: vl1r.v v9, (a3) +; CHECK-NEXT: add a3, a3, a2 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vl1r.v v10, (a3) +; CHECK-NEXT: vl1r.v v11, (a2) +; CHECK-NEXT: vmsne.vi v9, v9, 0 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: vmsne.vi v10, v11, 0 +; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v0, v9, a5 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v0, v8, a4 +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v0, v10, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg4e8.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave4.nxv32i1( %mask, %mask, %mask, %mask) + %vec = call @llvm.masked.load(ptr %p, i32 4, %interleaved.mask, poison) + %deinterleaved.results = call {, , 
, } @llvm.vector.deinterleave4.nxv32i8( %vec) + ret {, , , } %deinterleaved.results +} + +; Negative test - some of the deinterleaved elements might come from the +; passthru not the load +define {, , , } @masked_loat_factor4_passthru(ptr %p, %mask, %passthru) { +; CHECK-LABEL: masked_loat_factor4_passthru: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 +; CHECK-NEXT: add a3, a1, a2 +; CHECK-NEXT: vmv.v.v v13, v12 +; CHECK-NEXT: srli a4, a2, 2 +; CHECK-NEXT: vmv.v.v v14, v12 +; CHECK-NEXT: srli a5, a2, 3 +; CHECK-NEXT: vmv.v.v v15, v12 +; CHECK-NEXT: vsseg4e8.v v12, (a1) +; CHECK-NEXT: vl1r.v v12, (a1) +; CHECK-NEXT: add a1, a4, a5 +; CHECK-NEXT: vl1r.v v13, (a3) +; CHECK-NEXT: add a3, a3, a2 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vl1r.v v14, (a3) +; CHECK-NEXT: vl1r.v v15, (a2) +; CHECK-NEXT: vmsne.vi v13, v13, 0 +; CHECK-NEXT: vmsne.vi v0, v12, 0 +; CHECK-NEXT: vmsne.vi v12, v14, 0 +; CHECK-NEXT: vmsne.vi v14, v15, 0 +; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v0, v13, a5 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v0, v12, a4 +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v0, v14, a1 +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, mu +; CHECK-NEXT: vle8.v v8, (a0), v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs4r.v v8, (a0) +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vlseg4e8.v v8, (a0) +; CHECK-NEXT: csrr a0, 
vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret + %interleaved.mask = tail call @llvm.vector.interleave4.nxv32i1( %mask, %mask, %mask, %mask) + %vec = call @llvm.masked.load(ptr %p, i32 4, %interleaved.mask, %passthru) + %deinterleaved.results = call {, , , } @llvm.vector.deinterleave4.nxv32i8( %vec) + ret {, , , } %deinterleaved.results +} From 3ea6da59ecda0708d85f78f1feb4090a8551ce90 Mon Sep 17 00:00:00 2001 From: Han-Chung Wang Date: Fri, 18 Jul 2025 10:42:42 -0700 Subject: [PATCH 374/813] [mlir][linalg] Allow pack consumer fusion if the tile size is greater than dimension size. (#149438) This happens only when you use larger tile size, which is greater than or equal to the dimension size. In this case, it is a full slice, so it is fusible. The IR can be generated during the TileAndFuse process. It is hard to fix in such driver, so we enable the naive fusion for the case. --------- Signed-off-by: hanhanW --- .../Linalg/Transforms/TilingInterfaceImpl.cpp | 6 ++- .../tile-and-fuse-consumer.mlir | 50 +++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp index b059bcc025315..28d99b130963a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TilingInterfaceImpl.cpp @@ -911,14 +911,16 @@ struct PackOpTiling // If a dimension is not tiled, it is always valid to fuse the pack op, // even if the op has padding semantics. Because it always generates a - // full slice along the dimension. + // full slice along the dimension. The tile sizes are for unpacked + // domain, i.e., `srcDimSize`, so `tileSize < srcDimSize` means that the + // dimension is tiled. // TODO: It could be untiled if the `srcDimSize` is dynamic. 
It is a // hard check to determine if a dimension is tiled or not. int64_t srcDimSize = packOp.getSourceType().getDimSize(dim); int64_t destDimSize = outerShapeWithoutTranspose[dim]; bool isTiled = failed(cstTileSize) || ShapedType::isDynamic(srcDimSize) || - cstTileSize.value() != srcDimSize; + cstTileSize.value() < srcDimSize; if (!isTiled) { outerDimOffsets.push_back(offsets[dim]); if (ShapedType::isStatic(destDimSize)) { diff --git a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir index 20164d5dfd91a..cdbca7228ded3 100644 --- a/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir +++ b/mlir/test/Interfaces/TilingInterface/tile-and-fuse-consumer.mlir @@ -451,6 +451,56 @@ module attributes {transform.with_named_sequence} { // ----- +#map = affine_map<(d0) -> (-d0 + 4, 16)> +func.func @fuse_pack_consumer_if_single_iteration(%arg0: tensor<4x4xf32>) -> tensor<1x4x16x1xf32> { + %0 = tensor.empty() : tensor<1x4x16x1xf32> + %1 = tensor.empty() : tensor<4x4xf32> + %2 = scf.forall (%arg1) = (0) to (4) step (16) shared_outs(%arg2 = %1) -> (tensor<4x4xf32>) { + %3 = affine.min #map(%arg1) + %extracted_slice = tensor.extract_slice %arg0[%arg1, 0] [%3, 4] [1, 1] : tensor<4x4xf32> to tensor + %extracted_slice_0 = tensor.extract_slice %arg2[%arg1, 0] [%3, 4] [1, 1] : tensor<4x4xf32> to tensor + %4 = linalg.exp ins(%extracted_slice : tensor) outs(%extracted_slice_0 : tensor) -> tensor + scf.forall.in_parallel { + tensor.parallel_insert_slice %4 into %arg2[%arg1, 0] [%3, 4] [1, 1] : tensor into tensor<4x4xf32> + } + } + %cst = arith.constant 0.000000e+00 : f32 + %pack = linalg.pack %2 padding_value(%cst : f32) outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] into %0 : tensor<4x4xf32> -> tensor<1x4x16x1xf32> + return %pack : tensor<1x4x16x1xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: 
!transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["scf.forall"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %consumer, %fused_consumer = transform.test.fuse_consumer %0 in(%1) : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op) + transform.yield + } +} +// CHECK: #[[MAP:.*]] = affine_map<(d0) -> (-d0 + 4, 16)> +// CHECK: func.func @fuse_pack_consumer_if_single_iteration( +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[PACK_INIT:.*]] = tensor.empty() : tensor<1x4x16x1xf32> +// CHECK-DAG: %[[ELEM_INIT:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK-DAG: %[[PAD_VAL:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %{{.*}}:2 = scf.forall (%[[IV:.*]]) = (0) to (4) step (16) +// CHECK-SAME: shared_outs(%[[ELEM_OUT_ARG:.*]] = %[[ELEM_INIT]], %[[PACK_OUT_ARG:.*]] = %[[PACK_INIT]]) +// CHECK-DAG: %[[SIZE:.+]] = affine.min #[[MAP]](%[[IV]]) +// CHECK-DAG: %[[ELEM_SRC:.*]] = tensor.extract_slice %[[ARG0]][%[[IV]], 0] [%[[SIZE]], 4] [1, 1] +// CHECK-DAG: %[[ELEM_DEST:.*]] = tensor.extract_slice %[[ELEM_OUT_ARG]][%[[IV]], 0] [%[[SIZE]], 4] [1, 1] +// CHECK: %[[ELEM:.*]] = linalg.exp +// CHECK-SAME: ins(%[[ELEM_SRC]] +// CHECK-SAME: outs(%[[ELEM_DEST]] +// CHECK-DAG: %[[TILED_PACK_DEST:.*]] = tensor.extract_slice %[[PACK_OUT_ARG]][%[[IV]], 0, 0, 0] [1, 4, 16, 1] [1, 1, 1, 1] +// CHECK: %[[PACK:.*]] = linalg.pack %[[ELEM]] +// CHECK-SAME: padding_value(%[[PAD_VAL]] : f32) +// CHECK-SAME: outer_dims_perm = [0, 1] inner_dims_pos = [0, 1] inner_tiles = [16, 1] +// CHECK-SAME: into %[[TILED_PACK_DEST]] +// CHECK: scf.forall.in_parallel { +// CHECK: tensor.parallel_insert_slice %[[ELEM]] into %[[ELEM_OUT_ARG]][%[[IV]], 0] [%[[SIZE]], 4] [1, 1] +// CHECK: tensor.parallel_insert_slice %[[PACK]] into %[[PACK_OUT_ARG]][%[[IV]], 0, 0, 0] [1, 4, 16, 1] [1, 1, 1, 1] + +// ----- 
func.func @fuse_perfect_tiling_pack_consumer_with_outer_dims_perm(%arg0: tensor<64x32xf32>, %arg1: tensor<64x32xf32>, %arg2: tensor<2x64x16x1xf32>) -> tensor<2x64x16x1xf32> { %0 = scf.forall (%arg3) = (0) to (32) step (16) shared_outs(%arg4 = %arg1) -> (tensor<64x32xf32>) { From 796d5a89a12407fb0fdf74ea063259b6ca7333d9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Jul 2025 10:43:51 -0700 Subject: [PATCH 375/813] [ADT] Use a range-based for loop instead of llvm::for_each (NFC) (#149542) LLVM Coding Standards discourages llvm::for_each unless we already have a callable. --- llvm/include/llvm/ADT/CombinationGenerator.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/include/llvm/ADT/CombinationGenerator.h b/llvm/include/llvm/ADT/CombinationGenerator.h index 6100aa9812293..bbdbd9bfa1be3 100644 --- a/llvm/include/llvm/ADT/CombinationGenerator.h +++ b/llvm/include/llvm/ADT/CombinationGenerator.h @@ -118,10 +118,9 @@ class CombinationGenerator { : VariablesChoices(VariablesChoices_) { #ifndef NDEBUG assert(!VariablesChoices.empty() && "There should be some variables."); - llvm::for_each(VariablesChoices, [](ArrayRef VariableChoices) { + for (ArrayRef VariableChoices : VariablesChoices) assert(!VariableChoices.empty() && "There must always be some choice, at least a placeholder one."); - }); #endif } From b5e71d727b6624c160c9186b52d73bdb635770ed Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 18 Jul 2025 10:48:42 -0700 Subject: [PATCH 376/813] Add section type to support CFI jump table relaxation. For context see main pull request: #147424. 
Reviewers: MaskRay Reviewed By: MaskRay Pull Request: https://github.com/llvm/llvm-project/pull/149259 --- llvm/docs/Extensions.rst | 20 ++++++++++++++++++++ llvm/include/llvm/BinaryFormat/ELF.h | 1 + llvm/lib/MC/MCParser/ELFAsmParser.cpp | 4 +++- llvm/lib/MC/MCSectionELF.cpp | 4 +++- llvm/lib/Object/ELF.cpp | 1 + llvm/test/MC/AsmParser/llvm_section_types.s | 17 ++++++++++++++++- 6 files changed, 44 insertions(+), 3 deletions(-) diff --git a/llvm/docs/Extensions.rst b/llvm/docs/Extensions.rst index bad72c6ca8295..d8fb87b6998ad 100644 --- a/llvm/docs/Extensions.rst +++ b/llvm/docs/Extensions.rst @@ -581,6 +581,26 @@ This section stores pairs of (jump table address, number of entries). This information is useful for tools that need to statically reconstruct the control flow of executables. +``SHT_LLVM_CFI_JUMP_TABLE`` Section (CFI jump table) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +This section contains the instructions that make up a `CFI jump table`_. +It is expected to be ``SHF_ALLOC`` and may be laid out like a normal +section. The ``SHT_LLVM_CFI_JUMP_TABLE`` section type gives the linker +permission to modify the section in ways that would not normally be +permitted, in order to optimize calls via the jump table. + +Each ``sh_entsize`` sized slice of a section of this type containing +exactly one relocation may be considered to be a jump table entry +that branches to the target of the relocation. This allows the linker +to replace the jump table entry with the function body if it is small +enough, or if the function is the last function in the jump table. + +A section of this type does not have to be placed according to its +name. The linker may place the section in whichever output section it +sees fit (generally the section that would provide the best locality). + +.. 
_CFI jump table: https://clang.llvm.org/docs/ControlFlowIntegrityDesign.html#forward-edge-cfi-for-indirect-function-calls + CodeView-Dependent ------------------ diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 6bf2e177b5d40..e4f82ad96a084 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1159,6 +1159,7 @@ enum : unsigned { SHT_LLVM_OFFLOADING = 0x6fff4c0b, // LLVM device offloading data. SHT_LLVM_LTO = 0x6fff4c0c, // .llvm.lto for fat LTO. SHT_LLVM_JT_SIZES = 0x6fff4c0d, // LLVM jump tables sizes. + SHT_LLVM_CFI_JUMP_TABLE = 0x6fff4c0e, // LLVM CFI jump table. // Android's experimental support for SHT_RELR sections. // https://android.googlesource.com/platform/bionic/+/b7feec74547f84559a1467aca02708ff61346d2a/libc/include/elf.h#512 SHT_ANDROID_RELR = 0x6fffff00, // Relocation entries; only offsets. diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index ec8b40261a6ca..c7c3df330fc94 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -571,7 +571,7 @@ bool ELFAsmParser::parseSectionArguments(bool IsPush, SMLoc loc) { return TokError("expected end of directive"); } - if (Mergeable) + if (Mergeable || TypeName == "llvm_cfi_jump_table") if (parseMergeSize(Size)) return true; if (Flags & ELF::SHF_LINK_ORDER) @@ -637,6 +637,8 @@ bool ELFAsmParser::parseSectionArguments(bool IsPush, SMLoc loc) { Type = ELF::SHT_LLVM_LTO; else if (TypeName == "llvm_jt_sizes") Type = ELF::SHT_LLVM_JT_SIZES; + else if (TypeName == "llvm_cfi_jump_table") + Type = ELF::SHT_LLVM_CFI_JUMP_TABLE; else if (TypeName.getAsInteger(0, Type)) return TokError("unknown section type"); } diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp index cc7cdf2fe4d1a..299fe40706e3a 100644 --- a/llvm/lib/MC/MCSectionELF.cpp +++ b/llvm/lib/MC/MCSectionELF.cpp @@ -176,11 +176,13 @@ void 
MCSectionELF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T, OS << "llvm_lto"; else if (Type == ELF::SHT_LLVM_JT_SIZES) OS << "llvm_jt_sizes"; + else if (Type == ELF::SHT_LLVM_CFI_JUMP_TABLE) + OS << "llvm_cfi_jump_table"; else OS << "0x" << Twine::utohexstr(Type); if (EntrySize) { - assert(Flags & ELF::SHF_MERGE); + assert((Flags & ELF::SHF_MERGE) || Type == ELF::SHT_LLVM_CFI_JUMP_TABLE); OS << "," << EntrySize; } diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp index af073f6a1a917..788c6020a7f99 100644 --- a/llvm/lib/Object/ELF.cpp +++ b/llvm/lib/Object/ELF.cpp @@ -321,6 +321,7 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) { STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_OFFLOADING); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_LTO); STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_JT_SIZES) + STRINGIFY_ENUM_CASE(ELF, SHT_LLVM_CFI_JUMP_TABLE) STRINGIFY_ENUM_CASE(ELF, SHT_GNU_SFRAME); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_ATTRIBUTES); STRINGIFY_ENUM_CASE(ELF, SHT_GNU_HASH); diff --git a/llvm/test/MC/AsmParser/llvm_section_types.s b/llvm/test/MC/AsmParser/llvm_section_types.s index 147b1499d2b88..83e5db0256647 100644 --- a/llvm/test/MC/AsmParser/llvm_section_types.s +++ b/llvm/test/MC/AsmParser/llvm_section_types.s @@ -1,22 +1,34 @@ -## Verify that LLVM-specific section types are correctly inferred from assembly input. +## Verify that LLVM-specific section types are correctly inferred from assembly input and printed. 
+# RUN: llvm-mc -triple i386-pc-linux %s | FileCheck --check-prefix=ASM %s # RUN: llvm-mc -triple i386-pc-linux -filetype=obj -o %t %s # RUN: llvm-readobj -S %t | FileCheck %s +# ASM: .section .section1,"",@llvm_bb_addr_map .section .section1,"",@llvm_bb_addr_map .byte 1 +# ASM: .section .section2,"",@llvm_call_graph_profile .section .section2,"",@llvm_call_graph_profile .byte 1 +# ASM: .section .section3,"",@llvm_odrtab .section .section3,"",@llvm_odrtab .byte 1 +# ASM: .section .section4,"",@llvm_linker_options .section .section4,"",@llvm_linker_options .byte 1 +# ASM: .section .section5,"",@llvm_sympart .section .section5,"",@llvm_sympart .byte 1 +# ASM: .section .section6,"",@llvm_dependent_libraries .section .section6,"",@llvm_dependent_libraries .byte 1 +# ASM: .section .section7,"",@llvm_offloading .section .section7,"",@llvm_offloading .byte 1 +# ASM: .section .section8,"",@llvm_lto .section .section8,"",@llvm_lto .byte 1 +# ASM: .section .section9,"",@llvm_cfi_jump_table,1 +.section .section9,"",@llvm_cfi_jump_table,1 +.byte 1 # CHECK: Name: .section1 # CHECK-NEXT: Type: SHT_LLVM_BB_ADDR_MAP @@ -34,3 +46,6 @@ # CHECK-NEXT: Type: SHT_LLVM_OFFLOADING # CHECK: Name: .section8 # CHECK-NEXT: Type: SHT_LLVM_LTO +# CHECK: Name: .section9 +# CHECK-NEXT: Type: SHT_LLVM_CFI_JUMP_TABLE +# CHECK: EntrySize: 1 From f6641e2f233b809958e4f558f5ad2514bc812cb8 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Jul 2025 11:04:18 -0700 Subject: [PATCH 377/813] [RISCV][IA] Factor out code for extracting operands from mem insts [nfc] (#149344) We're going to end up repeating the operand extraction four times once all of the routines have been updated to support both plain load/store and vp.load/vp.store. I plan to add masked.load/masked.store in the near future, and we'd need to add that to each of the four cases. Instead, factor out a single copy of the operand normalization. 
--- .../Target/RISCV/RISCVInterleavedAccess.cpp | 138 +++++++----------- 1 file changed, 56 insertions(+), 82 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp index 38cc0ce00a352..dd68a5556cdb5 100644 --- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp +++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp @@ -102,6 +102,56 @@ static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) { return false; } +/// Do the common operand retrieval and validition required by the +/// routines below. +static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy, + Instruction *I, Value *&Ptr, Value *&Mask, + Value *&VL, Align &Alignment) { + + IRBuilder<> Builder(I); + const DataLayout &DL = I->getDataLayout(); + ElementCount EC = VTy->getElementCount(); + if (auto *LI = dyn_cast(I)) { + assert(LI->isSimple()); + Ptr = LI->getPointerOperand(); + Alignment = LI->getAlign(); + assert(!Mask && "Unexpected mask on a load"); + Mask = Builder.getAllOnesMask(EC); + VL = isa(VTy) ? Builder.CreateElementCount(XLenTy, EC) + : Constant::getAllOnesValue(XLenTy); + return true; + } + if (auto *SI = dyn_cast(I)) { + assert(SI->isSimple()); + Ptr = SI->getPointerOperand(); + Alignment = SI->getAlign(); + assert(!Mask && "Unexpected mask on a store"); + Mask = Builder.getAllOnesMask(EC); + VL = isa(VTy) ? 
Builder.CreateElementCount(XLenTy, EC) + : Constant::getAllOnesValue(XLenTy); + return true; + } + auto *VPLdSt = cast(I); + assert((VPLdSt->getIntrinsicID() == Intrinsic::vp_load || + VPLdSt->getIntrinsicID() == Intrinsic::vp_store) && + "Unexpected intrinsic"); + Ptr = VPLdSt->getMemoryPointerParam(); + Alignment = VPLdSt->getPointerAlignment().value_or( + DL.getABITypeAlign(VTy->getElementType())); + + assert(Mask && "vp.load and vp.store needs a mask!"); + + Value *WideEVL = VPLdSt->getVectorLengthParam(); + // Conservatively check if EVL is a multiple of factor, otherwise some + // (trailing) elements might be lost after the transformation. + if (!isMultipleOfN(WideEVL, I->getDataLayout(), Factor)) + return false; + + auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); + VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); + return true; +} + /// Lower an interleaved load into a vlsegN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): @@ -127,32 +177,8 @@ bool RISCVTargetLowering::lowerInterleavedLoad( Value *Ptr, *VL; Align Alignment; - if (auto *LI = dyn_cast(Load)) { - assert(LI->isSimple()); - Ptr = LI->getPointerOperand(); - Alignment = LI->getAlign(); - assert(!Mask && "Unexpected mask on a load\n"); - Mask = Builder.getAllOnesMask(VTy->getElementCount()); - VL = Builder.CreateElementCount(XLenTy, VTy->getElementCount()); - } else { - auto *VPLoad = cast(Load); - assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load && - "Unexpected intrinsic"); - Ptr = VPLoad->getMemoryPointerParam(); - Alignment = VPLoad->getPointerAlignment().value_or( - DL.getABITypeAlign(VTy->getElementType())); - - assert(Mask && "vp.load needs a mask!"); - - Value *WideEVL = VPLoad->getVectorLengthParam(); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. 
- if (!isMultipleOfN(WideEVL, DL, Factor)) - return false; - - auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); - VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); - } + if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) + return false; Type *PtrTy = Ptr->getType(); unsigned AS = PtrTy->getPointerAddressSpace(); @@ -296,34 +322,8 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( Value *Ptr, *VL; Align Alignment; - if (auto *LI = dyn_cast(Load)) { - assert(LI->isSimple()); - Ptr = LI->getPointerOperand(); - Alignment = LI->getAlign(); - assert(!Mask && "Unexpected mask on a load\n"); - Mask = Builder.getAllOnesMask(ResVTy->getElementCount()); - VL = isa(ResVTy) - ? Builder.CreateElementCount(XLenTy, ResVTy->getElementCount()) - : Constant::getAllOnesValue(XLenTy); - } else { - auto *VPLoad = cast(Load); - assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load && - "Unexpected intrinsic"); - Ptr = VPLoad->getMemoryPointerParam(); - Alignment = VPLoad->getPointerAlignment().value_or( - DL.getABITypeAlign(ResVTy->getElementType())); - - assert(Mask && "vp.load needs a mask!"); - - Value *WideEVL = VPLoad->getVectorLengthParam(); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. 
- if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor)) - return false; - - auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); - VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); - } + if (!getMemOperands(Factor, ResVTy, XLenTy, Load, Ptr, Mask, VL, Alignment)) + return false; Type *PtrTy = Ptr->getType(); unsigned AS = PtrTy->getPointerAddressSpace(); @@ -385,34 +385,8 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( Value *Ptr, *VL; Align Alignment; - if (auto *SI = dyn_cast(Store)) { - assert(SI->isSimple()); - Ptr = SI->getPointerOperand(); - Alignment = SI->getAlign(); - assert(!Mask && "Unexpected mask on a store"); - Mask = Builder.getAllOnesMask(InVTy->getElementCount()); - VL = isa(InVTy) - ? Builder.CreateElementCount(XLenTy, InVTy->getElementCount()) - : Constant::getAllOnesValue(XLenTy); - } else { - auto *VPStore = cast(Store); - assert(VPStore->getIntrinsicID() == Intrinsic::vp_store && - "Unexpected intrinsic"); - Ptr = VPStore->getMemoryPointerParam(); - Alignment = VPStore->getPointerAlignment().value_or( - DL.getABITypeAlign(InVTy->getElementType())); - - assert(Mask && "vp.store needs a mask!"); - - Value *WideEVL = VPStore->getVectorLengthParam(); - // Conservatively check if EVL is a multiple of factor, otherwise some - // (trailing) elements might be lost after the transformation. 
- if (!isMultipleOfN(WideEVL, DL, Factor)) - return false; - - auto *FactorC = ConstantInt::get(WideEVL->getType(), Factor); - VL = Builder.CreateZExt(Builder.CreateExactUDiv(WideEVL, FactorC), XLenTy); - } + if (!getMemOperands(Factor, InVTy, XLenTy, Store, Ptr, Mask, VL, Alignment)) + return false; Type *PtrTy = Ptr->getType(); unsigned AS = Ptr->getType()->getPointerAddressSpace(); if (!isLegalInterleavedAccessType(InVTy, Factor, Alignment, AS, DL)) From 4bf4e87576688c942b7b337f24fb098247dc4642 Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Fri, 18 Jul 2025 14:14:27 -0400 Subject: [PATCH 378/813] Static_cast std::size_t to build flang_rt in 32-bit. (#149529) --- flang-rt/lib/runtime/descriptor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp index b723acdd27bd5..e9301bd0307d2 100644 --- a/flang-rt/lib/runtime/descriptor.cpp +++ b/flang-rt/lib/runtime/descriptor.cpp @@ -85,7 +85,7 @@ RT_API_ATTRS void Descriptor::Establish(int characterKind, RT_API_ATTRS void Descriptor::Establish(const typeInfo::DerivedType &dt, void *p, int rank, const SubscriptValue *extent, ISO::CFI_attribute_t attribute) { - std::size_t elementBytes{dt.sizeInBytes()}; + auto elementBytes{static_cast(dt.sizeInBytes())}; ISO::EstablishDescriptor( &raw_, p, attribute, CFI_type_struct, elementBytes, rank, extent); if (elementBytes == 0) { From 10b0dee97dd7e5a122116f7ccb26a19b081db9fd Mon Sep 17 00:00:00 2001 From: Tobias Decking Date: Fri, 18 Jul 2025 20:14:34 +0200 Subject: [PATCH 379/813] [X86] Ensure that bit reversals of byte vectors are properly lowered on pure GFNI targets (#148304) Fixes #148238. When GFNI is present, custom bit reversal lowerings for scalar integers become active. They work by swapping the bytes in the scalar value and then reversing bits in a vector of bytes. 
However, the custom bit reversal lowering for a vector of bytes is disabled if GFNI is present in isolation, resulting messed up code. --------- Co-authored-by: Simon Pilgrim --- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +- llvm/test/CodeGen/X86/vector-bitreverse.ll | 388 ++++++++++++++++----- 2 files changed, 313 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d91ea1ea1bb1b..62811244dcfee 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1323,11 +1323,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } - if (Subtarget.hasGFNI()) { + if (!Subtarget.useSoftFloat() && Subtarget.hasGFNI()) { setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); setOperationAction(ISD::BITREVERSE, MVT::i16, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); setOperationAction(ISD::BITREVERSE, MVT::i64, Custom); + + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { + setOperationAction(ISD::BITREVERSE, VT, Custom); + } } if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { @@ -32694,7 +32698,8 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, if (Subtarget.hasXOP() && !VT.is512BitVector()) return LowerBITREVERSE_XOP(Op, DAG); - assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); + assert((Subtarget.hasSSSE3() || Subtarget.hasGFNI()) && + "SSSE3 or GFNI required for BITREVERSE"); SDValue In = Op.getOperand(0); SDLoc DL(Op); diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index 5dcf19013f0b7..834dfd63432b0 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -8,7 +8,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s 
--check-prefixes=ALL,AVX,AVX512,AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE,GFNISSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE,GFNISSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512F @@ -492,11 +493,20 @@ define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind { ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; GFNISSE-LABEL: test_bitreverse_v8i16: -; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; GFNISSE-NEXT: retq +; GFNISSE2-LABEL: test_bitreverse_v8i16: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: movdqa %xmm0, %xmm1 +; GFNISSE2-NEXT: psrlw $8, %xmm1 +; GFNISSE2-NEXT: psllw $8, %xmm0 +; GFNISSE2-NEXT: por %xmm1, %xmm0 +; GFNISSE2-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v8i16: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNISSSE3-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; GFNISSSE3-NEXT: retq ; ; GFNIAVX-LABEL: 
test_bitreverse_v8i16: ; GFNIAVX: # %bb.0: @@ -605,11 +615,25 @@ define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind { ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; GFNISSE-LABEL: test_bitreverse_v4i32: -; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; GFNISSE-NEXT: retq +; GFNISSE2-LABEL: test_bitreverse_v4i32: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm1, %xmm1 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm2 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm2, %xmm0 +; GFNISSE2-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v4i32: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNISSSE3-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; GFNISSSE3-NEXT: retq ; ; GFNIAVX-LABEL: test_bitreverse_v4i32: ; GFNIAVX: # %bb.0: @@ -720,11 +744,27 @@ define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind { ; XOP-NEXT: vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0 ; XOP-NEXT: retq ; -; GFNISSE-LABEL: test_bitreverse_v2i64: -; GFNISSE: # %bb.0: -; GFNISSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; 
GFNISSE-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; GFNISSE-NEXT: retq +; GFNISSE2-LABEL: test_bitreverse_v2i64: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm1, %xmm1 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm2 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm2, %xmm0 +; GFNISSE2-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v2i64: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNISSSE3-NEXT: gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; GFNISSSE3-NEXT: retq ; ; GFNIAVX-LABEL: test_bitreverse_v2i64: ; GFNIAVX: # %bb.0: @@ -1042,15 +1082,30 @@ define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; GFNISSE-LABEL: test_bitreverse_v16i16: -; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; GFNISSE-NEXT: pshufb %xmm2, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 -; GFNISSE-NEXT: pshufb %xmm2, %xmm1 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 -; GFNISSE-NEXT: retq +; 
GFNISSE2-LABEL: test_bitreverse_v16i16: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: movdqa %xmm0, %xmm2 +; GFNISSE2-NEXT: psrlw $8, %xmm2 +; GFNISSE2-NEXT: psllw $8, %xmm0 +; GFNISSE2-NEXT: por %xmm2, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm2, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm3 +; GFNISSE2-NEXT: psrlw $8, %xmm3 +; GFNISSE2-NEXT: psllw $8, %xmm1 +; GFNISSE2-NEXT: por %xmm3, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm2, %xmm1 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v16i16: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 +; GFNISSSE3-NEXT: retq ; ; GFNIAVX1-LABEL: test_bitreverse_v16i16: ; GFNIAVX1: # %bb.0: @@ -1241,15 +1296,39 @@ define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; GFNISSE-LABEL: test_bitreverse_v8i32: -; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; GFNISSE-NEXT: pshufb %xmm2, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 -; GFNISSE-NEXT: pshufb %xmm2, %xmm1 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 -; GFNISSE-NEXT: retq +; GFNISSE2-LABEL: test_bitreverse_v8i32: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm2, %xmm2 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm3 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 
= xmm3[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm3, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm4 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm4, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v8i32: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 +; GFNISSSE3-NEXT: retq ; ; GFNIAVX1-LABEL: test_bitreverse_v8i32: ; GFNIAVX1: # %bb.0: @@ -1444,15 +1523,43 @@ define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq ; -; 
GFNISSE-LABEL: test_bitreverse_v4i64: -; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; GFNISSE-NEXT: pshufb %xmm2, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 -; GFNISSE-NEXT: pshufb %xmm2, %xmm1 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 -; GFNISSE-NEXT: retq +; GFNISSE2-LABEL: test_bitreverse_v4i64: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm2, %xmm2 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm3 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm3, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm4 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm4, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v4i64: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm2, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm3, %xmm1 +; GFNISSSE3-NEXT: retq ; ; GFNIAVX1-LABEL: test_bitreverse_v4i64: ; GFNIAVX1: # %bb.0: @@ -2035,19 +2142,44 @@ define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; XOPAVX2-NEXT: retq ; -; GFNISSE-LABEL: test_bitreverse_v32i16: -; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; GFNISSE-NEXT: pshufb %xmm4, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 -; GFNISSE-NEXT: pshufb %xmm4, %xmm1 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 -; GFNISSE-NEXT: pshufb %xmm4, %xmm2 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 -; GFNISSE-NEXT: pshufb %xmm4, %xmm3 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 -; GFNISSE-NEXT: retq +; GFNISSE2-LABEL: test_bitreverse_v32i16: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: movdqa %xmm0, %xmm4 +; GFNISSE2-NEXT: psrlw $8, %xmm4 +; GFNISSE2-NEXT: psllw $8, %xmm0 +; GFNISSE2-NEXT: por %xmm4, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSE2-NEXT: 
gf2p8affineqb $0, %xmm4, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm5 +; GFNISSE2-NEXT: psrlw $8, %xmm5 +; GFNISSE2-NEXT: psllw $8, %xmm1 +; GFNISSE2-NEXT: por %xmm5, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm1 +; GFNISSE2-NEXT: movdqa %xmm2, %xmm5 +; GFNISSE2-NEXT: psrlw $8, %xmm5 +; GFNISSE2-NEXT: psllw $8, %xmm2 +; GFNISSE2-NEXT: por %xmm5, %xmm2 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm2 +; GFNISSE2-NEXT: movdqa %xmm3, %xmm5 +; GFNISSE2-NEXT: psrlw $8, %xmm5 +; GFNISSE2-NEXT: psllw $8, %xmm3 +; GFNISSE2-NEXT: por %xmm5, %xmm3 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm4, %xmm3 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v32i16: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 +; GFNISSSE3-NEXT: retq ; ; GFNIAVX1-LABEL: test_bitreverse_v32i16: ; GFNIAVX1: # %bb.0: @@ -2393,19 +2525,61 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; XOPAVX2-NEXT: retq ; -; GFNISSE-LABEL: test_bitreverse_v16i32: -; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; GFNISSE-NEXT: pshufb %xmm4, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 -; GFNISSE-NEXT: pshufb %xmm4, %xmm1 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 -; GFNISSE-NEXT: pshufb %xmm4, %xmm2 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 -; GFNISSE-NEXT: pshufb %xmm4, %xmm3 -; 
GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 -; GFNISSE-NEXT: retq +; GFNISSE2-LABEL: test_bitreverse_v16i32: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm4, %xmm4 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm5 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm5, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSE2-NEXT: movdqa %xmm2, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = 
xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm2 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 +; GFNISSE2-NEXT: movdqa %xmm3, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm3 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v16i32: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 +; GFNISSSE3-NEXT: retq ; ; GFNIAVX1-LABEL: test_bitreverse_v16i32: ; GFNIAVX1: # %bb.0: @@ -2759,19 +2933,69 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) 
nounwind { ; XOPAVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; XOPAVX2-NEXT: retq ; -; GFNISSE-LABEL: test_bitreverse_v8i64: -; GFNISSE: # %bb.0: -; GFNISSE-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; GFNISSE-NEXT: pshufb %xmm4, %xmm0 -; GFNISSE-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 -; GFNISSE-NEXT: pshufb %xmm4, %xmm1 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 -; GFNISSE-NEXT: pshufb %xmm4, %xmm2 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 -; GFNISSE-NEXT: pshufb %xmm4, %xmm3 -; GFNISSE-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 -; GFNISSE-NEXT: retq +; GFNISSE2-LABEL: test_bitreverse_v8i64: +; GFNISSE2: # %bb.0: +; GFNISSE2-NEXT: pxor %xmm4, %xmm4 +; GFNISSE2-NEXT: movdqa %xmm0, %xmm5 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm5, %xmm0 +; GFNISSE2-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSE2-NEXT: movdqa %xmm1, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; 
GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm1 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSE2-NEXT: movdqa %xmm2, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm2 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 +; GFNISSE2-NEXT: movdqa %xmm3, %xmm6 +; GFNISSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; GFNISSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; 
GFNISSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] +; GFNISSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7] +; GFNISSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; GFNISSE2-NEXT: packuswb %xmm6, %xmm3 +; GFNISSE2-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 +; GFNISSE2-NEXT: retq +; +; GFNISSSE3-LABEL: test_bitreverse_v8i64: +; GFNISSSE3: # %bb.0: +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm0 +; GFNISSSE3-NEXT: movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm0 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm1 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm1 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm2 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm2 +; GFNISSSE3-NEXT: pshufb %xmm4, %xmm3 +; GFNISSSE3-NEXT: gf2p8affineqb $0, %xmm5, %xmm3 +; GFNISSSE3-NEXT: retq ; ; GFNIAVX1-LABEL: test_bitreverse_v8i64: ; GFNIAVX1: # %bb.0: From 6acc6991f83409be3ca6315edf8c7f381ebe4d40 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Fri, 18 Jul 2025 13:34:15 -0500 Subject: [PATCH 380/813] [STLForwardCompat] Improve category handling in transformOptional (#149539) The old version would prefer the "const &" overload over the "&&" one unless the former was not allowed in the given situation. In particular, if the function passed was "[](auto &&)" the argument would be "const &" even if the value passed to transformOptional was an rvalue reference. This version improves the handling of expression categories, and the lambda argument category will reflect the argument category in the above scenario. 
--- llvm/include/llvm/ADT/STLForwardCompat.h | 22 ++++++------------ llvm/unittests/ADT/STLForwardCompatTest.cpp | 25 +++++++++++++++++++++ 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h index 7bd2c8705f393..81b9a685e11d2 100644 --- a/llvm/include/llvm/ADT/STLForwardCompat.h +++ b/llvm/include/llvm/ADT/STLForwardCompat.h @@ -55,21 +55,13 @@ using type_identity_t // NOLINT(readability-identifier-naming) // TODO: Remove this in favor of std::optional::transform once we switch to // C++23. -template -auto transformOptional(const std::optional &O, const Function &F) - -> std::optional { - if (O) - return F(*O); - return std::nullopt; -} - -// TODO: Remove this in favor of std::optional::transform once we switch to -// C++23. -template -auto transformOptional(std::optional &&O, const Function &F) - -> std::optional { - if (O) - return F(*std::move(O)); +template ::value_type> +std::optional> +transformOptional(Optional &&O, Function &&F) { + if (O) { + return F(*std::forward(O)); + } return std::nullopt; } diff --git a/llvm/unittests/ADT/STLForwardCompatTest.cpp b/llvm/unittests/ADT/STLForwardCompatTest.cpp index e3d500aa7b55a..4a8f53cf72f94 100644 --- a/llvm/unittests/ADT/STLForwardCompatTest.cpp +++ b/llvm/unittests/ADT/STLForwardCompatTest.cpp @@ -10,6 +10,11 @@ #include "CountCopyAndMove.h" #include "gtest/gtest.h" +#include +#include +#include +#include + namespace { template @@ -142,6 +147,26 @@ TEST(TransformTest, MoveTransformLlvm) { EXPECT_EQ(0, CountCopyAndMove::Destructions); } +TEST(TransformTest, TransformCategory) { + struct StructA { + int x; + }; + struct StructB : StructA { + StructB(StructA &&A) : StructA(std::move(A)) {} + }; + + std::optional A{StructA{}}; + llvm::transformOptional(A, [](auto &&s) { + EXPECT_FALSE(std::is_rvalue_reference_v); + return StructB{std::move(s)}; + }); + + llvm::transformOptional(std::move(A), [](auto &&s) { + 
EXPECT_TRUE(std::is_rvalue_reference_v); + return StructB{std::move(s)}; + }); +} + TEST(TransformTest, ToUnderlying) { enum E { A1 = 0, B1 = -1 }; static_assert(llvm::to_underlying(A1) == 0); From 13f7786f72d13a84dfc3d49d87a70e6a05f21fd4 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 18 Jul 2025 11:35:09 -0700 Subject: [PATCH 381/813] [libc] Remove trivial .h.def files (#149466) Remove all the .h.def files that already express nothing whatsoever not already expressed in YAML. Clean up a few YAML files without materially changing any generated header output. Many more .h.def files remain that need a bit of conversion in YAML to express macro requirements and such. --- libc/include/dirent.h.def | 16 ------------- libc/include/dirent.yaml | 18 +++++++-------- libc/include/search.h.def | 18 --------------- libc/include/search.yaml | 24 ++++++++++---------- libc/include/setjmp.h.def | 16 ------------- libc/include/setjmp.yaml | 10 ++++----- libc/include/spawn.h.def | 16 ------------- libc/include/spawn.yaml | 18 +++++++-------- libc/include/string.h.def | 16 ------------- libc/include/string.yaml | 17 +++++++------- libc/include/strings.h.def | 16 ------------- libc/include/strings.yaml | 31 +++++++++---------------- libc/include/sys/sendfile.h.def | 16 ------------- libc/include/sys/sendfile.yaml | 12 ++-------- libc/include/sys/statvfs.h.def | 16 ------------- libc/include/sys/statvfs.yaml | 10 ++++----- libc/include/sys/types.yaml | 40 +++++++++++++++------------------ libc/include/sys/uio.h.def | 16 ------------- libc/include/sys/uio.yaml | 10 ++++----- libc/include/sys/utsname.h.def | 16 ------------- libc/include/sys/utsname.yaml | 8 +++---- libc/include/threads.h.def | 16 ------------- libc/include/threads.yaml | 3 ++- libc/include/uchar.h.def | 16 ------------- libc/include/uchar.yaml | 5 ----- 25 files changed, 85 insertions(+), 315 deletions(-) delete mode 100644 libc/include/dirent.h.def delete mode 100644 libc/include/search.h.def delete mode 
100644 libc/include/setjmp.h.def delete mode 100644 libc/include/spawn.h.def delete mode 100644 libc/include/string.h.def delete mode 100644 libc/include/strings.h.def delete mode 100644 libc/include/sys/sendfile.h.def delete mode 100644 libc/include/sys/statvfs.h.def delete mode 100644 libc/include/sys/uio.h.def delete mode 100644 libc/include/sys/utsname.h.def delete mode 100644 libc/include/threads.h.def delete mode 100644 libc/include/uchar.h.def diff --git a/libc/include/dirent.h.def b/libc/include/dirent.h.def deleted file mode 100644 index 6786578fbd067..0000000000000 --- a/libc/include/dirent.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- POSIX header dirent.h ---------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_DIRENT_H -#define LLVM_LIBC_DIRENT_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_DIRENT_H diff --git a/libc/include/dirent.yaml b/libc/include/dirent.yaml index 3fc522fda80e4..66570bca6c495 100644 --- a/libc/include/dirent.yaml +++ b/libc/include/dirent.yaml @@ -1,47 +1,45 @@ header: dirent.h -header_template: dirent.h.def -macros: [] +standards: + - posix types: - type_name: struct_dirent - type_name: DIR - type_name: ino_t -enums: [] -objects: [] functions: - name: alphasort standards: - - POSIX + - posix return_type: int arguments: - type: const struct dirent ** - type: const struct dirent ** - name: closedir standards: - - POSIX + - posix return_type: int arguments: - type: DIR * - name: dirfd standards: - - POSIX + - posix return_type: int arguments: - type: DIR * - name: fdopendir standards: - - POSIX + - posix return_type: DIR * arguments: - type: int - name: opendir standards: - - POSIX + - posix 
return_type: DIR * arguments: - type: const char * - name: readdir standards: - - POSIX + - posix return_type: struct dirent * arguments: - type: DIR * diff --git a/libc/include/search.h.def b/libc/include/search.h.def deleted file mode 100644 index 6301ba7b656ce..0000000000000 --- a/libc/include/search.h.def +++ /dev/null @@ -1,18 +0,0 @@ -//===-- POSIX header search.h ---------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SEARCH_H -#define LLVM_LIBC_SEARCH_H - -#include "__llvm-libc-common.h" -#define __need_size_t -#include - -%%public_api() - -#endif // LLVM_LIBC_SEARCH_H diff --git a/libc/include/search.yaml b/libc/include/search.yaml index e0247afad2cd6..8a3a0c50af60f 100644 --- a/libc/include/search.yaml +++ b/libc/include/search.yaml @@ -1,6 +1,6 @@ header: search.h -header_template: search.h.def -macros: [] +standards: + - posix types: - type_name: ACTION - type_name: ENTRY @@ -12,35 +12,35 @@ objects: [] functions: - name: hcreate standards: - - POSIX + - posix return_type: int arguments: - type: size_t - name: hcreate_r - standards: GNUExtensions + standards: gnu return_type: int arguments: - type: size_t - type: struct hsearch_data * - name: hdestroy - standards: GNUExtensions + standards: gnu return_type: void arguments: [] - name: hdestroy_r standards: - - POSIX + - posix return_type: void arguments: - type: struct hsearch_data * - name: hsearch standards: - - POSIX + - posix return_type: ENTRY * arguments: - type: ENTRY - type: ACTION - name: hsearch_r - standards: GNUExtensions + standards: gnu return_type: int arguments: - type: ENTRY @@ -49,20 +49,20 @@ functions: - type: struct hsearch_data * - name: insque standards: - - POSIX + - posix 
return_type: void arguments: - type: void * - type: void * - name: remque standards: - - POSIX + - posix return_type: void arguments: - type: void * - name: lfind standards: - - POSIX + - posix return_type: void * arguments: - type: const void * @@ -72,7 +72,7 @@ functions: - type: __search_compare_t - name: lsearch standards: - - POSIX + - posix return_type: void * arguments: - type: const void * diff --git a/libc/include/setjmp.h.def b/libc/include/setjmp.h.def deleted file mode 100644 index 670bc1ac0fe24..0000000000000 --- a/libc/include/setjmp.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- C standard library header setjmp.h --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SETJMP_H -#define LLVM_LIBC_SETJMP_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_SETJMP_H diff --git a/libc/include/setjmp.yaml b/libc/include/setjmp.yaml index 00049e58c86c8..55e03470e33ca 100644 --- a/libc/include/setjmp.yaml +++ b/libc/include/setjmp.yaml @@ -1,10 +1,8 @@ header: setjmp.h -header_template: setjmp.h.def -macros: [] +standards: + - stdc types: - type_name: jmp_buf -enums: [] -objects: [] functions: - name: longjmp standards: @@ -23,7 +21,7 @@ functions: - type: jmp_buf - name: sigsetjmp standards: - - POSIX + - posix return_type: int attributes: - _Returns_twice @@ -32,7 +30,7 @@ functions: - type: int - name: siglongjmp standards: - - POSIX + - posix return_type: _Noreturn void arguments: - type: sigjmp_buf diff --git a/libc/include/spawn.h.def b/libc/include/spawn.h.def deleted file mode 100644 index a8d7015852868..0000000000000 --- a/libc/include/spawn.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- POSIX header spawn.h 
----------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SPAWN_H -#define LLVM_LIBC_SPAWN_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_SPAWN_H diff --git a/libc/include/spawn.yaml b/libc/include/spawn.yaml index c763cc76fd094..ef39f66d080f6 100644 --- a/libc/include/spawn.yaml +++ b/libc/include/spawn.yaml @@ -1,17 +1,15 @@ header: spawn.h -header_template: spawn.h.def -macros: [] +standards: + - posix types: - type_name: posix_spawn_file_actions_t - type_name: posix_spawnattr_t - type_name: pid_t - type_name: mode_t -enums: [] -objects: [] functions: - name: posix_spawn standards: - - POSIX + - posix return_type: int arguments: - type: pid_t *__restrict @@ -22,14 +20,14 @@ functions: - type: char * const * __restrict - name: posix_spawn_file_actions_addclose standards: - - POSIX + - posix return_type: int arguments: - type: posix_spawn_file_actions_t * - type: int - name: posix_spawn_file_actions_adddup2 standards: - - POSIX + - posix return_type: int arguments: - type: posix_spawn_file_actions_t * @@ -37,7 +35,7 @@ functions: - type: int - name: posix_spawn_file_actions_addopen standards: - - POSIX + - posix return_type: int arguments: - type: posix_spawn_file_actions_t *__restrict @@ -47,13 +45,13 @@ functions: - type: mode_t - name: posix_spawn_file_actions_destroy standards: - - POSIX + - posix return_type: int arguments: - type: posix_spawn_file_actions_t * - name: posix_spawn_file_actions_init standards: - - POSIX + - posix return_type: int arguments: - type: posix_spawn_file_actions_t * diff --git a/libc/include/string.h.def b/libc/include/string.h.def deleted file mode 100644 index 339d005e43a4f..0000000000000 --- 
a/libc/include/string.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- C standard library header string.h --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_STRING_H -#define LLVM_LIBC_STRING_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_STRING_H diff --git a/libc/include/string.yaml b/libc/include/string.yaml index 736deceb453de..0bf297ee747a4 100644 --- a/libc/include/string.yaml +++ b/libc/include/string.yaml @@ -1,5 +1,6 @@ header: string.h -header_template: string.h.def +standards: + - stdc macros: - macro_name: NULL macro_header: null-macro.h @@ -11,7 +12,7 @@ objects: [] functions: - name: memccpy standards: - - POSIX + - posix return_type: void * arguments: - type: void *__restrict @@ -61,7 +62,7 @@ functions: - type: size_t - name: mempcpy standards: - - POSIX + - posix return_type: void * arguments: - type: void *__restrict @@ -93,14 +94,14 @@ functions: - type: size_t - name: stpcpy standards: - - POSIX + - posix return_type: char * arguments: - type: char *__restrict - type: const char *__restrict - name: stpncpy standards: - - POSIX + - posix return_type: char * arguments: - type: char *__restrict @@ -243,7 +244,7 @@ functions: - type: size_t - name: strnlen standards: - - POSIX + - posix return_type: size_t arguments: - type: const char * @@ -271,7 +272,7 @@ functions: - type: const char *__restrict - name: strsignal standards: - - POSIX + - posix return_type: char * arguments: - type: int @@ -298,7 +299,7 @@ functions: - type: const char *__restrict - name: strtok_r standards: - - POSIX + - posix return_type: char * arguments: - type: char *__restrict diff --git a/libc/include/strings.h.def b/libc/include/strings.h.def 
deleted file mode 100644 index 9b016bf0bc50b..0000000000000 --- a/libc/include/strings.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- C standard library header strings.h -------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_STRINGS_H -#define LLVM_LIBC_STRINGS_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_STRINGS_H diff --git a/libc/include/strings.yaml b/libc/include/strings.yaml index 855800d9dbc3d..1e78f0e48aa59 100644 --- a/libc/include/strings.yaml +++ b/libc/include/strings.yaml @@ -1,15 +1,14 @@ header: strings.h -header_template: strings.h.def -macros: [] +standards: + - bsd + - posix types: - type_name: size_t - type_name: locale_t -enums: [] -objects: [] functions: - name: bcmp standards: - - llvm_libc_ext + - bsd return_type: int arguments: - type: const void * @@ -17,7 +16,7 @@ functions: - type: size_t - name: bcopy standards: - - llvm_libc_ext + - bsd return_type: void arguments: - type: const void * @@ -25,69 +24,61 @@ functions: - type: size_t - name: bzero standards: - - llvm_libc_ext + - bsd return_type: void arguments: - type: void * - type: size_t - name: ffs standards: - - POSIX + - posix return_type: int arguments: - type: int - name: ffsl standards: - - POSIX + - posix return_type: int arguments: - type: long - name: ffsll standards: - - POSIX + - posix return_type: int arguments: - type: long long - name: index standards: - - BSDExtensions + - bsd return_type: char * arguments: - type: const char * - type: int - name: rindex standards: - - BSDExtensions + - bsd return_type: char * arguments: - type: const char * - type: int - name: strcasecmp - standards: - - BSDExtensions return_type: int arguments: - type: const char 
* - type: const char * - name: strcasecmp_l - standards: - - BSDExtensions return_type: int arguments: - type: const char * - type: const char * - type: locale_t - name: strncasecmp - standards: - - BSDExtensions return_type: int arguments: - type: const char * - type: const char * - type: size_t - name: strncasecmp_l - standards: - - BSDExtensions return_type: int arguments: - type: const char * diff --git a/libc/include/sys/sendfile.h.def b/libc/include/sys/sendfile.h.def deleted file mode 100644 index d7f21f91f95ed..0000000000000 --- a/libc/include/sys/sendfile.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- Linux sys/sendfile.h ----------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SYS_SENDFILE_H -#define LLVM_LIBC_SYS_SENDFILE_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_SYS_SENDFILE_H diff --git a/libc/include/sys/sendfile.yaml b/libc/include/sys/sendfile.yaml index 259ab83dff54b..a845dab580483 100644 --- a/libc/include/sys/sendfile.yaml +++ b/libc/include/sys/sendfile.yaml @@ -1,16 +1,8 @@ header: sys/sendfile.h -header_template: sendfile.h.def -macros: [] -types: - - type_name: ssize_t - - type_name: size_t - - type_name: off_t -enums: [] -objects: [] +standards: + - linux functions: - name: sendfile - standards: - - GNUExtensions return_type: ssize_t arguments: - type: int diff --git a/libc/include/sys/statvfs.h.def b/libc/include/sys/statvfs.h.def deleted file mode 100644 index f23c9a3d5b1f9..0000000000000 --- a/libc/include/sys/statvfs.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- POSIX header statvfs.h --------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 
with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SYS_STATVFS_H -#define LLVM_LIBC_SYS_STATVFS_H - -#include <__llvm-libc-common.h> - -%%public_api() - -#endif // LLVM_LIBC_SYS_STATVFS_H diff --git a/libc/include/sys/statvfs.yaml b/libc/include/sys/statvfs.yaml index 8c1d254add37f..e083677beee89 100644 --- a/libc/include/sys/statvfs.yaml +++ b/libc/include/sys/statvfs.yaml @@ -1,23 +1,21 @@ header: sys/statvfs.h -header_template: statvfs.h.def -macros: [] +standards: + - posix types: - type_name: struct_statvfs - type_name: fsblkcnt_t - type_name: fsfilcnt_t -enums: [] -objects: [] functions: - name: fstatvfs standards: - - POSIX + - posix return_type: int arguments: - type: int - type: struct statvfs * - name: statvfs standards: - - POSIX + - posix return_type: int arguments: - type: const char *__restrict diff --git a/libc/include/sys/types.yaml b/libc/include/sys/types.yaml index 6fa0b448fcd38..a00429d3817e1 100644 --- a/libc/include/sys/types.yaml +++ b/libc/include/sys/types.yaml @@ -1,32 +1,28 @@ header: sys/types.h -header_template: types.h.def -standards: POSIX -macros: [] +standards: + - posix types: - - type_name: uid_t - - type_name: time_t - - type_name: pthread_t - - type_name: pthread_rwlock_t - - type_name: pthread_rwlockattr_t - - type_name: pthread_mutex_t - type_name: blkcnt_t - type_name: blksize_t - type_name: clockid_t - - type_name: ssize_t - - type_name: pthread_mutexattr_t - - type_name: ino_t - - type_name: pthread_once_t - - type_name: mode_t - type_name: dev_t - - type_name: pthread_attr_t - type_name: gid_t - - type_name: pid_t + - type_name: ino_t + - type_name: mode_t - type_name: nlink_t - - type_name: suseconds_t - type_name: off_t - - type_name: size_t - - type_name: pthread_key_t + - type_name: pid_t + - type_name: pthread_attr_t - 
type_name: pthread_condattr_t -enums: [] -objects: [] -functions: [] + - type_name: pthread_key_t + - type_name: pthread_mutex_t + - type_name: pthread_mutexattr_t + - type_name: pthread_once_t + - type_name: pthread_rwlock_t + - type_name: pthread_rwlockattr_t + - type_name: pthread_t + - type_name: size_t + - type_name: ssize_t + - type_name: suseconds_t + - type_name: time_t + - type_name: uid_t diff --git a/libc/include/sys/uio.h.def b/libc/include/sys/uio.h.def deleted file mode 100644 index 76496cb2310f7..0000000000000 --- a/libc/include/sys/uio.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- POSIX header uio.h ------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SYS_UIO_H -#define LLVM_LIBC_SYS_UIO_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_SYS_UIO_H diff --git a/libc/include/sys/uio.yaml b/libc/include/sys/uio.yaml index 6d3f336b2b520..929911e669386 100644 --- a/libc/include/sys/uio.yaml +++ b/libc/include/sys/uio.yaml @@ -1,15 +1,13 @@ header: sys/uio.h -header_template: uio.h.def -macros: [] +standards: + - posix types: - type_name: struct_iovec - type_name: ssize_t -enums: [] -objects: [] functions: - name: writev standards: - - POSIX + - posix return_type: ssize_t arguments: - type: int @@ -17,7 +15,7 @@ functions: - type: int - name: readv standards: - - POSIX + - posix return_type: ssize_t arguments: - type: int diff --git a/libc/include/sys/utsname.h.def b/libc/include/sys/utsname.h.def deleted file mode 100644 index 08dbbfc062453..0000000000000 --- a/libc/include/sys/utsname.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- Linux sys/utsname.h -----------------------------------------------===// -// -// Part of 
the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SYS_UTSNAME_H -#define LLVM_LIBC_SYS_UTSNAME_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_SYS_UTSNAME_H diff --git a/libc/include/sys/utsname.yaml b/libc/include/sys/utsname.yaml index 6c7cb71f9a34f..0f0e4cdb38952 100644 --- a/libc/include/sys/utsname.yaml +++ b/libc/include/sys/utsname.yaml @@ -1,14 +1,12 @@ header: sys/utsname.h -header_template: utsname.h.def -macros: [] +standards: + - posix types: - type_name: struct_utsname -enums: [] -objects: [] functions: - name: uname standards: - - POSIX + - posix return_type: int arguments: - type: struct utsname * diff --git a/libc/include/threads.h.def b/libc/include/threads.h.def deleted file mode 100644 index b114bea0ace34..0000000000000 --- a/libc/include/threads.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- C standard library header threads.h -------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_THREADS_H -#define LLVM_LIBC_THREADS_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_THREADS_H diff --git a/libc/include/threads.yaml b/libc/include/threads.yaml index 7014822f9251d..99b29f1815549 100644 --- a/libc/include/threads.yaml +++ b/libc/include/threads.yaml @@ -1,5 +1,6 @@ header: threads.h -header_template: threads.h.def +standards: + - stdc macros: - macro_name: ONCE_FLAG_INIT macro_value: '{0}' diff --git a/libc/include/uchar.h.def b/libc/include/uchar.h.def deleted file mode 100644 index 31b7fcb73ded6..0000000000000 --- a/libc/include/uchar.h.def +++ /dev/null @@ -1,16 +0,0 @@ -//===-- C standard library header uchar.h ---------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_UCHAR_H -#define LLVM_LIBC_UCHAR_H - -#include "__llvm-libc-common.h" - -%%public_api() - -#endif // LLVM_LIBC_UCHAR_H diff --git a/libc/include/uchar.yaml b/libc/include/uchar.yaml index 713919796762d..d0799e28ac9cb 100644 --- a/libc/include/uchar.yaml +++ b/libc/include/uchar.yaml @@ -1,14 +1,9 @@ header: uchar.h -header_template: uchar.h.def standards: - stdc -macros: [] types: - type_name: char32_t - type_name: char16_t - type_name: char8_t - type_name: mbstate_t - type_name: size_t -enums: [] -objects: [] -functions: [] From cfa918bec18c012a89a36b5a7ddceacb8e6c9ed7 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 18 Jul 2025 12:31:29 -0700 Subject: [PATCH 382/813] [AMDGPU] Select flat GVS atomics on gfx1250 (#149554) --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 3 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 23 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 6 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 20 +- .../Target/AMDGPU/AMDGPUInstructionSelector.h | 4 + llvm/lib/Target/AMDGPU/FLATInstructions.td | 60 +- .../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 6030 +++++++++++++++++ 7 files changed, 6120 insertions(+), 26 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 7b5d4077e85f3..2bfd56f9f3554 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -137,6 +137,9 @@ def gi_global_offset : def gi_global_saddr : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_global_saddr_glc : + GIComplexOperandMatcher, + GIComplexPatternEquiv; def gi_mubuf_scratch_offset : GIComplexOperandMatcher, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 
25672a52345cb..00c7f0eb6e9f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1968,6 +1968,29 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, return true; } +bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + return false; + + CPol = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, + SDValue &SAddr, SDValue &VOffset, + SDValue &Offset, + SDValue &CPol) const { + if (!SelectGlobalSAddr(N, Addr, SAddr, VOffset, Offset)) + return false; + + unsigned CPolVal = AMDGPU::CPol::GLC; + CPol = CurDAG->getTargetConstant(CPolVal, SDLoc(), MVT::i32); + return true; +} + static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) { if (auto *FI = dyn_cast(SAddr)) { SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 9967f46e085e4..acbab3d9e2d81 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -163,6 +163,12 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { SDValue &Offset) const; bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &VOffset, SDValue &Offset) const; + bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; + bool SelectGlobalSAddrGLC(SDNode *N, SDValue Addr, SDValue &SAddr, + SDValue &VOffset, SDValue &Offset, + SDValue &CPol) const; bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr, SDValue &Offset) const; bool checkFlatScratchSVSSwizzleBug(SDValue VAddr, SDValue SAddr, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 1a63c48e3666c..d161c035ac295 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5485,7 +5485,8 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const { // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset) InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { +AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root, + unsigned CPolBits) const { Register Addr = Root.getReg(); Register PtrBase; int64_t ConstOffset; @@ -5529,6 +5530,7 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { MIB.addReg(HighBits); }, // voffset [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); }, }}; } } @@ -5568,6 +5570,9 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { }, [=](MachineInstrBuilder &MIB) { // offset MIB.addImm(ImmOffset); + }, + [=](MachineInstrBuilder &MIB) { // cpol + MIB.addImm(CPolBits); }}}; } } @@ -5591,10 +5596,21 @@ AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { return {{ [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }, // offset + [=](MachineInstrBuilder &MIB) { MIB.addImm(CPolBits); } // cpol }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const { + return selectGlobalSAddr(Root, 0); +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectGlobalSAddrGLC(MachineOperand &Root) const { + return selectGlobalSAddr(Root, AMDGPU::CPol::GLC); +} + 
InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const { Register Addr = Root.getReg(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 2cb7904d27ccc..34bdf0a6d4ab2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -253,8 +253,12 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectScratchOffset(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectGlobalSAddr(MachineOperand &Root, unsigned CPolBits) const; InstructionSelector::ComplexRendererFns selectGlobalSAddr(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectGlobalSAddrGLC(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectScratchSAddr(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index c8a4e22ed1dae..1432b5940f3f0 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -11,7 +11,8 @@ let WantsRoot = true in { def GlobalOffset : ComplexPattern; def ScratchOffset : ComplexPattern; - def GlobalSAddr : ComplexPattern; + def GlobalSAddr : ComplexPattern; + def GlobalSAddrGLC : ComplexPattern; def ScratchSAddr : ComplexPattern; def ScratchSVAddr : ComplexPattern; } @@ -1252,13 +1253,13 @@ class GlobalLoadSaddrPat_D16 ; class FlatLoadSaddrPat_D16 : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)), - (inst $saddr, $voffset, $offset, (i32 0), $in) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$in)), + (inst $saddr, $voffset, $offset, $cpol, $in) >; class FlatLoadSaddrPat_D16_t16 : GCNPat < - (vt (node (GlobalSAddr (i64 
SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), - (inst $saddr, $voffset, $offset, (i32 0)) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (inst $saddr, $voffset, $offset, $cpol) >; class GlobalLoadSaddrPat_D16_t16 : GCNPat < @@ -1272,26 +1273,26 @@ class FlatLoadSignedPat >; class FlatLoadSaddrPat : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))), - (inst $saddr, $voffset, $offset, 0) + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol))), + (inst $saddr, $voffset, $offset, $cpol) >; class FlatStoreSaddrPat : GCNPat < - (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)), - (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset) + (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol)), + (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset, $cpol) >; -class GlobalAtomicSaddrPat : GCNPat < - (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), data_vt:$data)), - (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset) +class FlatAtomicSaddrPat : GCNPat < + (vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)), + (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset, $cpol) >; class GlobalAtomicNoRtnSaddrPat : GCNPat < - (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$data), - (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset) + (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), vt:$data), + (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset, $cpol) >; class FlatStorePat : GCNPat < @@ -1320,6 +1321,12 @@ multiclass FlatAtomicNoRtnPatBase (inst) VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + + def : FlatAtomicSaddrPat(inst#"_SADDR"), 
!cast(node), + GlobalSAddr, vt, data_vt> { + let AddedComplexity = 9; + let SubtargetPredicate = HasFlatGVSMode; + } } multiclass FlatAtomicNoRtnPatWithAddrSpace(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT.ret:$data, $offset)>; + + def : FlatAtomicSaddrPat(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt> { + let AddedComplexity = 8; + let SubtargetPredicate = HasFlatGVSMode; + } } multiclass FlatAtomicRtnPatWithAddrSpace(inst), !cast(node), vt, data_vt>; let AddedComplexity = 13 in - def : GlobalAtomicSaddrPat(inst#"_SADDR"), !cast(node), vt, data_vt>; + def : FlatAtomicSaddrPat(inst#"_SADDR"), !cast(node), + GlobalSAddr, vt, data_vt>; } multiclass GlobalFLATAtomicPatsRtnBase(inst#"_RTN"), rtnNode, vt, data_vt>; let AddedComplexity = 12 in - def : GlobalAtomicSaddrPat(inst#"_SADDR_RTN"), rtnNode, vt, data_vt>; + def : FlatAtomicSaddrPat(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt>; } multiclass GlobalFLATAtomicPatsNoRtn; defm : FlatStorePats ; defm : FlatStorePats ; -let SubtargetPredicate = isGFX12Plus in { - defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >; +} // End OtherPredicates = [HasFlatAddressSpace] - let OtherPredicates = [HasAtomicCSubNoRtnInsts] in - defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; -} +let OtherPredicates = [isGFX12Plus] in +defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; + +let OtherPredicates = [isGFX12Plus, HasAtomicCSubNoRtnInsts] in +defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; let OtherPredicates = [HasD16LoadStore] in { defm : FlatStorePats ; @@ -1826,8 +1840,6 @@ defm : FlatLoadPats_D16 ; defm : FlatLoadPats_D16 ; } -} // End OtherPredicates = [HasFlatAddressSpace] - let OtherPredicates = 
[HasFlatGlobalInsts] in { defm : GlobalFLATLoadPats ; diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll new file mode 100644 index 0000000000000..f4040f3049e0d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll @@ -0,0 +1,6030 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s + +; Test using saddr addressing mode of flat_* atomic instructions. + +define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_xchg_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst + ret void +} + +; Maximum positive offset on gfx10 +define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_2047: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2047 + %unused = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst + ret void +} + +; Maximum negative offset on gfx10 +define amdgpu_ps void 
@flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 + %unused = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_xchg_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_2048: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 2048 + %rtn = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn 
to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_neg2048: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048 + %rtn = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +; -------------------------------------------------------------------------------- +; Uniformity edge cases +; -------------------------------------------------------------------------------- + +@ptr.in.lds = internal addrspace(3) global ptr undef + +; Base pointer is uniform, but also in VGPRs +define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) { +; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-GISEL-NEXT: 
s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +; Base pointer is uniform, but also in VGPRs, with imm offset +define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %voffset, i32 %data) { +; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v0, v1, s[0:1] offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: ; return to shader part epilog +; +; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; 
GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v0, v[2:3], v1 offset:42 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: ; return to shader part epilog + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 42 + %rtn = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +; Base pointer is uniform, but also in VGPRs +define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, i32 %data) { +; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_endpgm + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset 
to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst + ret void +} + +; Base pointer is uniform, but also in VGPRs, with imm offset +define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 %voffset, i32 %data) { +; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-SDAG-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-SDAG-NEXT: flat_atomic_swap_b32 v0, v1, s[0:1] offset:42 scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-GISEL-NEXT: ds_load_b64 v[2:3], v2 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: flat_atomic_swap_b32 v[2:3], v1 offset:42 scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_endpgm + %sbase = load ptr, ptr addrspace(3) @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 42 + %unused = atomicrmw xchg ptr %gep1, i32 %data syncscope("agent") seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; All atomicrmw ops +; -------------------------------------------------------------------------------- + +; 
-------------------------------------------------------------------------------- +; atomicrmw xchg +; -------------------------------------------------------------------------------- + +define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1250-SDAG-NEXT: .LBB10_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB10_5 +; GFX1250-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB10_2 +; GFX1250-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; 
GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB10_5 +; GFX1250-SDAG-NEXT: .LBB10_5: +; +; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB10_4 +; GFX1250-GISEL-NEXT: .LBB10_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB10_5 +; GFX1250-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; 
GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2 +; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB10_5 +; GFX1250-GISEL-NEXT: .LBB10_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw xchg ptr %gep0, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; 
GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1250-SDAG-NEXT: .LBB11_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB11_5 +; GFX1250-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB11_2 +; GFX1250-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB11_5 +; GFX1250-SDAG-NEXT: .LBB11_5: +; +; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB11_4 +; GFX1250-GISEL-NEXT: .LBB11_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB11_5 +; GFX1250-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2 +; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[4:5], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB11_5 +; GFX1250-GISEL-NEXT: .LBB11_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = 
getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw xchg ptr %gep1, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1250-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB12_2 +; GFX1250-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: 
flat_xchg_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_4 +; GFX1250-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2 +; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw xchg ptr %gep0, i64 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps 
void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB13_4 +; GFX1250-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB13_2 +; GFX1250-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_store_b64 v0, v[2:3], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128: +; GFX1250-GISEL: 
; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_4 +; GFX1250-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_swap_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2 +; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off +; 
GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw xchg ptr %gep1, i64 %data syncscope("agent") seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; atomicrmw add +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_add_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw add ptr %gep0, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_add_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw add ptr %gep1, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: 
flat_add_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw add ptr %gep0, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_add_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw add ptr %gep1, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB18_4 +; GFX1250-SDAG-NEXT: .LBB18_2: ; 
%atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB18_5 +; GFX1250-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB18_2 +; GFX1250-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB18_5 +; GFX1250-SDAG-NEXT: .LBB18_5: +; +; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; 
GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB18_4 +; GFX1250-GISEL-NEXT: .LBB18_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB18_5 +; GFX1250-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2 +; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5] +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB18_5 +; GFX1250-GISEL-NEXT: .LBB18_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw add ptr %gep0, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, 
i64 %data) { +; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB19_4 +; GFX1250-SDAG-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB19_5 +; GFX1250-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB19_2 +; GFX1250-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; 
GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3] +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB19_5 +; GFX1250-SDAG-NEXT: .LBB19_5: +; +; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB19_4 +; GFX1250-GISEL-NEXT: .LBB19_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB19_5 +; GFX1250-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2 +; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[4:5] +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB19_5 +; GFX1250-GISEL-NEXT: .LBB19_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw add ptr %gep1, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_add_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: 
s_cbranch_execnz .LBB20_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB20_4 +; GFX1250-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB20_2 +; GFX1250-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 
0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_4 +; GFX1250-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2 +; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5] +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw add ptr %gep0, i64 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_add_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 
src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB21_4 +; GFX1250-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB21_2 +; GFX1250-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, 
vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB21_4 +; GFX1250-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_add_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB21_2 +; GFX1250-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5] +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw add ptr %gep1, i64 %data syncscope("agent") seq_cst + ret void +} + +; 
-------------------------------------------------------------------------------- +; atomicrmw sub +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_sub_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw sub ptr %gep0, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_sub_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw sub ptr %gep1, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_sub_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = 
getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw sub ptr %gep0, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_sub_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw sub ptr %gep1, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB26_4 +; GFX1250-SDAG-NEXT: .LBB26_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB26_5 +; GFX1250-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 
v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB26_2 +; GFX1250-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB26_5 +; GFX1250-SDAG-NEXT: .LBB26_5: +; +; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB26_4 +; GFX1250-GISEL-NEXT: .LBB26_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB26_5 +; GFX1250-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB26_2 +; GFX1250-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB26_5 +; GFX1250-GISEL-NEXT: .LBB26_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw sub ptr %gep0, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; 
GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB27_4 +; GFX1250-SDAG-NEXT: .LBB27_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB27_5 +; GFX1250-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB27_2 +; GFX1250-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v2 +; 
GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v3, vcc_lo +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB27_5 +; GFX1250-SDAG-NEXT: .LBB27_5: +; +; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB27_4 +; GFX1250-GISEL-NEXT: .LBB27_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB27_5 +; GFX1250-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB27_2 +; GFX1250-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_sub_co_u32 v2, vcc_lo, v0, v4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v3, null, v1, v5, vcc_lo +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB27_5 +; GFX1250-GISEL-NEXT: .LBB27_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw sub ptr %gep1, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: 
v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB28_4 +; GFX1250-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB28_2 +; GFX1250-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: 
v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB28_4 +; GFX1250-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB28_2 +; GFX1250-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw sub ptr %gep0, i64 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], 
lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB29_4 +; GFX1250-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB29_2 +; GFX1250-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v3, vcc_lo +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: 
v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB29_4 +; GFX1250-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_sub_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB29_2 +; GFX1250-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_sub_co_u32 
v0, vcc_lo, v0, v4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_sub_co_ci_u32_e64 v1, null, v1, v5, vcc_lo +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw sub ptr %gep1, i64 %data syncscope("agent") seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; atomicrmw and +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_and_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw and ptr %gep0, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_and_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw and ptr %gep1, i32 %data syncscope("agent") 
seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_and_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw and ptr %gep0, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_and_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw and ptr %gep1, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_3 +; GFX1250-SDAG-NEXT: ; %bb.1: 
; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB34_4 +; GFX1250-SDAG-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB34_5 +; GFX1250-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB34_2 +; GFX1250-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB34_5 +; GFX1250-SDAG-NEXT: .LBB34_5: +; +; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: 
v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB34_4 +; GFX1250-GISEL-NEXT: .LBB34_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB34_5 +; GFX1250-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB34_2 +; GFX1250-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v3, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB34_5 +; GFX1250-GISEL-NEXT: .LBB34_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = 
atomicrmw and ptr %gep0, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB35_4 +; GFX1250-SDAG-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB35_5 +; GFX1250-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB35_2 
+; GFX1250-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v3, v1, v3 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB35_5 +; GFX1250-SDAG-NEXT: .LBB35_5: +; +; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB35_4 +; GFX1250-GISEL-NEXT: .LBB35_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; 
GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB35_5 +; GFX1250-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB35_2 +; GFX1250-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, v0, v4 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v3, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB35_5 +; GFX1250-GISEL-NEXT: .LBB35_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw and ptr %gep1, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_and_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 
s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB36_4 +; GFX1250-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB36_2 +; GFX1250-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: 
v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB36_4 +; GFX1250-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB36_2 +; GFX1250-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw and ptr %gep0, i64 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_and_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB37_4 +; GFX1250-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB37_2 +; GFX1250-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 +; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, 
v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB37_4 +; GFX1250-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_and_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB37_2 +; GFX1250-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, v0, v4 +; 
GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw and ptr %gep1, i64 %data syncscope("agent") seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; atomicrmw or +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_or_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw or ptr %gep0, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_or_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw or ptr %gep1, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps 
void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_or_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw or ptr %gep0, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_or_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw or ptr %gep1, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; 
GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB42_4 +; GFX1250-SDAG-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB42_5 +; GFX1250-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB42_2 +; GFX1250-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX1250-SDAG-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB42_5 +; GFX1250-SDAG-NEXT: .LBB42_5: +; +; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB42_4 +; GFX1250-GISEL-NEXT: .LBB42_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB42_5 +; GFX1250-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB42_2 +; GFX1250-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB42_5 +; GFX1250-GISEL-NEXT: .LBB42_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw or ptr %gep0, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> 
+ ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB43_4 +; GFX1250-SDAG-NEXT: .LBB43_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB43_5 +; GFX1250-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB43_2 +; GFX1250-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_or_b32_e32 v3, v1, v3 +; GFX1250-SDAG-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB43_5 +; GFX1250-SDAG-NEXT: .LBB43_5: +; +; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB43_4 +; GFX1250-GISEL-NEXT: .LBB43_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch 
.LBB43_5 +; GFX1250-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB43_2 +; GFX1250-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v2, v0, v4 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v3, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB43_5 +; GFX1250-GISEL-NEXT: .LBB43_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw or ptr %gep1, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_or_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: 
v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB44_4 +; GFX1250-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB44_2 +; GFX1250-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 
s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB44_4 +; GFX1250-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB44_2 +; GFX1250-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw or ptr %gep0, i64 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_or_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB45_4 +; GFX1250-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB45_2 +; GFX1250-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX1250-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB45_4 +; GFX1250-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_or_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB45_2 +; GFX1250-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v0, v0, v4 +; GFX1250-GISEL-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset 
to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw or ptr %gep1, i64 %data syncscope("agent") seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; atomicrmw xor +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_xor_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw xor ptr %gep0, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_xor_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw xor ptr %gep1, i32 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_xor_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: 
flat_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw xor ptr %gep0, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_xor_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw xor ptr %gep1, i32 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB50_4 +; GFX1250-SDAG-NEXT: .LBB50_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; 
GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB50_5 +; GFX1250-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB50_2 +; GFX1250-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB50_5 +; GFX1250-SDAG-NEXT: .LBB50_5: +; +; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 
+; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB50_4 +; GFX1250-GISEL-NEXT: .LBB50_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB50_5 +; GFX1250-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB50_2 +; GFX1250-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB50_5 +; GFX1250-GISEL-NEXT: .LBB50_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw xor ptr %gep0, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; 
GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB51_4 +; GFX1250-SDAG-NEXT: .LBB51_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB51_5 +; GFX1250-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB51_2 +; GFX1250-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB51_5 +; GFX1250-SDAG-NEXT: .LBB51_5: +; +; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB51_4 +; GFX1250-GISEL-NEXT: .LBB51_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB51_5 +; GFX1250-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v[0:1], v3, v[4:5], s[2:3] 
offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB51_2 +; GFX1250-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB51_5 +; GFX1250-GISEL-NEXT: .LBB51_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw xor ptr %gep1, i64 %data syncscope("agent") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, 
exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB52_4 +; GFX1250-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB52_2 +; GFX1250-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_3 +; 
GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB52_4 +; GFX1250-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB52_2 +; GFX1250-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw xor ptr %gep0, i64 %data syncscope("agent") seq_cst + ret void +} + +define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; 
GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB53_4 +; GFX1250-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB53_2 +; GFX1250-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX1250-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB53_4 +; GFX1250-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_xor_b64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB53_2 +; GFX1250-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 
%zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw xor ptr %gep1, i64 %data syncscope("agent") seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; atomicrmw max +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_max_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw max ptr %gep0, i32 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_max_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw max ptr %gep1, i32 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_max_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw max ptr %gep0, i32 %data 
syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_max_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw max ptr %gep1, i32 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB58_4 +; GFX1250-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB58_5 +; GFX1250-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; 
implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB58_2 +; GFX1250-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB58_5 +; GFX1250-SDAG-NEXT: .LBB58_5: +; +; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB58_4 +; GFX1250-GISEL-NEXT: .LBB58_2: ; 
%atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB58_5 +; GFX1250-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB58_2 +; GFX1250-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB58_5 +; GFX1250-GISEL-NEXT: .LBB58_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw max ptr %gep0, i64 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; 
GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB59_4 +; GFX1250-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB59_5 +; GFX1250-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB59_2 +; GFX1250-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; 
GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB59_5 +; GFX1250-SDAG-NEXT: .LBB59_5: +; +; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB59_4 +; GFX1250-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB59_5 +; GFX1250-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN +; 
GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB59_2 +; GFX1250-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB59_5 +; GFX1250-GISEL-NEXT: .LBB59_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw max ptr %gep1, i64 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_max_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; 
GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB60_4 +; GFX1250-SDAG-NEXT: .LBB60_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB60_2 +; GFX1250-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; 
GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB60_4 +; GFX1250-GISEL-NEXT: .LBB60_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB60_2 +; GFX1250-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw max ptr %gep0, i64 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_max_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB61_4 +; GFX1250-SDAG-NEXT: .LBB61_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB61_2 +; GFX1250-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], 
src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB61_4 +; GFX1250-GISEL-NEXT: .LBB61_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_max_i64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB61_2 +; GFX1250-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_gt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1250-GISEL-NEXT: 
scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw max ptr %gep1, i64 %data syncscope("workgroup") seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; atomicrmw min +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_min_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw min ptr %gep0, i32 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_min_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw min ptr %gep1, i32 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_min_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; 
GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw min ptr %gep0, i32 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_min_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw min ptr %gep1, i32 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB66_4 +; GFX1250-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB66_5 +; GFX1250-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: 
flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB66_2 +; GFX1250-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB66_5 +; GFX1250-SDAG-NEXT: .LBB66_5: +; +; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; 
GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB66_4 +; GFX1250-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB66_5 +; GFX1250-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB66_2 +; GFX1250-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB66_5 +; GFX1250-GISEL-NEXT: .LBB66_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw min ptr %gep0, i64 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, 
i64 %data) { +; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB67_4 +; GFX1250-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB67_5 +; GFX1250-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB67_2 +; GFX1250-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, 
off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB67_5 +; GFX1250-SDAG-NEXT: .LBB67_5: +; +; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB67_4 +; GFX1250-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB67_5 +; 
GFX1250-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB67_2 +; GFX1250-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB67_5 +; GFX1250-GISEL-NEXT: .LBB67_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw min ptr %gep1, i64 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_min_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; 
GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB68_4 +; GFX1250-SDAG-NEXT: .LBB68_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB68_2 +; GFX1250-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: 
v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB68_4 +; GFX1250-GISEL-NEXT: .LBB68_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB68_2 +; GFX1250-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw min ptr %gep0, i64 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_min_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; 
GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB69_4 +; GFX1250-SDAG-NEXT: .LBB69_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB69_2 +; GFX1250-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_le_i64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: 
v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB69_4 +; GFX1250-GISEL-NEXT: .LBB69_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_min_i64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB69_2 +; GFX1250-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[0:1], v[4:5] +; 
GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw min ptr %gep1, i64 %data syncscope("workgroup") seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; atomicrmw umax +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_umax_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw umax ptr %gep0, i32 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_umax_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw umax ptr %gep1, i32 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; 
GFX1250-LABEL: flat_umax_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw umax ptr %gep0, i32 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_umax_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw umax ptr %gep1, i32 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB74_4 +; GFX1250-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB74_5 +; GFX1250-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB74_2 +; GFX1250-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB74_5 +; GFX1250-SDAG-NEXT: .LBB74_5: +; +; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 
s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB74_4 +; GFX1250-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB74_5 +; GFX1250-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB74_2 +; GFX1250-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB74_5 +; GFX1250-GISEL-NEXT: .LBB74_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw umax ptr %gep0, i64 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast 
i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB75_4 +; GFX1250-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB75_5 +; GFX1250-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB75_2 +; GFX1250-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] 
+; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB75_5 +; GFX1250-SDAG-NEXT: .LBB75_5: +; +; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB75_4 +; GFX1250-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: 
s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB75_5 +; GFX1250-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB75_2 +; GFX1250-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB75_5 +; GFX1250-GISEL-NEXT: .LBB75_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw umax ptr %gep1, i64 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, 
v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB76_4 +; GFX1250-SDAG-NEXT: .LBB76_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB76_2 +; GFX1250-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; 
GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB76_4 +; GFX1250-GISEL-NEXT: .LBB76_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB76_2 +; GFX1250-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw umax ptr %gep0, i64 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 
%voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB77_4 +; GFX1250-SDAG-NEXT: .LBB77_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB77_2 +; GFX1250-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0 +; 
GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB77_4 +; GFX1250-GISEL-NEXT: .LBB77_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_max_u64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB77_2 +; GFX1250-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, 
-1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw umax ptr %gep1, i64 %data syncscope("workgroup") seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; atomicrmw umin +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_umin_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw umin ptr %gep0, i32 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_umin_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw umin ptr %gep1, i32 %data syncscope("workgroup") 
seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_umin_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw umin ptr %gep0, i32 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_umin_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128 +; GFX1250-NEXT: s_wait_dscnt 0x0 +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw umin ptr %gep1, i32 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB82_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: 
s_cbranch_execnz .LBB82_4 +; GFX1250-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB82_5 +; GFX1250-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB82_2 +; GFX1250-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB82_5 +; GFX1250-SDAG-NEXT: .LBB82_5: +; +; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, 
null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB82_4 +; GFX1250-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB82_5 +; GFX1250-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB82_2 +; GFX1250-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB82_5 +; GFX1250-GISEL-NEXT: .LBB82_5: + %zext.offset = zext i32 %voffset to 
i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw umin ptr %gep0, i64 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB83_4 +; GFX1250-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB83_5 +; GFX1250-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: 
s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB83_2 +; GFX1250-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v3, v1 :: v_dual_cndmask_b32 v2, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB83_5 +; GFX1250-SDAG-NEXT: .LBB83_5: +; +; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 
0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB83_4 +; GFX1250-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB83_5 +; GFX1250-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB83_2 +; GFX1250-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v0 :: v_dual_cndmask_b32 v3, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB83_5 +; GFX1250-GISEL-NEXT: .LBB83_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw umin ptr %gep1, i64 %data syncscope("workgroup") seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr 
inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB84_4 +; GFX1250-SDAG-NEXT: .LBB84_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB84_2 +; GFX1250-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 
v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB84_4 +; GFX1250-GISEL-NEXT: .LBB84_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB84_2 +; GFX1250-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = 
atomicrmw umin ptr %gep0, i64 %data syncscope("workgroup") seq_cst + ret void +} + +define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB85_4 +; GFX1250-SDAG-NEXT: .LBB85_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB85_2 +; GFX1250-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_le_u64_e32 
vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v0, v2, v0 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB85_4 +; GFX1250-GISEL-NEXT: .LBB85_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_min_u64 v0, v[4:5], s[2:3] offset:-128 +; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB85_2 +; GFX1250-GISEL-NEXT: .LBB85_4: ; 
%atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw umin ptr %gep1, i64 %data syncscope("workgroup") seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; cmpxchg +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { +; GFX1250-LABEL: flat_cmpxchg_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %cmpxchg = cmpxchg ptr %gep0, i32 %cmp, i32 %data seq_cst seq_cst + %rtn = extractvalue { i32, i1 } %cmpxchg, 0 + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { +; GFX1250-LABEL: 
flat_cmpxchg_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v0, v[2:3], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_loadcnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %cmpxchg = cmpxchg ptr %gep1, i32 %cmp, i32 %data seq_cst seq_cst + %rtn = extractvalue { i32, i1 } %cmpxchg, 0 + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { +; GFX1250-LABEL: flat_cmpxchg_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[2:3] scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = cmpxchg ptr %gep0, i32 %cmp, i32 %data seq_cst seq_cst + ret void +} + +define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { +; GFX1250-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt 0x0 +; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v[2:3], s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-NEXT: s_endpgm + 
%zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = cmpxchg ptr %gep1, i32 %cmp, i32 %data seq_cst seq_cst + ret void +} + +define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { +; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB90_4 +; GFX1250-SDAG-NEXT: .LBB90_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB90_5 +; GFX1250-SDAG-NEXT: .LBB90_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[2:3], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 
s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB90_2 +; GFX1250-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v8, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v1, v5 :: v_dual_cndmask_b32 v2, v0, v4 +; GFX1250-SDAG-NEXT: scratch_store_b64 v8, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB90_5 +; GFX1250-SDAG-NEXT: .LBB90_5: +; +; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v0, v5 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB90_4 +; GFX1250-GISEL-NEXT: .LBB90_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; 
GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB90_5 +; GFX1250-GISEL-NEXT: .LBB90_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB90_2 +; GFX1250-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v0, v6 :: v_dual_cndmask_b32 v3, v1, v7 +; GFX1250-GISEL-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB90_5 +; GFX1250-GISEL-NEXT: .LBB90_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %cmpxchg = cmpxchg ptr %gep0, i64 %cmp, i64 %data seq_cst seq_cst + %rtn = extractvalue { i64, i1 } %cmpxchg, 0 + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { +; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: 
v_dual_mov_b32 v6, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB91_4 +; GFX1250-SDAG-NEXT: .LBB91_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB91_5 +; GFX1250-SDAG-NEXT: .LBB91_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[2:3], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB91_2 +; GFX1250-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc_lo +; GFX1250-SDAG-NEXT: 
scratch_load_b64 v[0:1], v8, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v1, v5 :: v_dual_cndmask_b32 v2, v0, v4 +; GFX1250-SDAG-NEXT: scratch_store_b64 v8, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB91_5 +; GFX1250-SDAG-NEXT: .LBB91_5: +; +; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3 +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v7, v4 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v5 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB91_4 +; GFX1250-GISEL-NEXT: .LBB91_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, 
exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB91_5 +; GFX1250-GISEL-NEXT: .LBB91_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v[0:1], v5, v[6:9], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB91_2 +; GFX1250-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v0, v6 :: v_dual_cndmask_b32 v3, v1, v7 +; GFX1250-GISEL-NEXT: scratch_store_b64 v4, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB91_5 +; GFX1250-GISEL-NEXT: .LBB91_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %cmpxchg = cmpxchg ptr %gep1, i64 %cmp, i64 %data seq_cst seq_cst + %rtn = extractvalue { i64, i1 } %cmpxchg, 0 + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { +; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn: +; 
GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v4, v3 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB92_4 +; GFX1250-SDAG-NEXT: .LBB92_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB92_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:7] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB92_2 +; GFX1250-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_cndmask_b32 v0, v0, v4 +; GFX1250-SDAG-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm 
+; +; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB92_4 +; GFX1250-GISEL-NEXT: .LBB92_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB92_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB92_2 +; GFX1250-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; 
GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = cmpxchg ptr %gep0, i64 %cmp, i64 %data seq_cst seq_cst + ret void +} + +define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { +; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB93_4 +; GFX1250-SDAG-NEXT: .LBB93_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB93_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX1250-SDAG-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:7] scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 
+; GFX1250-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB93_2 +; GFX1250-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[6:7] +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v5 :: v_dual_cndmask_b32 v0, v0, v4 +; GFX1250-SDAG-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2 +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; 
GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB93_4 +; GFX1250-GISEL-NEXT: .LBB93_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB93_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX1250-GISEL-NEXT: flat_atomic_cmpswap_b64 v0, v[6:9], s[2:3] offset:-128 scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_SYS +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB93_2 +; GFX1250-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = cmpxchg ptr %gep1, i64 %cmp, i64 %data seq_cst seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; amdgcn atomic inc +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: 
flat_inc_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw uinc_wrap ptr %gep0, i32 %data syncscope("agent") monotonic + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_inc_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw uinc_wrap ptr %gep1, i32 %data syncscope("agent") monotonic + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_inc_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw uinc_wrap ptr %gep0, i32 %data syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @flat_inc_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_inc_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 
%zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw uinc_wrap ptr %gep1, i32 %data syncscope("agent") monotonic + ret void +} + +define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB98_4 +; GFX1250-SDAG-NEXT: .LBB98_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB98_5 +; GFX1250-SDAG-NEXT: .LBB98_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB98_2 +; GFX1250-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 
v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4 +; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB98_5 +; GFX1250-SDAG-NEXT: .LBB98_5: +; +; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB98_4 +; GFX1250-GISEL-NEXT: .LBB98_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB98_5 +; GFX1250-GISEL-NEXT: .LBB98_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB98_2 +; GFX1250-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB98_5 +; GFX1250-GISEL-NEXT: .LBB98_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw uinc_wrap ptr %gep0, i64 %data syncscope("agent") monotonic + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) 
| instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB99_4 +; GFX1250-SDAG-NEXT: .LBB99_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB99_5 +; GFX1250-SDAG-NEXT: .LBB99_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB99_2 +; GFX1250-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, 0, v5 :: v_dual_cndmask_b32 v2, 0, v4 +; 
GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_branch .LBB99_5 +; GFX1250-SDAG-NEXT: .LBB99_5: +; +; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB99_4 +; GFX1250-GISEL-NEXT: .LBB99_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB99_5 +; GFX1250-GISEL-NEXT: .LBB99_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; 
GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB99_2 +; GFX1250-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_branch .LBB99_5 +; GFX1250-GISEL-NEXT: .LBB99_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw uinc_wrap ptr %gep1, i64 %data syncscope("agent") monotonic + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB100_4 +; GFX1250-SDAG-NEXT: .LBB100_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB100_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB100_2 +; GFX1250-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4 +; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 
v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB100_4 +; GFX1250-GISEL-NEXT: .LBB100_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB100_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB100_2 +; GFX1250-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw uinc_wrap ptr %gep0, i64 %data syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void 
@flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB101_4 +; GFX1250-SDAG-NEXT: .LBB101_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB101_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB101_2 +; GFX1250-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffd +; 
GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, 0, v5 :: v_dual_cndmask_b32 v0, 0, v4 +; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB101_4 +; GFX1250-GISEL-NEXT: .LBB101_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB101_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_inc_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB101_2 +; GFX1250-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: 
v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_ge_u64_e32 vcc_lo, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, 1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc_lo +; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw uinc_wrap ptr %gep1, i64 %data syncscope("agent") monotonic + ret void +} + +; -------------------------------------------------------------------------------- +; amdgcn atomic dec +; -------------------------------------------------------------------------------- + + +define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_dec_saddr_i32_rtn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw udec_wrap ptr %gep0, i32 %data syncscope("agent") monotonic + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_dec_saddr_i32_rtn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw udec_wrap ptr %gep1, i32 %data syncscope("agent") monotonic + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_dec_saddr_i32_nortn: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw udec_wrap ptr %gep0, i32 %data syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @flat_dec_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) { +; GFX1250-LABEL: flat_dec_saddr_i32_nortn_neg128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw udec_wrap ptr %gep1, i32 %data syncscope("agent") monotonic + ret void +} + +define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: 
; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB106_4 +; GFX1250-SDAG-NEXT: .LBB106_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB106_5 +; GFX1250-SDAG-NEXT: .LBB106_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB106_2 +; GFX1250-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v2, v4, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1250-SDAG-NEXT: s_branch .LBB106_5 +; GFX1250-SDAG-NEXT: .LBB106_5: +; +; GFX1250-GISEL-LABEL: 
flat_dec_saddr_i64_rtn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB106_4 +; GFX1250-GISEL-NEXT: .LBB106_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB106_5 +; GFX1250-GISEL-NEXT: .LBB106_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB106_2 +; GFX1250-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, -1 +; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1250-GISEL-NEXT: s_branch .LBB106_5 +; GFX1250-GISEL-NEXT: .LBB106_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %rtn = atomicrmw udec_wrap ptr %gep0, i64 %data syncscope("agent") monotonic + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v5 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: 
s_and_not1_saveexec_b32 s1, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB107_4 +; GFX1250-SDAG-NEXT: .LBB107_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_branch .LBB107_5 +; GFX1250-SDAG-NEXT: .LBB107_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[4:5], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s1, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB107_2 +; GFX1250-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[4:5] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc_lo +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1 +; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v3, v5, v3 :: v_dual_cndmask_b32 v2, v4, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1250-SDAG-NEXT: s_branch .LBB107_5 +; GFX1250-SDAG-NEXT: .LBB107_5: +; +; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; 
GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v7 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB107_4 +; GFX1250-GISEL-NEXT: .LBB107_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_branch .LBB107_5 +; GFX1250-GISEL-NEXT: .LBB107_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v[0:1], v3, v[4:5], s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s1, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB107_2 +; GFX1250-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc_lo +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off +; GFX1250-GISEL-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, -1 +; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v2, v2, v4 :: v_dual_cndmask_b32 v3, v3, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[2:3], off +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1250-GISEL-NEXT: s_branch .LBB107_5 +; GFX1250-GISEL-NEXT: .LBB107_5: + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %rtn = atomicrmw udec_wrap ptr %gep1, i64 %data syncscope("agent") monotonic + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_nortn: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB108_4 +; GFX1250-SDAG-NEXT: .LBB108_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB108_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: 
flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB108_2 +; GFX1250-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_cndmask_b32 v0, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: s_endpgm +; +; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; 
GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB108_4 +; GFX1250-GISEL-NEXT: .LBB108_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB108_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB108_2 +; GFX1250-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %unused = atomicrmw udec_wrap ptr %gep0, i64 %data syncscope("agent") monotonic + ret void +} + +define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) { +; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_nortn_neg128: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], lit64(0xffffffffffffff80) +; GFX1250-SDAG-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] +; GFX1250-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-SDAG-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_cmpx_ne_u32_e64 s1, v1 +; GFX1250-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_3 +; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB109_4 +; GFX1250-SDAG-NEXT: .LBB109_2: ; %atomicrmw.phi +; GFX1250-SDAG-NEXT: s_endpgm +; GFX1250-SDAG-NEXT: .LBB109_3: ; %atomicrmw.global +; GFX1250-SDAG-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB109_2 +; GFX1250-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private +; GFX1250-SDAG-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc_lo +; GFX1250-SDAG-NEXT: scratch_load_b64 v[0:1], v4, off +; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX1250-SDAG-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-SDAG-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[2:3] +; GFX1250-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX1250-SDAG-NEXT: s_wait_alu 0xfffe +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_dual_cndmask_b32 v1, v1, v3 :: v_dual_cndmask_b32 v0, v0, v2 +; GFX1250-SDAG-NEXT: scratch_store_b64 v4, v[0:1], off +; GFX1250-SDAG-NEXT: 
s_endpgm +; +; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2 +; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] +; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base +; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0 +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_cmpx_ne_u32_e64 s1, v3 +; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_3 +; GFX1250-GISEL-NEXT: ; %bb.1: ; %Flow +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB109_4 +; GFX1250-GISEL-NEXT: .LBB109_2: ; %atomicrmw.phi +; GFX1250-GISEL-NEXT: s_endpgm +; GFX1250-GISEL-NEXT: .LBB109_3: ; %atomicrmw.global +; GFX1250-GISEL-NEXT: flat_atomic_dec_u64 v0, v[4:5], s[2:3] offset:-128 scope:SCOPE_DEV +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0 +; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB109_2 +; GFX1250-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private +; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffd +; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc_lo +; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off +; GFX1250-GISEL-NEXT: 
s_wait_loadcnt 0x0 +; GFX1250-GISEL-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] +; GFX1250-GISEL-NEXT: v_cmp_gt_u64_e64 s0, v[0:1], v[4:5] +; GFX1250-GISEL-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, -1 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX1250-GISEL-NEXT: s_wait_alu 0xfffe +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5 +; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off +; GFX1250-GISEL-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128 + %unused = atomicrmw udec_wrap ptr %gep1, i64 %data syncscope("agent") monotonic + ret void +} + +attributes #0 = { argmemonly nounwind willreturn } From de59e7b86cd349f9f74b7561594aeae410477326 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 18 Jul 2025 14:36:09 -0500 Subject: [PATCH 383/813] [libc] Fix GPU benchmarking --- libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 2 +- .../gpu/src/math/atan2_benchmark.cpp | 24 ++++---- libc/benchmarks/gpu/src/math/platform.h | 57 +++++++++++++++++++ .../benchmarks/gpu/src/math/sin_benchmark.cpp | 40 ++++++------- libc/benchmarks/gpu/timing/amdgpu/timing.h | 21 +++---- libc/benchmarks/gpu/timing/nvptx/timing.h | 17 +++--- 6 files changed, 106 insertions(+), 55 deletions(-) create mode 100644 libc/benchmarks/gpu/src/math/platform.h diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 920c5b206b0fe..57ff5b9fdb846 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -7,9 +7,9 @@ #include "src/__support/GPU/utils.h" #include "src/__support/fixedvector.h" #include "src/__support/macros/config.h" +#include "src/__support/time/gpu/time_utils.h" #include "src/stdio/printf.h" #include "src/stdlib/srand.h" -#include 
"src/time/gpu/time_utils.h" namespace LIBC_NAMESPACE_DECL { namespace benchmarks { diff --git a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp index 3bb5b0cc6788c..1f91a9a35c373 100644 --- a/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/atan2_benchmark.cpp @@ -3,12 +3,8 @@ #include "src/math/atan2.h" #include "src/stdlib/rand.h" -#ifdef NVPTX_MATH_FOUND -#include "src/math/nvptx/declarations.h" -#endif - -#ifdef AMDGPU_MATH_FOUND -#include "src/math/amdgpu/declarations.h" +#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) +#include "platform.h" #endif #define BM_TWO_RANDOM_INPUT(T, Func, MIN_EXP, MAX_EXP, N) \ @@ -33,15 +29,15 @@ BENCH(double, Atan2TwoPow30, LIBC_NAMESPACE::atan2, 0, 30); BENCH(double, Atan2Large, LIBC_NAMESPACE::atan2, 30, 1000); #ifdef NVPTX_MATH_FOUND -BENCH(double, NvAtan2, LIBC_NAMESPACE::__nv_atan2, -1023, 1023); -BENCH(double, NvAtan2TwoPi, LIBC_NAMESPACE::__nv_atan2, -10, 3); -BENCH(double, NvAtan2TwoPow30, LIBC_NAMESPACE::__nv_atan2, 0, 30); -BENCH(double, NvAtan2Large, LIBC_NAMESPACE::__nv_atan2, 30, 1000); +BENCH(double, NvAtan2, __nv_atan2, -1023, 1023); +BENCH(double, NvAtan2TwoPi, __nv_atan2, -10, 3); +BENCH(double, NvAtan2TwoPow30, __nv_atan2, 0, 30); +BENCH(double, NvAtan2Large, __nv_atan2, 30, 1000); #endif #ifdef AMDGPU_MATH_FOUND -BENCH(double, AmdAtan2, LIBC_NAMESPACE::__ocml_atan2_f64, -1023, 1023); -BENCH(double, AmdAtan2TwoPi, LIBC_NAMESPACE::__ocml_atan2_f64, -10, 3); -BENCH(double, AmdAtan2TwoPow30, LIBC_NAMESPACE::__ocml_atan2_f64, 0, 30); -BENCH(double, AmdAtan2Large, LIBC_NAMESPACE::__ocml_atan2_f64, 30, 1000); +BENCH(double, AmdAtan2, __ocml_atan2_f64, -1023, 1023); +BENCH(double, AmdAtan2TwoPi, __ocml_atan2_f64, -10, 3); +BENCH(double, AmdAtan2TwoPow30, __ocml_atan2_f64, 0, 30); +BENCH(double, AmdAtan2Large, __ocml_atan2_f64, 30, 1000); #endif diff --git a/libc/benchmarks/gpu/src/math/platform.h 
b/libc/benchmarks/gpu/src/math/platform.h new file mode 100644 index 0000000000000..bb7825d38bd42 --- /dev/null +++ b/libc/benchmarks/gpu/src/math/platform.h @@ -0,0 +1,57 @@ +//===-- AMDGPU specific platform definitions for math support -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H +#define LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" +#include + +namespace LIBC_NAMESPACE_DECL { + +#ifdef LIBC_TARGET_ARCH_IS_AMDGPU +// The ROCm device library uses control globals to alter codegen for the +// different targets. To avoid needing to link them in manually we simply +// define them here. +extern "C" { +extern const LIBC_INLINE_VAR uint8_t __oclc_unsafe_math_opt = 0; +extern const LIBC_INLINE_VAR uint8_t __oclc_daz_opt = 0; +extern const LIBC_INLINE_VAR uint8_t __oclc_correctly_rounded_sqrt32 = 1; +extern const LIBC_INLINE_VAR uint8_t __oclc_finite_only_opt = 0; +extern const LIBC_INLINE_VAR uint32_t __oclc_ISA_version = 9000; +} + +// These aliases cause clang to emit the control constants with ODR linkage. +// This allows us to link against the symbols without preventing them from being +// optimized out or causing symbol collisions. 
+[[gnu::alias("__oclc_unsafe_math_opt")]] const uint8_t __oclc_unsafe_math_opt__; +[[gnu::alias("__oclc_daz_opt")]] const uint8_t __oclc_daz_opt__; +[[gnu::alias("__oclc_correctly_rounded_sqrt32")]] const uint8_t + __oclc_correctly_rounded_sqrt32__; +[[gnu::alias("__oclc_finite_only_opt")]] const uint8_t __oclc_finite_only_opt__; +[[gnu::alias("__oclc_ISA_version")]] const uint32_t __oclc_ISA_version__; +#endif +} // namespace LIBC_NAMESPACE_DECL + +// Forward declarations for the vendor math libraries. +extern "C" { +#ifdef AMDGPU_MATH_FOUND +double __ocml_sin_f64(double); +float __ocml_sin_f32(float); +double __ocml_atan2_f64(double, double); +float __ocml_atan2_f32(float, float); +#endif + +#ifdef NVPTX_MATH_FOUND +double __nv_sin(double); +float __nv_sinf(float); +double __nv_atan2(double, double); +float __nv_atan2f(float, float); +#endif +} + +#endif // LLVM_LIBC_SRC_MATH_AMDGPU_PLATFORM_H diff --git a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp index bf09e6e462172..a759db2e9d33f 100644 --- a/libc/benchmarks/gpu/src/math/sin_benchmark.cpp +++ b/libc/benchmarks/gpu/src/math/sin_benchmark.cpp @@ -8,12 +8,8 @@ #include "src/math/sinf.h" #include "src/stdlib/rand.h" -#ifdef NVPTX_MATH_FOUND -#include "src/math/nvptx/declarations.h" -#endif - -#ifdef AMDGPU_MATH_FOUND -#include "src/math/amdgpu/declarations.h" +#if defined(NVPTX_MATH_FOUND) || defined(AMDGPU_MATH_FOUND) +#include "platform.h" #endif // BENCHMARK() expects a function that with no parameters that returns a @@ -42,17 +38,17 @@ BENCH(double, SinTwoPow30, LIBC_NAMESPACE::sin, 0, 30); BENCH(double, SinVeryLarge, LIBC_NAMESPACE::sin, 30, 1000); #ifdef NVPTX_MATH_FOUND -BENCH(double, NvSin, LIBC_NAMESPACE::__nv_sin, -1023, 1023); -BENCH(double, NvSinTwoPi, LIBC_NAMESPACE::__nv_sin, -10, 3); -BENCH(double, NvSinTwoPow30, LIBC_NAMESPACE::__nv_sin, 0, 30); -BENCH(double, NvSinVeryLarge, LIBC_NAMESPACE::__nv_sin, 30, 1000); +BENCH(double, NvSin, __nv_sin, 
-1023, 1023); +BENCH(double, NvSinTwoPi, __nv_sin, -10, 3); +BENCH(double, NvSinTwoPow30, __nv_sin, 0, 30); +BENCH(double, NvSinVeryLarge, __nv_sin, 30, 1000); #endif #ifdef AMDGPU_MATH_FOUND -BENCH(double, AmdSin, LIBC_NAMESPACE::__ocml_sin_f64, -1023, 1023); -BENCH(double, AmdSinTwoPi, LIBC_NAMESPACE::__ocml_sin_f64, -10, 3); -BENCH(double, AmdSinTwoPow30, LIBC_NAMESPACE::__ocml_sin_f64, 0, 30); -BENCH(double, AmdSinVeryLarge, LIBC_NAMESPACE::__ocml_sin_f64, 30, 1000); +BENCH(double, AmdSin, __ocml_sin_f64, -1023, 1023); +BENCH(double, AmdSinTwoPi, __ocml_sin_f64, -10, 3); +BENCH(double, AmdSinTwoPow30, __ocml_sin_f64, 0, 30); +BENCH(double, AmdSinVeryLarge, __ocml_sin_f64, 30, 1000); #endif BENCH(float, Sinf, LIBC_NAMESPACE::sinf, -127, 128); @@ -61,15 +57,15 @@ BENCH(float, SinfTwoPow30, LIBC_NAMESPACE::sinf, 0, 30); BENCH(float, SinfVeryLarge, LIBC_NAMESPACE::sinf, 30, 120); #ifdef NVPTX_MATH_FOUND -BENCH(float, NvSinf, LIBC_NAMESPACE::__nv_sinf, -127, 128); -BENCH(float, NvSinfTwoPi, LIBC_NAMESPACE::__nv_sinf, -10, 3); -BENCH(float, NvSinfTwoPow30, LIBC_NAMESPACE::__nv_sinf, 0, 30); -BENCH(float, NvSinfVeryLarge, LIBC_NAMESPACE::__nv_sinf, 30, 120); +BENCH(float, NvSinf, __nv_sinf, -127, 128); +BENCH(float, NvSinfTwoPi, __nv_sinf, -10, 3); +BENCH(float, NvSinfTwoPow30, __nv_sinf, 0, 30); +BENCH(float, NvSinfVeryLarge, __nv_sinf, 30, 120); #endif #ifdef AMDGPU_MATH_FOUND -BENCH(float, AmdSinf, LIBC_NAMESPACE::__ocml_sin_f32, -127, 128); -BENCH(float, AmdSinfTwoPi, LIBC_NAMESPACE::__ocml_sin_f32, -10, 3); -BENCH(float, AmdSinfTwoPow30, LIBC_NAMESPACE::__ocml_sin_f32, 0, 30); -BENCH(float, AmdSinfVeryLarge, LIBC_NAMESPACE::__ocml_sin_f32, 30, 120); +BENCH(float, AmdSinf, __ocml_sin_f32, -127, 128); +BENCH(float, AmdSinfTwoPi, __ocml_sin_f32, -10, 3); +BENCH(float, AmdSinfTwoPow30, __ocml_sin_f32, 0, 30); +BENCH(float, AmdSinfVeryLarge, __ocml_sin_f32, 30, 120); #endif diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h 
b/libc/benchmarks/gpu/timing/amdgpu/timing.h index 4cf7e9838add3..0f2c04c07c921 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/timing.h +++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU #include "src/__support/CPP/array.h" +#include "src/__support/CPP/atomic.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" #include "src/__support/common.h" @@ -24,7 +25,7 @@ namespace LIBC_NAMESPACE_DECL { // allows us to substract the constant-time overhead from the latency to // obtain a true result. This can vary with system load. [[gnu::noinline]] static LIBC_INLINE uint64_t overhead() { - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); uint32_t result = 0.0; asm("v_or_b32 %[v_reg], 0, %[v_reg]\n" ::[v_reg] "v"(result)); @@ -44,13 +45,13 @@ template T arg = storage; // The AMDGPU architecture needs to wait on pending results. - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); // Get the current timestamp from the clock. uint64_t start = gpu::processor_clock(); // This forces the compiler to load the input argument and run the clock // cycle counter before the profiling region. - asm("" ::"s"(start)); + asm("" : "+v"(arg) : "s"(start)); // Run the function under test and return its value. auto result = f(arg); @@ -71,7 +72,7 @@ template // ordering. uint64_t stop = gpu::processor_clock(); asm("" ::"s"(stop)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); // Return the time elapsed. 
return stop - start; @@ -84,7 +85,7 @@ template T1 arg1 = storage1; T2 arg2 = storage2; - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"s"(start)); @@ -100,7 +101,7 @@ template uint64_t stop = gpu::processor_clock(); asm("" ::"s"(stop)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); return stop - start; } @@ -111,7 +112,7 @@ template throughput(F f, const cpp::array &inputs) { asm("" ::"v"(&inputs)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"s"(start)); @@ -124,7 +125,7 @@ throughput(F f, const cpp::array &inputs) { uint64_t stop = gpu::processor_clock(); asm("" ::"s"(stop)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); // Return the time elapsed. return stop - start; @@ -136,7 +137,7 @@ template F f, const cpp::array &inputs1, const cpp::array &inputs2) { asm("" ::"v"(&inputs1), "v"(&inputs2)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"s"(start)); @@ -149,7 +150,7 @@ template uint64_t stop = gpu::processor_clock(); asm("" ::"s"(stop)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); // Return the time elapsed. return stop - start; diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h index ece7d9a6c5396..3ed97645ddc93 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -10,6 +10,7 @@ #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX #include "src/__support/CPP/array.h" +#include "src/__support/CPP/atomic.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" #include "src/__support/common.h" @@ -46,7 +47,7 @@ template T arg = storage; // Get the current timestamp from the clock. 
- gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); // This forces the compiler to load the input argument and run the clock cycle @@ -63,7 +64,7 @@ template // Obtain the current timestamp after running the calculation and force // ordering. uint64_t stop = gpu::processor_clock(); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); asm("" ::"r"(stop)); volatile T output = result; @@ -78,7 +79,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { T1 arg = storage; T2 arg2 = storage2; - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"llr"(start)); @@ -88,7 +89,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result)); uint64_t stop = gpu::processor_clock(); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); asm("" ::"r"(stop)); volatile auto output = result; @@ -101,7 +102,7 @@ template throughput(F f, const cpp::array &inputs) { asm("" ::"r"(&inputs)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"llr"(start)); @@ -114,7 +115,7 @@ throughput(F f, const cpp::array &inputs) { } uint64_t stop = gpu::processor_clock(); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); asm("" ::"r"(stop)); volatile auto output = result; @@ -128,7 +129,7 @@ template F f, const cpp::array &inputs1, const cpp::array &inputs2) { asm("" ::"r"(&inputs1), "r"(&inputs2)); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); uint64_t start = gpu::processor_clock(); asm("" ::"llr"(start)); @@ -140,7 +141,7 @@ template } uint64_t stop = gpu::processor_clock(); - gpu::memory_fence(); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); asm("" ::"r"(stop)); volatile auto output = result; From 
6a7f572ef9758f49fcf9e178ce1cb95aa3069415 Mon Sep 17 00:00:00 2001 From: Jacob Lalonde Date: Fri, 18 Jul 2025 13:05:15 -0700 Subject: [PATCH 384/813] [LLDB] Fix Memory64 BaseRVA, move all non-stack memory to Mem64. (#146777) ### Context Over a year ago, I landed support for 64b Memory ranges in Minidump (#95312). In this patch we added the Memory64 list stream, which is effectively a Linked List on disk. The layout is a sixteen byte header and then however many Memory descriptors. ### The Bug This is a classic off-by one error, where I added 8 bytes instead of 16 for the header. This caused the first region to start 8 bytes before the correct RVA, thus shifting all memory reads by 8 bytes. We are correctly writing all the regions to disk correctly, with no physical corruption but the RVA is defined wrong, meaning we were incorrectly reading memory ![image](https://github.com/user-attachments/assets/049ef55d-856c-4f3c-9376-aeaa3fe8c0e1) ### Why wasn't this caught? One problem we've had is forcing Minidump to actually use the 64b mode, it would be a massive waste of resources to have a test that actually wrote >4.2gb of IO to validate the 64b regions, and so almost all validation has been manual. As a weakness of manual testing, this issue is psuedo non-deterministic, as what regions end up in 64b or 32b is handled greedily and iterated in the order it's laid out in /proc/pid/maps. We often validated 64b was written correctly by hexdumping the Minidump itself, which was not corrupted (other than the BaseRVA) ![image](https://github.com/user-attachments/assets/b599e3be-2d59-47e2-8a2d-75f182bb0b1d) ### Why is this showing up now? During internal usage, we had a bug report that the Minidump wasn't displaying values. I was unable to repro the issue, but during my investigation I saw the variables were in the 64b regions which resulted in me identifying the bug. ### How do we prevent future regressions? 
To prevent regressions, and honestly to save my sanity for figuring out where 8 bytes magically came from, I've added a new API to SBSaveCoreOptions. ```SBSaveCoreOptions::GetMemoryRegionsToSave()``` The ability to get the memory regions that we intend to include in the Coredump. I added this so we can compare what we intended to include versus what was actually included. Traditionally we've always had issues comparing regions because Minidump includes `/proc/pid/maps` and it can be difficult to know what memoryregion read failure was a genuine error or just a page that wasn't meant to be included. We are also leveraging this API to choose the memory regions to be generated, as well as for testing what regions should be bytewise 1:1. After much debate with @clayborg, I've moved all non-stack memory to the Memory64 List. This list doesn't incur us any meaningful overhead and Greg originally suggested doing this in the original 64b PR. This also means we're exercising the 64b path every single time we save a Minidump, preventing regressions on this feature from slipping through testing in the future. 
Snippet produced by [minidump.py](https://github.com/clayborg/scripts) ``` MINIDUMP_MEMORY_LIST: NumberOfMemoryRanges = 0x00000002 MemoryRanges[0] = [0x00007f61085ff9f0 - 0x00007f6108601000) @ 0x0003f655 MemoryRanges[1] = [0x00007ffe47e50910 - 0x00007ffe47e52000) @ 0x00040c65 MINIDUMP_MEMORY64_LIST: NumberOfMemoryRanges = 0x000000000000002e BaseRva = 0x0000000000042669 MemoryRanges[0] = [0x00005584162d8000 - 0x00005584162d9000) MemoryRanges[1] = [0x00005584162d9000 - 0x00005584162db000) MemoryRanges[2] = [0x00005584162db000 - 0x00005584162dd000) MemoryRanges[3] = [0x00005584162dd000 - 0x00005584162ff000) MemoryRanges[4] = [0x00007f6100000000 - 0x00007f6100021000) MemoryRanges[5] = [0x00007f6108800000 - 0x00007f6108828000) MemoryRanges[6] = [0x00007f6108828000 - 0x00007f610899d000) MemoryRanges[7] = [0x00007f610899d000 - 0x00007f61089f9000) MemoryRanges[8] = [0x00007f61089f9000 - 0x00007f6108a08000) MemoryRanges[9] = [0x00007f6108bf5000 - 0x00007f6108bf7000) ``` ### Misc As a part of this fix I had to look at LLDB logs a lot, you'll notice I added `0x` to many of the PRIx64 `LLDB_LOGF`. This is so the user (or I) can directly copy paste the address in the logs instead of adding the hex prefix themselves. Added some SBSaveCore tests for the new GetMemoryAPI, and Docstrings. 
CC: @DavidSpickett, @da-viper @labath because we've been working together on save-core plugins, review it optional and I didn't tag you but figured you'd want to know --- .../interface/SBSaveCoreOptionsDocstrings.i | 10 ++ .../include/lldb/API/SBMemoryRegionInfoList.h | 1 + lldb/include/lldb/API/SBSaveCoreOptions.h | 15 +++ lldb/include/lldb/Core/PluginManager.h | 3 +- lldb/include/lldb/Symbol/SaveCoreOptions.h | 7 +- lldb/source/API/SBProcess.cpp | 11 +- lldb/source/API/SBSaveCoreOptions.cpp | 25 +++++ lldb/source/Commands/CommandObjectProcess.cpp | 3 +- lldb/source/Core/PluginManager.cpp | 14 +-- .../Minidump/MinidumpFileBuilder.cpp | 78 +++++++------- lldb/source/Symbol/SaveCoreOptions.cpp | 36 +++++-- .../TestProcessSaveCoreMinidump64b.py | 102 ++++++++++++++++++ .../TestSBSaveCoreOptions.py | 43 ++++++++ 13 files changed, 290 insertions(+), 58 deletions(-) create mode 100644 lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump64b.py diff --git a/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i b/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i index 6907164a1b95c..1df4d2b26212d 100644 --- a/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i +++ b/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i @@ -45,6 +45,10 @@ Note that currently ELF Core files are not supported." Resetting will result in the reset of all process specific options, such as Threads to save." ) lldb::SBSaveCoreOptions::SetProcess; +%feature("docstring", " + Get the process to save. If a process is not defined, whether by calling clear or by not setting a process, an invalid process will be returned." +) lldb::SBSaveCoreOptions::GetProcess; + %feature("docstring", " Add an SBThread to be saved, an error will be returned if an SBThread from a different process is specified. The process is set either by the first SBThread added to the options container, or explicitly by the SetProcess call." 
@@ -63,6 +67,12 @@ Note that currently ELF Core files are not supported." Get an SBThreadCollection of all threads marked to be saved. This collection is not sorted according to insertion order." ) lldb::SBSaveCoreOptions::GetThreadsToSave; +%feature("docstring", " + Get an SBMemoryRegionInfoList of all the Regions that LLDB will attempt to write into the Core. Note, reading from these + regions can fail, and it's not guaraunteed every region will be present in the resulting core. If called without a valid process or style set an empty + collection will be returned." +) lldb::SBSaveCoreOptions::GetMemoryRegionsToSave; + %feature("docstring", " Get the current total number of bytes the core is expected to have, excluding the overhead of the core file format. Requires both a Process and a Style to be specified. An error will be returned if the provided options would result in no data being saved." diff --git a/lldb/include/lldb/API/SBMemoryRegionInfoList.h b/lldb/include/lldb/API/SBMemoryRegionInfoList.h index 1d939dff55faa..8ac9c1aceb6f6 100644 --- a/lldb/include/lldb/API/SBMemoryRegionInfoList.h +++ b/lldb/include/lldb/API/SBMemoryRegionInfoList.h @@ -45,6 +45,7 @@ class LLDB_API SBMemoryRegionInfoList { private: friend class SBProcess; + friend class SBSaveCoreOptions; lldb_private::MemoryRegionInfos &ref(); diff --git a/lldb/include/lldb/API/SBSaveCoreOptions.h b/lldb/include/lldb/API/SBSaveCoreOptions.h index 37552c13d0f36..7b05377966965 100644 --- a/lldb/include/lldb/API/SBSaveCoreOptions.h +++ b/lldb/include/lldb/API/SBSaveCoreOptions.h @@ -12,6 +12,7 @@ #include "lldb/API/SBDefines.h" #include "lldb/API/SBError.h" #include "lldb/API/SBFileSpec.h" +#include "lldb/API/SBMemoryRegionInfoList.h" #include "lldb/API/SBProcess.h" #include "lldb/API/SBThread.h" #include "lldb/API/SBThreadCollection.h" @@ -78,6 +79,13 @@ class LLDB_API SBSaveCoreOptions { /// api, or implicitly from any function that requires a process. 
SBError SetProcess(lldb::SBProcess process); + /// Get the process to save, if the process is not set an invalid SBProcess + /// will be returned. + /// + /// \return + /// The set process, or an invalid SBProcess if no process is set. + SBProcess GetProcess(); + /// Add a thread to save in the core file. /// /// \param thread @@ -119,6 +127,13 @@ class LLDB_API SBSaveCoreOptions { /// an empty collection will be returned. SBThreadCollection GetThreadsToSave() const; + /// Get an unsorted copy of all memory regions to save + /// + /// \returns + /// An unsorted copy of all memory regions to save. If no process or style + /// is specified an empty collection will be returned. + SBMemoryRegionInfoList GetMemoryRegionsToSave(); + /// Get the current total number of bytes the core is expected to have /// excluding the overhead of the core file format. Requires a Process and /// Style to be specified. diff --git a/lldb/include/lldb/Core/PluginManager.h b/lldb/include/lldb/Core/PluginManager.h index 369785ceea5a5..aa60b7c6693ca 100644 --- a/lldb/include/lldb/Core/PluginManager.h +++ b/lldb/include/lldb/Core/PluginManager.h @@ -261,8 +261,7 @@ class PluginManager { static ObjectFileCreateMemoryInstance GetObjectFileCreateMemoryCallbackForPluginName(llvm::StringRef name); - static Status SaveCore(const lldb::ProcessSP &process_sp, - lldb_private::SaveCoreOptions &core_options); + static Status SaveCore(lldb_private::SaveCoreOptions &core_options); static std::vector GetSaveCorePluginNames(); diff --git a/lldb/include/lldb/Symbol/SaveCoreOptions.h b/lldb/include/lldb/Symbol/SaveCoreOptions.h index da66b184745db..697549706ed07 100644 --- a/lldb/include/lldb/Symbol/SaveCoreOptions.h +++ b/lldb/include/lldb/Symbol/SaveCoreOptions.h @@ -9,6 +9,7 @@ #ifndef LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H #define LLDB_SOURCE_PLUGINS_OBJECTFILE_SaveCoreOPTIONS_H +#include "lldb/Target/CoreFileMemoryRanges.h" #include "lldb/Target/ThreadCollection.h" #include 
"lldb/Utility/FileSpec.h" #include "lldb/Utility/RangeMap.h" @@ -23,7 +24,7 @@ namespace lldb_private { class SaveCoreOptions { public: - SaveCoreOptions(){}; + SaveCoreOptions() = default; ~SaveCoreOptions() = default; lldb_private::Status SetPluginName(const char *name); @@ -36,17 +37,19 @@ class SaveCoreOptions { const std::optional GetOutputFile() const; Status SetProcess(lldb::ProcessSP process_sp); + lldb::ProcessSP GetProcess() { return m_process_sp; } Status AddThread(lldb::ThreadSP thread_sp); bool RemoveThread(lldb::ThreadSP thread_sp); bool ShouldThreadBeSaved(lldb::tid_t tid) const; bool HasSpecifiedThreads() const; - Status EnsureValidConfiguration(lldb::ProcessSP process_sp) const; + Status EnsureValidConfiguration() const; const MemoryRanges &GetCoreFileMemoryRanges() const; void AddMemoryRegionToSave(const lldb_private::MemoryRegionInfo ®ion); + llvm::Expected GetMemoryRegionsToSave(); lldb_private::ThreadCollection::collection GetThreadsToSave() const; llvm::Expected GetCurrentSizeInBytes(); diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp index 4de5929d6b230..d4be64b815369 100644 --- a/lldb/source/API/SBProcess.cpp +++ b/lldb/source/API/SBProcess.cpp @@ -1263,6 +1263,15 @@ lldb::SBError SBProcess::SaveCore(SBSaveCoreOptions &options) { return error; } + if (!options.GetProcess()) + options.SetProcess(process_sp); + + if (options.GetProcess().GetSP() != process_sp) { + error = Status::FromErrorString( + "Save Core Options configured for a different process."); + return error; + } + std::lock_guard guard( process_sp->GetTarget().GetAPIMutex()); @@ -1271,7 +1280,7 @@ lldb::SBError SBProcess::SaveCore(SBSaveCoreOptions &options) { return error; } - error.ref() = PluginManager::SaveCore(process_sp, options.ref()); + error.ref() = PluginManager::SaveCore(options.ref()); return error; } diff --git a/lldb/source/API/SBSaveCoreOptions.cpp b/lldb/source/API/SBSaveCoreOptions.cpp index 15584abaac013..e8b81ee57f5a9 100644 --- 
a/lldb/source/API/SBSaveCoreOptions.cpp +++ b/lldb/source/API/SBSaveCoreOptions.cpp @@ -81,6 +81,11 @@ SBError SBSaveCoreOptions::SetProcess(lldb::SBProcess process) { return m_opaque_up->SetProcess(process.GetSP()); } +SBProcess SBSaveCoreOptions::GetProcess() { + LLDB_INSTRUMENT_VA(this); + return SBProcess(m_opaque_up->GetProcess()); +} + SBError SBSaveCoreOptions::AddThread(lldb::SBThread thread) { LLDB_INSTRUMENT_VA(this, thread); return m_opaque_up->AddThread(thread.GetSP()); @@ -128,6 +133,26 @@ uint64_t SBSaveCoreOptions::GetCurrentSizeInBytes(SBError &error) { return *expected_bytes; } +lldb::SBMemoryRegionInfoList SBSaveCoreOptions::GetMemoryRegionsToSave() { + LLDB_INSTRUMENT_VA(this); + llvm::Expected memory_ranges = + m_opaque_up->GetMemoryRegionsToSave(); + if (!memory_ranges) { + llvm::consumeError(memory_ranges.takeError()); + return SBMemoryRegionInfoList(); + } + + SBMemoryRegionInfoList memory_region_infos; + for (const auto &range : *memory_ranges) { + SBMemoryRegionInfo region_info( + nullptr, range.GetRangeBase(), range.GetRangeEnd(), + range.data.lldb_permissions, /*mapped=*/true); + memory_region_infos.Append(region_info); + } + + return memory_region_infos; +} + lldb_private::SaveCoreOptions &SBSaveCoreOptions::ref() const { return *m_opaque_up; } diff --git a/lldb/source/Commands/CommandObjectProcess.cpp b/lldb/source/Commands/CommandObjectProcess.cpp index 1181b2d95c8b4..84c576e721e71 100644 --- a/lldb/source/Commands/CommandObjectProcess.cpp +++ b/lldb/source/Commands/CommandObjectProcess.cpp @@ -1354,7 +1354,8 @@ class CommandObjectProcessSaveCore : public CommandObjectParsed { FileSystem::Instance().Resolve(output_file); auto &core_dump_options = m_options.m_core_dump_options; core_dump_options.SetOutputFile(output_file); - Status error = PluginManager::SaveCore(process_sp, core_dump_options); + core_dump_options.SetProcess(process_sp); + Status error = PluginManager::SaveCore(core_dump_options); if (error.Success()) { if 
(core_dump_options.GetStyle() == SaveCoreStyle::eSaveCoreDirtyOnly || diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp index bece690a85fa5..588736715f817 100644 --- a/lldb/source/Core/PluginManager.cpp +++ b/lldb/source/Core/PluginManager.cpp @@ -952,27 +952,26 @@ PluginManager::GetObjectFileCreateMemoryCallbackForPluginName( return nullptr; } -Status PluginManager::SaveCore(const lldb::ProcessSP &process_sp, - lldb_private::SaveCoreOptions &options) { +Status PluginManager::SaveCore(lldb_private::SaveCoreOptions &options) { Status error; if (!options.GetOutputFile()) { error = Status::FromErrorString("No output file specified"); return error; } - if (!process_sp) { + if (!options.GetProcess()) { error = Status::FromErrorString("Invalid process"); return error; } - error = options.EnsureValidConfiguration(process_sp); + error = options.EnsureValidConfiguration(); if (error.Fail()) return error; if (!options.GetPluginName().has_value()) { // Try saving core directly from the process plugin first. llvm::Expected ret = - process_sp->SaveCore(options.GetOutputFile()->GetPath()); + options.GetProcess()->SaveCore(options.GetOutputFile()->GetPath()); if (!ret) return Status::FromError(ret.takeError()); if (ret.get()) @@ -984,7 +983,10 @@ Status PluginManager::SaveCore(const lldb::ProcessSP &process_sp, auto instances = GetObjectFileInstances().GetSnapshot(); for (auto &instance : instances) { if (plugin_name.empty() || instance.name == plugin_name) { - if (instance.save_core && instance.save_core(process_sp, options, error)) + // TODO: Refactor the instance.save_core() to not require a process and + // get it from options instead. 
+ if (instance.save_core && + instance.save_core(options.GetProcess(), options, error)) return error; } } diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index 806f256d9da48..fe28213c49740 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -836,13 +836,13 @@ Status MinidumpFileBuilder::AddMemoryList() { // 32 bit memory descriptiors, so we emit them first to ensure the memory is // in accessible with a 32 bit offset. std::vector ranges_32; - std::vector ranges_64; - CoreFileMemoryRanges all_core_memory_ranges; - error = m_process_sp->CalculateCoreFileSaveRanges(m_save_core_options, - all_core_memory_ranges); + llvm::Expected all_core_memory_ranges_maybe = + m_save_core_options.GetMemoryRegionsToSave(); + if (!all_core_memory_ranges_maybe) + return Status::FromError(all_core_memory_ranges_maybe.takeError()); - if (error.Fail()) - return error; + const CoreFileMemoryRanges &all_core_memory_ranges = + *all_core_memory_ranges_maybe; lldb_private::Progress progress("Saving Minidump File", "", all_core_memory_ranges.GetSize()); @@ -868,6 +868,10 @@ Status MinidumpFileBuilder::AddMemoryList() { } } + // The header has to be in 32b memory, as it needs to be addressable by a 32b + // RVA. Everything else can be 64b. + total_size += sizeof(llvm::minidump::MemoryListHeader); + if (total_size >= UINT32_MAX) { error = Status::FromErrorStringWithFormat( "Unable to write minidump. Stack memory " @@ -876,35 +880,15 @@ Status MinidumpFileBuilder::AddMemoryList() { return error; } - // After saving the stacks, we start packing as much as we can into 32b. - // We apply a generous padding here so that the Directory, MemoryList and - // Memory64List sections all begin in 32b addressable space. - // Then anything overflow extends into 64b addressable space. 
- // all_core_memory_vec will either contain all stack regions at this point, - // or be empty if it's a stack only minidump. - if (!all_core_memory_vec.empty()) - total_size += 256 + (all_core_memory_vec.size() * - sizeof(llvm::minidump::MemoryDescriptor_64)); - - for (const auto &core_range : all_core_memory_vec) { - const addr_t range_size = core_range.range.size(); - // We don't need to check for stacks here because we already removed them - // from all_core_memory_ranges. - if (total_size + range_size < UINT32_MAX) { - ranges_32.push_back(core_range); - total_size += range_size; - } else { - ranges_64.push_back(core_range); - } - } - + // Save only the thread stacks to the 32b memory list. Everything else will + // get put in Memory64, this simplifies tracking error = AddMemoryList_32(ranges_32, progress); if (error.Fail()) return error; // Add the remaining memory as a 64b range. - if (!ranges_64.empty()) { - error = AddMemoryList_64(ranges_64, progress); + if (!all_core_memory_ranges.IsEmpty()) { + error = AddMemoryList_64(all_core_memory_vec, progress); if (error.Fail()) return error; } @@ -977,6 +961,7 @@ Status MinidumpFileBuilder::ReadWriteMemoryInChunks( const lldb::addr_t addr = range.range.start(); const lldb::addr_t size = range.range.size(); Log *log = GetLog(LLDBLog::Object); + uint64_t total_bytes_read = 0; Status addDataError; Process::ReadMemoryChunkCallback callback = [&](Status &error, lldb::addr_t current_addr, const void *buf, @@ -984,7 +969,7 @@ Status MinidumpFileBuilder::ReadWriteMemoryInChunks( if (error.Fail() || bytes_read == 0) { LLDB_LOGF(log, "Failed to read memory region at: 0x%" PRIx64 - ". Bytes read: %" PRIx64 ", error: %s", + ". 
Bytes read: 0x%" PRIx64 ", error: %s", current_addr, bytes_read, error.AsCString()); // If we failed in a memory read, we would normally want to skip @@ -997,6 +982,21 @@ Status MinidumpFileBuilder::ReadWriteMemoryInChunks( return lldb_private::IterationAction::Stop; } + if (current_addr != addr + total_bytes_read) { + LLDB_LOGF(log, + "Current addr is at unexpected address, 0x%" PRIx64 + ", expected at 0x%" PRIx64, + current_addr, addr + total_bytes_read); + + // Something went wrong and the address is not where it should be + // we'll error out of this Minidump generation. + addDataError = Status::FromErrorStringWithFormat( + "Unexpected address encounterd when reading memory in chunks " + "0x%" PRIx64 " expected 0x%" PRIx64, + current_addr, addr + total_bytes_read); + return lldb_private::IterationAction::Stop; + } + // Write to the minidump file with the chunk potentially flushing to // disk. // This error will be captured by the outer scope and is considered fatal. @@ -1006,13 +1006,13 @@ Status MinidumpFileBuilder::ReadWriteMemoryInChunks( if (addDataError.Fail()) return lldb_private::IterationAction::Stop; + total_bytes_read += bytes_read; // If we have a partial read, report it, but only if the partial read // didn't finish reading the entire region. 
- if (bytes_read != data_buffer.GetByteSize() && - current_addr + bytes_read != size) { + if (bytes_read != data_buffer.GetByteSize() && total_bytes_read != size) { LLDB_LOGF(log, - "Memory region at: %" PRIx64 " partiall read 0x%" PRIx64 - " bytes out of %" PRIx64 " bytes.", + "Memory region at: 0x%" PRIx64 " partial read 0x%" PRIx64 + " bytes out of 0x%" PRIx64 " bytes.", current_addr, bytes_read, data_buffer.GetByteSize() - bytes_read); @@ -1059,7 +1059,7 @@ MinidumpFileBuilder::AddMemoryList_32(std::vector &ranges, LLDB_LOGF(log, "AddMemoryList %zu/%zu reading memory for region " - "(%" PRIx64 " bytes) [%" PRIx64 ", %" PRIx64 ")", + "(0x%" PRIx64 " bytes) [0x%" PRIx64 ", 0x%" PRIx64 ")", region_index, ranges.size(), size, addr, addr + size); ++region_index; @@ -1117,7 +1117,7 @@ MinidumpFileBuilder::AddMemoryList_64(std::vector &ranges, return error; error = AddDirectory(StreamType::Memory64List, - (sizeof(llvm::support::ulittle64_t) * 2) + + (sizeof(llvm::minidump::Memory64ListHeader)) + ranges.size() * sizeof(llvm::minidump::MemoryDescriptor_64)); if (error.Fail()) @@ -1130,9 +1130,9 @@ MinidumpFileBuilder::AddMemoryList_64(std::vector &ranges, // Capture the starting offset for all the descriptors so we can clean them up // if needed. offset_t starting_offset = - GetCurrentDataEndOffset() + sizeof(llvm::support::ulittle64_t); + GetCurrentDataEndOffset() + sizeof(llvm::minidump::Memory64ListHeader); // The base_rva needs to start after the directories, which is right after - // this 8 byte variable. + // the descriptors + the size of the header. 
offset_t base_rva = starting_offset + (ranges.size() * sizeof(llvm::minidump::MemoryDescriptor_64)); diff --git a/lldb/source/Symbol/SaveCoreOptions.cpp b/lldb/source/Symbol/SaveCoreOptions.cpp index f93b58f59cf96..6d762a66181cf 100644 --- a/lldb/source/Symbol/SaveCoreOptions.cpp +++ b/lldb/source/Symbol/SaveCoreOptions.cpp @@ -124,16 +124,14 @@ void SaveCoreOptions::AddMemoryRegionToSave( const MemoryRanges &SaveCoreOptions::GetCoreFileMemoryRanges() const { return m_regions_to_save; } -Status -SaveCoreOptions::EnsureValidConfiguration(lldb::ProcessSP process_sp) const { +Status SaveCoreOptions::EnsureValidConfiguration() const { Status error; std::string error_str; if (!m_threads_to_save.empty() && GetStyle() == lldb::eSaveCoreFull) error_str += "Cannot save a full core with a subset of threads\n"; - if (m_process_sp && m_process_sp != process_sp) - error_str += "Cannot save core for process using supplied core options. " - "Options were constructed targeting a different process. \n"; + if (!m_process_sp) + error_str += "Need to assign a valid process\n"; if (!error_str.empty()) error = Status(error_str); @@ -155,12 +153,30 @@ SaveCoreOptions::GetThreadsToSave() const { return thread_collection; } +llvm::Expected +SaveCoreOptions::GetMemoryRegionsToSave() { + Status error; + if (!m_process_sp) + return Status::FromErrorString("Requires a process to be set.").takeError(); + + error = EnsureValidConfiguration(); + if (error.Fail()) + return error.takeError(); + + CoreFileMemoryRanges ranges; + error = m_process_sp->CalculateCoreFileSaveRanges(*this, ranges); + if (error.Fail()) + return error.takeError(); + + return ranges; +} + llvm::Expected SaveCoreOptions::GetCurrentSizeInBytes() { Status error; if (!m_process_sp) return Status::FromErrorString("Requires a process to be set.").takeError(); - error = EnsureValidConfiguration(m_process_sp); + error = EnsureValidConfiguration(); if (error.Fail()) return error.takeError(); @@ -169,8 +185,14 @@ llvm::Expected 
SaveCoreOptions::GetCurrentSizeInBytes() { if (error.Fail()) return error.takeError(); + llvm::Expected core_file_ranges_maybe = + GetMemoryRegionsToSave(); + if (!core_file_ranges_maybe) + return core_file_ranges_maybe.takeError(); + const lldb_private::CoreFileMemoryRanges &core_file_ranges = + *core_file_ranges_maybe; uint64_t total_in_bytes = 0; - for (auto &core_range : ranges) + for (const auto &core_range : core_file_ranges) total_in_bytes += core_range.data.range.size(); return total_in_bytes; diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump64b.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump64b.py new file mode 100644 index 0000000000000..b86b69c8399f2 --- /dev/null +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump64b.py @@ -0,0 +1,102 @@ +""" +Test that saved memory regions is byte-wise 1:1 with the live process. Specifically +that the memory regions that will be populated in the Memory64List are the same byte for byte. 
+""" + +import os +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class ProcessSaveCoreMinidump64bTestCase(TestBase): + def verify_minidump( + self, + options, + ): + """Verify that the minidump is the same byte for byte as the live process.""" + self.build() + exe = self.getBuildArtifact("a.out") + target = self.dbg.CreateTarget(exe) + core_target = None + live_proc = target.LaunchSimple( + None, None, self.get_process_working_directory() + ) + try: + self.assertState(live_proc.GetState(), lldb.eStateStopped) + error = live_proc.SaveCore(options) + self.assertTrue(error.Success(), error.GetCString()) + core_target = self.dbg.CreateTarget(None) + core_proc = target.LoadCore(options.GetOutputFile().fullpath) + # Get the memory regions we saved off in this core, we can't compare to the core + # because we pull from /proc/pid/maps, so even ranges that don't get mapped in will show up + # as ranges in the minidump. + # + # Instead, we have an API that returns to us the number of regions we planned to save from the live process + # and we compare those + memory_regions_to_compare = options.GetMemoryRegionsToSave() + + for region in memory_regions_to_compare: + start_addr = region.GetRegionBase() + end_addr = region.GetRegionEnd() + actual_process_read_error = lldb.SBError() + actual = live_proc.ReadMemory( + start_addr, end_addr - start_addr, actual_process_read_error + ) + expected_process_read_error = lldb.SBError() + expected = core_proc.ReadMemory( + start_addr, end_addr - start_addr, expected_process_read_error + ) + + # Both processes could fail to read a given memory region, so if they both pass + # compare, then we'll fail them if the core differs from the live process. 
+ if ( + actual_process_read_error.Success() + and expected_process_read_error.Success() + ): + self.assertEqual( + actual, expected, "Bytes differ between live process and core" + ) + + # Now we check if the error is the same, error isn't abnormal but they should fail for the same reason + # Success will be false if they both fail + self.assertTrue( + actual_process_read_error.Success() + == expected_process_read_error.Success(), + f"Address range {hex(start_addr)} - {hex(end_addr)} failed to read from live process and core for different reasons", + ) + finally: + self.assertTrue(self.dbg.DeleteTarget(target)) + if core_target is not None: + self.assertTrue(self.dbg.DeleteTarget(core_target)) + + @skipUnlessArch("x86_64") + @skipUnlessPlatform(["linux"]) + def test_minidump_save_style_full(self): + """Test that a full minidump is the same byte for byte.""" + minidump_path = self.getBuildArtifact("minidump_full_force64b.dmp") + try: + options = lldb.SBSaveCoreOptions() + options.SetOutputFile(lldb.SBFileSpec(minidump_path)) + options.SetStyle(lldb.eSaveCoreFull) + options.SetPluginName("minidump") + self.verify_minidump(options) + finally: + if os.path.isfile(minidump_path): + os.unlink(minidump_path) + + @skipUnlessArch("x86_64") + @skipUnlessPlatform(["linux"]) + def test_minidump_save_style_mixed_memory(self): + """Test that a mixed memory minidump is the same byte for byte.""" + minidump_path = self.getBuildArtifact("minidump_mixed_force64b.dmp") + try: + options = lldb.SBSaveCoreOptions() + options.SetOutputFile(lldb.SBFileSpec(minidump_path)) + options.SetStyle(lldb.eSaveCoreDirtyOnly) + options.SetPluginName("minidump") + self.verify_minidump(options) + finally: + if os.path.isfile(minidump_path): + os.unlink(minidump_path) diff --git a/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py b/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py index 31e35e0285f17..92ca44ecbbffc 100644 --- 
a/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py +++ b/lldb/test/API/python_api/sbsavecoreoptions/TestSBSaveCoreOptions.py @@ -164,3 +164,46 @@ def test_get_total_in_bytes_missing_requirements(self): options.SetStyle(lldb.eSaveCoreCustomOnly) total = options.GetCurrentSizeInBytes(error) self.assertTrue(error.Fail(), error.GetCString()) + + def test_get_memory_regions_to_save(self): + """ + Tests the matrix of responses for GetMemoryRegionsToSave + """ + + options = lldb.SBSaveCoreOptions() + + # Not specifying plugin or process should return an empty list. + memory_list = options.GetMemoryRegionsToSave() + self.assertEqual(0, memory_list.GetSize()) + + # No style returns an empty list + process = self.get_basic_process() + options.SetProcess(process) + memory_list = options.GetMemoryRegionsToSave() + self.assertEqual(0, memory_list.GetSize()) + options.Clear() + + # No Process returns an empty list + options.SetStyle(lldb.eSaveCoreCustomOnly) + memory_list = options.GetMemoryRegionsToSave() + self.assertEqual(0, memory_list.GetSize()) + options.Clear() + + # Validate we get back the single region we populate + options.SetStyle(lldb.eSaveCoreCustomOnly) + process = self.get_basic_process() + options.SetProcess(process) + memory_range = lldb.SBMemoryRegionInfo() + + # Add the memory range of 0x1000-0x1100 + process.GetMemoryRegionInfo(0x1000, memory_range) + options.AddMemoryRegionToSave(memory_range) + memory_list = options.GetMemoryRegionsToSave() + self.assertEqual(1, memory_list.GetSize()) + read_region = lldb.SBMemoryRegionInfo() + memory_list.GetMemoryRegionAtIndex(0, read_region) + + # Permissions from Process getLLDBRegion aren't matching up with + # the live process permissions, so we're just checking the range for now. 
+ self.assertEqual(memory_range.GetRegionBase(), read_region.GetRegionBase()) + self.assertEqual(memory_range.GetRegionEnd(), read_region.GetRegionEnd()) From 4dc6dfd65397e65f62a453b65cd180639c3a8b9e Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Fri, 18 Jul 2025 13:08:29 -0700 Subject: [PATCH 385/813] [NFC][profdata] Apply lints and other format fixes (#149433) Apply lints and other format fixes to `llvm/tools/llvm-profdata/llvm-profdata.cpp`. This is intended to have no functional change. --- llvm/tools/llvm-profdata/llvm-profdata.cpp | 120 ++++++++++----------- 1 file changed, 58 insertions(+), 62 deletions(-) diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 45eac90aef935..5efabd5f2a7c6 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -16,7 +16,6 @@ #include "llvm/Debuginfod/HTTPClient.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Object/Binary.h" -#include "llvm/ProfileData/DataAccessProf.h" #include "llvm/ProfileData/InstrProfCorrelator.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/InstrProfWriter.h" @@ -54,23 +53,23 @@ using ProfCorrelatorKind = InstrProfCorrelator::ProfCorrelatorKind; // https://llvm.org/docs/CommandGuide/llvm-profdata.html has documentations // on each subcommand. -cl::SubCommand ShowSubcommand( +static cl::SubCommand ShowSubcommand( "show", "Takes a profile data file and displays the profiles. See detailed " "documentation in " "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-show"); -cl::SubCommand OrderSubcommand( +static cl::SubCommand OrderSubcommand( "order", "Reads temporal profiling traces from a profile and outputs a function " "order that reduces the number of page faults for those traces. 
See " "detailed documentation in " "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-order"); -cl::SubCommand OverlapSubcommand( +static cl::SubCommand OverlapSubcommand( "overlap", "Computes and displays the overlap between two profiles. See detailed " "documentation in " "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-overlap"); -cl::SubCommand MergeSubcommand( +static cl::SubCommand MergeSubcommand( "merge", "Takes several profiles and merge them together. See detailed " "documentation in " @@ -93,12 +92,11 @@ enum class ShowFormat { Text, Json, Yaml }; } // namespace // Common options. -cl::opt OutputFilename("output", cl::value_desc("output"), - cl::init("-"), cl::desc("Output file"), - cl::sub(ShowSubcommand), - cl::sub(OrderSubcommand), - cl::sub(OverlapSubcommand), - cl::sub(MergeSubcommand)); +static cl::opt + OutputFilename("output", cl::value_desc("output"), cl::init("-"), + cl::desc("Output file"), cl::sub(ShowSubcommand), + cl::sub(OrderSubcommand), cl::sub(OverlapSubcommand), + cl::sub(MergeSubcommand)); // NOTE: cl::alias must not have cl::sub(), since aliased option's cl::sub() // will be used. llvm::cl::alias::done() method asserts this condition. static cl::alias OutputFilenameA("o", cl::desc("Alias for --output"), @@ -528,9 +526,9 @@ static void exitWithError(Twine Message, StringRef Whence = "", static void exitWithError(Error E, StringRef Whence = "") { if (E.isA()) { handleAllErrors(std::move(E), [&](const InstrProfError &IPE) { - instrprof_error instrError = IPE.get(); + instrprof_error InstrError = IPE.get(); StringRef Hint = ""; - if (instrError == instrprof_error::unrecognized_format) { + if (InstrError == instrprof_error::unrecognized_format) { // Hint in case user missed specifying the profile type. Hint = "Perhaps you forgot to use the --sample or --memory option?"; } @@ -637,7 +635,7 @@ class SymbolRemapper { return New.empty() ? 
Name : FunctionId(New); } }; -} +} // namespace struct WeightedFile { std::string Filename; @@ -827,18 +825,18 @@ loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, // Only show hint the first time an error occurs. auto [ErrCode, Msg] = InstrProfError::take(std::move(E)); std::unique_lock ErrGuard{WC->ErrLock}; - bool firstTime = WC->WriterErrorCodes.insert(ErrCode).second; + bool FirstTime = WC->WriterErrorCodes.insert(ErrCode).second; handleMergeWriterError(make_error(ErrCode, Msg), - Input.Filename, FuncName, firstTime); + Input.Filename, FuncName, FirstTime); }); } if (KeepVTableSymbols) { - const InstrProfSymtab &symtab = Reader->getSymtab(); - const auto &VTableNames = symtab.getVTableNames(); + const InstrProfSymtab &Symtab = Reader->getSymtab(); + const auto &VTableNames = Symtab.getVTableNames(); - for (const auto &kv : VTableNames) - WC->Writer.addVTableName(kv.getKey()); + for (const auto &KV : VTableNames) + WC->Writer.addVTableName(KV.getKey()); } if (Reader->hasTemporalProfile()) { @@ -879,8 +877,8 @@ static void mergeWriterContexts(WriterContext *Dst, WriterContext *Src) { Dst->Writer.mergeRecordsFromWriter(std::move(Src->Writer), [&](Error E) { auto [ErrorCode, Msg] = InstrProfError::take(std::move(E)); std::unique_lock ErrGuard{Dst->ErrLock}; - bool firstTime = Dst->WriterErrorCodes.insert(ErrorCode).second; - if (firstTime) + bool FirstTime = Dst->WriterErrorCodes.insert(ErrorCode).second; + if (FirstTime) warn(toString(make_error(ErrorCode, Msg))); }); } @@ -890,24 +888,22 @@ getFuncName(const StringMap::value_type &Val) { return Val.first(); } -static std::string -getFuncName(const SampleProfileMap::value_type &Val) { +static std::string getFuncName(const SampleProfileMap::value_type &Val) { return Val.second.getContext().toString(); } -template -static void filterFunctions(T &ProfileMap) { - bool hasFilter = !FuncNameFilter.empty(); - bool hasNegativeFilter = !FuncNameNegativeFilter.empty(); - if (!hasFilter && !hasNegativeFilter) 
+template static void filterFunctions(T &ProfileMap) { + bool HasFilter = !FuncNameFilter.empty(); + bool HasNegativeFilter = !FuncNameNegativeFilter.empty(); + if (!HasFilter && !HasNegativeFilter) return; // If filter starts with '?' it is MSVC mangled name, not a regex. llvm::Regex ProbablyMSVCMangledName("[?@$_0-9A-Za-z]+"); - if (hasFilter && FuncNameFilter[0] == '?' && + if (HasFilter && FuncNameFilter[0] == '?' && ProbablyMSVCMangledName.match(FuncNameFilter)) FuncNameFilter = llvm::Regex::escape(FuncNameFilter); - if (hasNegativeFilter && FuncNameNegativeFilter[0] == '?' && + if (HasNegativeFilter && FuncNameNegativeFilter[0] == '?' && ProbablyMSVCMangledName.match(FuncNameNegativeFilter)) FuncNameNegativeFilter = llvm::Regex::escape(FuncNameNegativeFilter); @@ -915,9 +911,9 @@ static void filterFunctions(T &ProfileMap) { llvm::Regex Pattern(FuncNameFilter); llvm::Regex NegativePattern(FuncNameNegativeFilter); std::string Error; - if (hasFilter && !Pattern.isValid(Error)) + if (HasFilter && !Pattern.isValid(Error)) exitWithError(Error); - if (hasNegativeFilter && !NegativePattern.isValid(Error)) + if (HasNegativeFilter && !NegativePattern.isValid(Error)) exitWithError(Error); // Handle MD5 profile, so it is still able to match using the original name. @@ -929,10 +925,10 @@ static void filterFunctions(T &ProfileMap) { auto Tmp = I++; const auto &FuncName = getFuncName(*Tmp); // Negative filter has higher precedence than positive filter. 
- if ((hasNegativeFilter && + if ((HasNegativeFilter && (NegativePattern.match(FuncName) || (FunctionSamples::UseMD5 && NegativeMD5Name == FuncName))) || - (hasFilter && !(Pattern.match(FuncName) || + (HasFilter && !(Pattern.match(FuncName) || (FunctionSamples::UseMD5 && MD5Name == FuncName)))) ProfileMap.erase(Tmp); } @@ -1193,7 +1189,7 @@ adjustInstrProfile(std::unique_ptr &WC, StringMap StaticFuncMap; InstrProfSummaryBuilder IPBuilder(ProfileSummaryBuilder::DefaultCutoffs); - auto checkSampleProfileHasFUnique = [&Reader]() { + auto CheckSampleProfileHasFUnique = [&Reader]() { for (const auto &PD : Reader->getProfiles()) { auto &FContext = PD.second.getContext(); if (FContext.toString().find(FunctionSamples::UniqSuffix) != @@ -1204,9 +1200,9 @@ adjustInstrProfile(std::unique_ptr &WC, return false; }; - bool SampleProfileHasFUnique = checkSampleProfileHasFUnique(); + bool SampleProfileHasFUnique = CheckSampleProfileHasFUnique(); - auto buildStaticFuncMap = [&StaticFuncMap, + auto BuildStaticFuncMap = [&StaticFuncMap, SampleProfileHasFUnique](const StringRef Name) { std::string FilePrefixes[] = {".cpp", "cc", ".c", ".hpp", ".h"}; size_t PrefixPos = StringRef::npos; @@ -1366,7 +1362,7 @@ adjustInstrProfile(std::unique_ptr &WC, InstrProfRecord *R = &PD.getValue().begin()->second; StringRef FullName = PD.getKey(); InstrProfileMap[FullName] = InstrProfileEntry(R); - buildStaticFuncMap(FullName); + BuildStaticFuncMap(FullName); } for (auto &PD : Reader->getProfiles()) { @@ -1497,8 +1493,8 @@ remapSamples(const sampleprof::FunctionSamples &Samples, BodySample.second.getSamples()); for (const auto &Target : BodySample.second.getCallTargets()) { Result.addCalledTargetSamples(BodySample.first.LineOffset, - MaskedDiscriminator, - Remapper(Target.first), Target.second); + MaskedDiscriminator, Remapper(Target.first), + Target.second); } } for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) { @@ -1759,7 +1755,7 @@ static void parseInputFilenamesFile(MemoryBuffer 
*Buffer, if (SanitizedEntry.starts_with("#")) continue; // If there's no comma, it's an unweighted profile. - else if (!SanitizedEntry.contains(',')) + if (!SanitizedEntry.contains(',')) addWeightedInput(WFV, {std::string(SanitizedEntry), 1}); else addWeightedInput(WFV, parseWeightedFile(SanitizedEntry)); @@ -2740,10 +2736,11 @@ std::error_code SampleOverlapAggregator::loadProfiles() { return std::error_code(); } -void overlapSampleProfile(const std::string &BaseFilename, - const std::string &TestFilename, - const OverlapFuncFilters &FuncFilter, - uint64_t SimilarityCutoff, raw_fd_ostream &OS) { +static void overlapSampleProfile(const std::string &BaseFilename, + const std::string &TestFilename, + const OverlapFuncFilters &FuncFilter, + uint64_t SimilarityCutoff, + raw_fd_ostream &OS) { using namespace sampleprof; // We use 0.000005 to initialize OverlapAggr.Epsilon because the final metrics @@ -2883,7 +2880,7 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { OS << ":ir\n"; for (const auto &Func : *Reader) { - if (Reader->isIRLevelProfile()) { + if (IsIRInstr) { bool FuncIsCS = NamedInstrProfRecord::hasCSFlagInHash(Func.Hash); if (FuncIsCS != ShowCS) continue; @@ -2891,9 +2888,7 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { bool Show = ShowAllFunctions || (!FuncNameFilter.empty() && Func.Name.contains(FuncNameFilter)); - bool doTextFormatDump = (Show && TextFormat); - - if (doTextFormatDump) { + if (Show && TextFormat) { InstrProfSymtab &Symtab = Reader->getSymtab(); InstrProfWriter::writeRecordInText(Func.Name, Func.Hash, Func, Symtab, OS); @@ -2931,9 +2926,9 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { continue; } - for (size_t I = 0, E = Func.Counts.size(); I < E; ++I) { - FuncMax = std::max(FuncMax, Func.Counts[I]); - FuncSum += Func.Counts[I]; + for (const auto &Count : Func.Counts) { + FuncMax = std::max(FuncMax, Count); + FuncSum += Count; } if (FuncMax < ShowValueCutoff) { @@ 
-2943,7 +2938,8 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { << " Sum = " << FuncSum << ")\n"; } continue; - } else if (OnlyListBelow) + } + if (OnlyListBelow) continue; if (TopNFunctions) { @@ -3017,9 +3013,8 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { if (TextFormat || ShowCovered) return 0; std::unique_ptr PS(Builder.getSummary()); - bool IsIR = Reader->isIRLevelProfile(); - OS << "Instrumentation level: " << (IsIR ? "IR" : "Front-end"); - if (IsIR) { + OS << "Instrumentation level: " << (IsIRInstr ? "IR" : "Front-end"); + if (IsIRInstr) { OS << " entry_first = " << Reader->instrEntryBBEnabled(); OS << " instrument_loop_entries = " << Reader->instrLoopEntriesEnabled(); } @@ -3076,10 +3071,10 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { auto &Traces = Reader->getTemporalProfTraces(); OS << "Temporal Profile Traces (samples=" << Traces.size() << " seen=" << Reader->getTemporalProfTraceStreamSize() << "):\n"; - for (unsigned i = 0; i < Traces.size(); i++) { - OS << " Temporal Profile Trace " << i << " (weight=" << Traces[i].Weight - << " count=" << Traces[i].FunctionNameRefs.size() << "):\n"; - for (auto &NameRef : Traces[i].FunctionNameRefs) + for (auto [Index, Trace] : llvm::enumerate(Traces)) { + OS << " Temporal Profile Trace " << Index << " (weight=" << Trace.Weight + << " count=" << Trace.FunctionNameRefs.size() << "):\n"; + for (auto &NameRef : Trace.FunctionNameRefs) OS << " " << Reader->getSymtab().getFuncOrVarName(NameRef) << "\n"; } } @@ -3392,7 +3387,8 @@ static int show_main(StringRef ProgName) { exitWithErrorCode(EC, OutputFilename); if (ShowAllFunctions && !FuncNameFilter.empty()) - WithColor::warning() << "-function argument ignored: showing all functions\n"; + WithColor::warning() + << "-function argument ignored: showing all functions\n"; if (!DebugInfoFilename.empty()) return showDebugInfoCorrelation(DebugInfoFilename, SFormat, OS); From 
b846d8c3e26ef98c8d6936e7cad354f035d322d1 Mon Sep 17 00:00:00 2001 From: Hanumanth Date: Fri, 18 Jul 2025 16:12:57 -0400 Subject: [PATCH 386/813] [mlir][tosa] Fix tosa-reduce-transposes to handle large constants better (#148755) This change addresses the performance issue in the **--tosa-reduce-transposes** implementation by working directly with the raw tensor data, eliminating the need for creating the costly intermediate attributes that leads to bottleneck. --- .../Tosa/Transforms/TosaReduceTransposes.cpp | 136 ++++++++++-------- 1 file changed, 76 insertions(+), 60 deletions(-) diff --git a/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp b/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp index 8ebbbc94eb6a2..db7a3c671dedc 100644 --- a/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp +++ b/mlir/lib/Dialect/Tosa/Transforms/TosaReduceTransposes.cpp @@ -178,10 +178,8 @@ std::optional TosaReduceTransposes::transposeDenseAttribute(DenseElementsAttr input, ArrayRef perms) { RankedTensorType oldType = llvm::cast(input.getType()); - RankedTensorType newType = - RankedTensorType::get(applyTOSAPermutation(oldType.getShape(), perms), - oldType.getElementType()); - size_t rank = oldType.getRank(); + ArrayRef oldShape = oldType.getShape(); + int64_t rank = oldType.getRank(); // Asserted by TransposeOp verifier and TOSA disallowing tensor with dimension // 0. If not in place, something is very wrong. 
@@ -190,65 +188,83 @@ TosaReduceTransposes::transposeDenseAttribute(DenseElementsAttr input, return std::nullopt; } - if (input.isSplat()) + auto newShape = applyTOSAPermutation(oldShape, perms); + RankedTensorType newType = + RankedTensorType::get(newShape, oldType.getElementType()); + + if (input.isSplat()) { return input.reshape(newType); + } + + auto rawData = input.getRawData(); + if (!rawData.data()) { + return std::nullopt; + } // The algorithm is approximately as follows: - // input: perms, input flat array, input tensor type - // (1/2) determine the strides of input/output if - // they were strided in row-major order. (3) adjust the strides for the - // input to be in the same order of indices as the output is written. - // (4) process dimension by dimension. example: perms 2, 0, 1; input - // 2x3x4; output 4x2x3 for i ... 4, j ... 2, k ... 3: output[i][j][k] = - // input[j][k][i] output[6i + 3j + k] = input[12j + 4k + i] and we adjust - // input strides to be as input[i + 12j + 4k] so we may process - // layer-by-layer. - - // Step 1/2: Strides for input. We ignore output since row-major and can just - // push_back. - - SmallVector originalInputStrides(rank); - originalInputStrides[rank - 1] = 1; - // index with int64_t to avoid overflow - for (int64_t i = rank - 2; i >= 0; i--) - originalInputStrides[i] = - originalInputStrides[i + 1] * oldType.getDimSize(i + 1); - - // Step 3: Transpose strides of input to be same indexing (i, j, k, ...) as - // output which is done in row-major order. - - SmallVector newInputStrides; - newInputStrides.reserve(rank); - for (int32_t v : perms) - newInputStrides.push_back(originalInputStrides[v]); - - // Step 4: Write out the transposed "flat array" dimension by dimension. 
- - auto inputArray = input.getValues(); - SmallVector> boundsAndStrides; - for (size_t i = 0; i < rank; i++) - boundsAndStrides.push_back({newType.getDimSize(i), newInputStrides[i]}); - - SmallVector resultArray; - resultArray.reserve(inputArray.size()); - - std::function>::const_iterator)> - processTransposeDim = [&](auto accumulatedIndex, auto it) { - if (it == boundsAndStrides.end()) { - resultArray.push_back(inputArray[accumulatedIndex]); - return; - } - - for (int64_t i = 0; i < it->first; i++) { - int64_t j = accumulatedIndex + i * it->second; - processTransposeDim(j, it + 1); - } - }; - - processTransposeDim(0, boundsAndStrides.begin()); - - return DenseElementsAttr::get(newType, resultArray); + // 1. Determine the strides of both input and output tensors in row-major + // order + // 2. Iterate through the output tensor linearly. + // 3. For each output position, decompose the linear index into + // multi-dimensional coordinates using output strides. + // 4. Use the permutation to map output coordinates to input coordinates and + // calculate the source linear index. + + // Example: perms [2, 0, 1]; input 2x3x4; output 4x2x3 + // for output linear index 11: decompose to output[1][1][2] + // using output strides [6,3,1]. 
Map to input coordinates using + // perms: dim 0→2, dim 1→0, dim 2→1, giving source position + // calculated as 1*inputStrides[2] + 1*inputStrides[0] + 2*inputStrides[1] + // = 1*1 + 1*12 + 2*4 = 21 + + size_t elementSize = oldType.getElementTypeBitWidth() / 8; + int64_t numElements = oldType.getNumElements(); + + SmallVector outputBuffer(numElements * elementSize); + const char *inputPtr = rawData.data(); + char *outputPtr = outputBuffer.data(); + + auto calculateStrides = [](ArrayRef shape) -> SmallVector { + int64_t rank = shape.size(); + SmallVector strides(rank); + strides[rank - 1] = 1; + for (int64_t i = rank - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * shape[i + 1]; + } + return strides; + }; + + // Calculate strides for both input and output tensors + SmallVector inputStrides = calculateStrides(oldShape); + SmallVector outputStrides = calculateStrides(newShape); + + auto mapCoordinates = [&](int64_t destLinearIndex) -> int64_t { + int64_t tempDestIndex = destLinearIndex; + int64_t sourceLinearIndex = 0; + + // Decompose linear destination index into multi-dimensional + // coordinates dividing by output strides. + // Simultaneously map these coordinates through the permutation + // to calculate the corresponding source linear index. + for (auto j : llvm::seq(rank)) { + int64_t destCoord = tempDestIndex / outputStrides[j]; + tempDestIndex %= outputStrides[j]; + sourceLinearIndex += destCoord * inputStrides[perms[j]]; + } + + return sourceLinearIndex; + }; + + for (auto destLinearIndex : llvm::seq(numElements)) { + int64_t sourceLinearIndex = mapCoordinates(destLinearIndex); + + // Copy the element from source to destination using type-agnostic byte + // copying. 
+ std::memcpy(outputPtr + destLinearIndex * elementSize, + inputPtr + sourceLinearIndex * elementSize, elementSize); + } + + return DenseElementsAttr::getFromRawBuffer(newType, outputBuffer); } // The SetVector should only contain ConstOp, ReshapeOp, TransposeOp From d64802d6d96ec5aff3739ce34f8143b935921809 Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Fri, 18 Jul 2025 15:26:09 -0500 Subject: [PATCH 387/813] [lldb][framework] Glob headers from source for framework (#148736) When gathering the headers to fix up and place in LLDB.framework, we were previously globbing the header files from a location in the build directory. This commit changes this to glob from the source directory instead, as we were globbing from the build directory without ensuring that the necessary files were actually in that location before globbing. --- lldb/cmake/modules/LLDBFramework.cmake | 43 ------------------- lldb/scripts/framework-header-fix.py | 8 ++-- lldb/scripts/version-header-fix.py | 4 ++ lldb/source/API/CMakeLists.txt | 24 ++++++++++- .../Shell/Scripts/TestFrameworkFixScript.test | 2 +- .../Scripts/TestRPCFrameworkFixScript.test | 2 +- 6 files changed, 33 insertions(+), 50 deletions(-) diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake index bbd717a982cf3..c6f00ed05cfc2 100644 --- a/lldb/cmake/modules/LLDBFramework.cmake +++ b/lldb/cmake/modules/LLDBFramework.cmake @@ -70,33 +70,6 @@ endif() find_program(unifdef_EXECUTABLE unifdef) -# All necessary header files will be staged in the include directory in the build directory, -# so just copy the files from there into the framework's staging directory. 
-set(lldb_build_dir_header_staging "${CMAKE_BINARY_DIR}/include/lldb") -set(lldb_framework_header_staging "${CMAKE_CURRENT_BINARY_DIR}/FrameworkHeaders") -file(GLOB lldb_build_dir_header_staging_list ${lldb_build_dir_header_staging}/*) -foreach(header ${lldb_build_dir_header_staging_list}) - - get_filename_component(basename ${header} NAME) - set(staged_header ${lldb_framework_header_staging}/${basename}) - - if(unifdef_EXECUTABLE) - # unifdef returns 0 when the file is unchanged and 1 if something was changed. - # That means if we successfully remove SWIG code, the build system believes - # that the command has failed and stops. This is undesirable. - set(copy_command ${unifdef_EXECUTABLE} -USWIG -o ${staged_header} ${header} || (exit 0)) - else() - set(copy_command ${CMAKE_COMMAND} -E copy ${header} ${staged_header}) - endif() - - add_custom_command( - DEPENDS ${header} OUTPUT ${staged_header} - COMMAND ${copy_command} - COMMENT "LLDB.framework: collect framework header and remove SWIG macros") - - list(APPEND lldb_staged_headers ${staged_header}) -endforeach() - # Wrap output in a target, so lldb-framework can depend on it. add_custom_target(liblldb-resource-headers DEPENDS lldb-sbapi-dwarf-enums ${lldb_staged_headers}) set_target_properties(liblldb-resource-headers PROPERTIES FOLDER "LLDB/Resources") @@ -105,22 +78,6 @@ set_target_properties(liblldb-resource-headers PROPERTIES FOLDER "LLDB/Resources add_dependencies(liblldb-resource-headers liblldb-header-staging) add_dependencies(liblldb liblldb-resource-headers) -# Take the headers from the staging directory and fix up their includes for the framework. -# Then write them to the output directory. -# Also, run unifdef to remove any specified guards from the header files. 
-file(GLOB lldb_framework_header_staging_list ${lldb_framework_header_staging}/*) -foreach(header ${lldb_framework_header_staging_list}) - - set(input_header ${header}) - get_filename_component(header_basename ${input_header} NAME) - set(output_header $/Headers/${header_basename}) - - add_custom_command(TARGET liblldb POST_BUILD - COMMAND ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.py -f lldb_main -i ${input_header} -o ${output_header} -p ${unifdef_EXECUTABLE} USWIG - COMMENT "LLDB.framework: Fix up and copy framework headers" - ) -endforeach() - # Copy vendor-specific headers from clang (without staging). if(NOT APPLE_EMBEDDED) if (TARGET clang-resource-headers) diff --git a/lldb/scripts/framework-header-fix.py b/lldb/scripts/framework-header-fix.py index 6ea8df4c24dd4..aa034db36968d 100755 --- a/lldb/scripts/framework-header-fix.py +++ b/lldb/scripts/framework-header-fix.py @@ -97,7 +97,7 @@ def main(): parser.add_argument("-o", "--output_file") parser.add_argument("-p", "--unifdef_path") parser.add_argument( - "unifdef_guards", + "--unifdef_guards", nargs="+", type=str, help="Guards to be removed with unifdef. These must be specified in the same way as they would be when passed directly into unifdef.", @@ -111,7 +111,8 @@ def main(): # unifdef takes the guards to remove as arguments in their own right (e.g. -USWIG) # but passing them in with dashes for this script causes argparse to think that they're # arguments in and of themself, so they need to passed in without dashes. - unifdef_guards = ["-" + guard for guard in args.unifdef_guards] + if args.unifdef_guards: + unifdef_guards = ["-" + guard for guard in args.unifdef_guards] # Create the framework's header dir if it doesn't already exist if not os.path.exists(os.path.dirname(output_file_path)): @@ -123,7 +124,8 @@ def main(): modify_rpc_includes(input_file_path, output_file_path) # After the incldues have been modified, run unifdef on the headers to remove any guards # specified at the command line. 
- remove_guards(output_file_path, unifdef_path, unifdef_guards) + if args.unifdef_guards: + remove_guards(output_file_path, unifdef_path, unifdef_guards) if __name__ == "__main__": diff --git a/lldb/scripts/version-header-fix.py b/lldb/scripts/version-header-fix.py index 98457e6f5b3cd..0caf7c62bc91f 100755 --- a/lldb/scripts/version-header-fix.py +++ b/lldb/scripts/version-header-fix.py @@ -29,6 +29,10 @@ def main(): input_path = str(args.input_path) output_path = str(args.output_path) + # Create the output dir if it doesn't already exist + if not os.path.exists(os.path.dirname(output_path)): + os.makedirs(os.path.dirname(output_path)) + with open(input_path, "r") as input_file: lines = input_file.readlines() file_buffer = "".join(lines) diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index 4751ed319b259..197c98c0d2b0c 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -295,12 +295,21 @@ endif() # Stage all headers in the include directory in the build dir. 
file(GLOB public_headers ${LLDB_SOURCE_DIR}/include/lldb/API/*.h) set(lldb_header_staging_dir ${CMAKE_BINARY_DIR}/include/lldb) +set(generated_public_headers ${LLDB_OBJ_DIR}/include/lldb/API/SBLanguages.h) file(GLOB root_public_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-*.h) file(GLOB root_private_headers ${LLDB_SOURCE_DIR}/include/lldb/lldb-private*.h) list(REMOVE_ITEM root_public_headers ${root_private_headers}) find_program(unifdef_EXECUTABLE unifdef) +add_custom_target(liblldb-header-staging DEPENDS ${lldb_staged_headers} ${lldb_header_staging_dir}/lldb-defines.h) + +if (LLDB_BUILD_FRAMEWORK) + add_custom_target(lldb-framework-fixup-all-headers) + add_dependencies(lldb-framework-fixup-all-headers liblldb-header-staging) + add_dependencies(liblldb lldb-framework-fixup-all-headers) +endif() + foreach(header ${public_headers} ${generated_public_headers} @@ -323,12 +332,23 @@ foreach(header COMMENT "LLDB headers: stage LLDB headers in include directory") list(APPEND lldb_staged_headers ${staged_header}) + + if (LLDB_BUILD_FRAMEWORK) + set(output_header $/Headers/${basename}) + + add_custom_target(lldb-framework-fixup-header-${basename} DEPENDS ${staged_header}) + add_dependencies(lldb-framework-fixup-all-headers lldb-framework-fixup-header-${basename}) + + add_custom_command(TARGET lldb-framework-fixup-header-${basename} POST_BUILD + COMMAND "${Python3_EXECUTABLE}" ${LLDB_SOURCE_DIR}/scripts/framework-header-fix.py -f lldb_main -i ${staged_header} -o ${output_header} + COMMENT "LLDB.framework: Fix up and copy framework headers" + ) + endif() endforeach() -add_custom_command(TARGET liblldb POST_BUILD +add_custom_command(TARGET liblldb-header-staging POST_BUILD COMMAND "${Python3_EXECUTABLE}" ${LLDB_SOURCE_DIR}/scripts/version-header-fix.py -i ${LLDB_SOURCE_DIR}/include/lldb/lldb-defines.h -o ${lldb_header_staging_dir}/lldb-defines.h -m ${LLDB_VERSION_MAJOR} -n ${LLDB_VERSION_MINOR} -p ${LLDB_VERSION_PATCH} ) -add_custom_target(liblldb-header-staging DEPENDS 
${lldb_staged_headers}) add_dependencies(liblldb liblldb-header-staging) if(LLDB_BUILD_FRAMEWORK) diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test index 5c48b796efda4..2b1818ef8d6c6 100644 --- a/lldb/test/Shell/Scripts/TestFrameworkFixScript.test +++ b/lldb/test/Shell/Scripts/TestFrameworkFixScript.test @@ -1,6 +1,6 @@ # Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir. RUN: mkdir -p %t/Outputs -RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG +RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef --unifdef_guards USWIG # Check the output RUN: cat %t/Outputs/SBAddress.h | FileCheck %s diff --git a/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test index d015942653967..e2080ca01a6fc 100644 --- a/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test +++ b/lldb/test/Shell/Scripts/TestRPCFrameworkFixScript.test @@ -1,6 +1,6 @@ # Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir. 
RUN: mkdir -p %t/Outputs -RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_rpc -i %p/Inputs/RPC/RPCSBAddress.h -o %t/Outputs/RPCSBAddress.h -p /usr/bin/unifdef USWIG +RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_rpc -i %p/Inputs/RPC/RPCSBAddress.h -o %t/Outputs/RPCSBAddress.h -p /usr/bin/unifdef --unifdef_guards USWIG # Check the output RUN: cat %t/Outputs/RPCSBAddress.h | FileCheck %s From 9d9662e4bdffc109cc5a7d0fa7c522d27babfa31 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Fri, 18 Jul 2025 13:27:31 -0700 Subject: [PATCH 388/813] [NVPTX][test] fixup version for ptxas on trunc-tofp.ll (#149558) --- llvm/test/CodeGen/NVPTX/trunc-tofp.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/trunc-tofp.ll b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll index 404c423cc026a..12502b6f29899 100644 --- a/llvm/test/CodeGen/NVPTX/trunc-tofp.ll +++ b/llvm/test/CodeGen/NVPTX/trunc-tofp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=sm_80 | FileCheck %s -; RUN: %if ptxas %{ llc < %s -mcpu=sm_80 | %ptxas-verify %} +; RUN: llc < %s -mcpu=sm_50 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mcpu=sm_50 | %ptxas-verify -arch=sm_50 %} target triple = "nvptx64-nvidia-cuda" From 3c1a09d939f44cbe039ea178af5a77c40b2776a0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Jul 2025 13:32:42 -0700 Subject: [PATCH 389/813] [lldb] Use a range-based for loop instead of llvm::for_each (NFC) (#149541) LLVM Coding Standards discourages llvm::for_each unless we already have a callable. 
--- .../SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp b/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp index f9aa6b1a98765..b775ec98c9a17 100644 --- a/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp +++ b/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp @@ -87,9 +87,8 @@ class PluginProperties : public Properties { void ServerURLsChangedCallback() { m_server_urls = GetDebugInfoDURLs(); llvm::SmallVector dbginfod_urls; - llvm::for_each(m_server_urls, [&](const auto &obj) { + for (const auto &obj : m_server_urls) dbginfod_urls.push_back(obj.ref()); - }); llvm::setDefaultDebuginfodUrls(dbginfod_urls); } // Storage for the StringRef's used within the Debuginfod library. From 36c78ec3c8641cb193ea66f49da01fa6f62280d7 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Jul 2025 13:32:49 -0700 Subject: [PATCH 390/813] [DebugInfo] Use llvm::remove_if (NFC) (#149543) We can pass a range to llvm::remove_if. 
--- llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp b/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp index 93a567e89f774..64f1bfc015380 100644 --- a/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Core/LVScope.cpp @@ -263,7 +263,7 @@ bool LVScope::removeElement(LVElement *Element) { return Item == Element; }; auto RemoveElement = [Element, Predicate](auto &Container) -> bool { - auto Iter = std::remove_if(Container->begin(), Container->end(), Predicate); + auto Iter = llvm::remove_if(*Container, Predicate); if (Iter != Container->end()) { Container->erase(Iter, Container->end()); Element->resetParent(); From c98b05bd567c16d575bc241cd0602cdf6558d8db Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Jul 2025 13:32:56 -0700 Subject: [PATCH 391/813] [mlir] Deprecate NamedAttrList(std::nullopt_t) (NFC) (#149544) This patch deprecates NamedAttrList(std::nullopt_t) to avoid use of std::nullopt outside the context of std::optional. 
--- mlir/include/mlir/IR/OperationSupport.h | 1 + mlir/lib/AsmParser/Parser.cpp | 4 ++-- mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp | 2 +- mlir/unittests/IR/OperationSupportTest.cpp | 4 ++-- mlir/unittests/IR/ValueTest.cpp | 2 +- mlir/unittests/Transforms/DialectConversion.cpp | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h index 65e6d4f64e36c..5282c47dd7036 100644 --- a/mlir/include/mlir/IR/OperationSupport.h +++ b/mlir/include/mlir/IR/OperationSupport.h @@ -802,6 +802,7 @@ class NamedAttrList { using size_type = size_t; NamedAttrList() : dictionarySorted({}, true) {} + LLVM_DEPRECATED("Use NamedAttrList() instead", "NamedAttrList()") NamedAttrList(std::nullopt_t none) : NamedAttrList() {} NamedAttrList(ArrayRef attributes); NamedAttrList(DictionaryAttr attributes); diff --git a/mlir/lib/AsmParser/Parser.cpp b/mlir/lib/AsmParser/Parser.cpp index 756d3d01a4534..435ff713a1b29 100644 --- a/mlir/lib/AsmParser/Parser.cpp +++ b/mlir/lib/AsmParser/Parser.cpp @@ -1198,8 +1198,8 @@ Value OperationParser::createForwardRefPlaceholder(SMLoc loc, Type type) { auto name = OperationName("builtin.unrealized_conversion_cast", getContext()); auto *op = Operation::create( getEncodedSourceLocation(loc), name, type, /*operands=*/{}, - /*attributes=*/std::nullopt, /*properties=*/nullptr, /*successors=*/{}, - /*numRegions=*/0); + /*attributes=*/NamedAttrList(), /*properties=*/nullptr, + /*successors=*/{}, /*numRegions=*/0); forwardRefPlaceholders[op->getResult(0)] = loc; forwardRefOps.insert(op); return op->getResult(0); diff --git a/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp b/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp index f688fa97e8409..6a81422b6b66b 100644 --- a/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp +++ b/mlir/unittests/Debug/FileLineColLocBreakpointManagerTest.cpp @@ -24,7 +24,7 @@ static Operation 
*createOp(MLIRContext *context, Location loc, unsigned int numRegions = 0) { context->allowUnregisteredDialects(); return Operation::create(loc, OperationName(operationName, context), {}, {}, - std::nullopt, OpaqueProperties(nullptr), {}, + NamedAttrList(), OpaqueProperties(nullptr), {}, numRegions); } diff --git a/mlir/unittests/IR/OperationSupportTest.cpp b/mlir/unittests/IR/OperationSupportTest.cpp index 4b800fa36a375..7bc1a044d0dad 100644 --- a/mlir/unittests/IR/OperationSupportTest.cpp +++ b/mlir/unittests/IR/OperationSupportTest.cpp @@ -24,7 +24,7 @@ static Operation *createOp(MLIRContext *context, ArrayRef operands = {}, context->allowUnregisteredDialects(); return Operation::create(UnknownLoc::get(context), OperationName("foo.bar", context), resultTypes, - operands, std::nullopt, nullptr, {}, numRegions); + operands, NamedAttrList(), nullptr, {}, numRegions); } namespace { @@ -236,7 +236,7 @@ TEST(OperationFormatPrintTest, CanPrintNameAsPrefix) { Operation *op = Operation::create( NameLoc::get(StringAttr::get(&context, "my_named_loc")), OperationName("t.op", &context), builder.getIntegerType(16), {}, - std::nullopt, nullptr, {}, 0); + NamedAttrList(), nullptr, {}, 0); std::string str; OpPrintingFlags flags; diff --git a/mlir/unittests/IR/ValueTest.cpp b/mlir/unittests/IR/ValueTest.cpp index fc671be39f1eb..97e32d474d522 100644 --- a/mlir/unittests/IR/ValueTest.cpp +++ b/mlir/unittests/IR/ValueTest.cpp @@ -22,7 +22,7 @@ static Operation *createOp(MLIRContext *context, ArrayRef operands = {}, context->allowUnregisteredDialects(); return Operation::create(UnknownLoc::get(context), OperationName("foo.bar", context), resultTypes, - operands, std::nullopt, nullptr, {}, numRegions); + operands, NamedAttrList(), nullptr, {}, numRegions); } namespace { diff --git a/mlir/unittests/Transforms/DialectConversion.cpp b/mlir/unittests/Transforms/DialectConversion.cpp index 7bb27f721414c..6418c9dc0ac5b 100644 --- a/mlir/unittests/Transforms/DialectConversion.cpp +++ 
b/mlir/unittests/Transforms/DialectConversion.cpp @@ -15,7 +15,7 @@ static Operation *createOp(MLIRContext *context) { context->allowUnregisteredDialects(); return Operation::create(UnknownLoc::get(context), OperationName("foo.bar", context), {}, {}, - std::nullopt, /*properties=*/nullptr, {}, 0); + NamedAttrList(), /*properties=*/nullptr, {}, 0); } namespace { From cb6370167fd26d61397c1a2555d4c8a5f116d1f6 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Jul 2025 13:33:05 -0700 Subject: [PATCH 392/813] [mlir] Deprecate OpPrintingFlags(std::nullopt_t) (NFC) (#149546) This patch deprecates OpPrintingFlags(std::nullopt_t) to avoid use of std::nullopt outside the context of std::optional. --- mlir/include/mlir/IR/OpDefinition.h | 2 +- mlir/include/mlir/IR/Operation.h | 2 +- mlir/include/mlir/IR/OperationSupport.h | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/IR/OpDefinition.h b/mlir/include/mlir/IR/OpDefinition.h index 75c3aea0792ac..883ece32967e4 100644 --- a/mlir/include/mlir/IR/OpDefinition.h +++ b/mlir/include/mlir/IR/OpDefinition.h @@ -115,7 +115,7 @@ class OpState { MLIRContext *getContext() { return getOperation()->getContext(); } /// Print the operation to the given stream. - void print(raw_ostream &os, OpPrintingFlags flags = std::nullopt) { + void print(raw_ostream &os, OpPrintingFlags flags = {}) { state->print(os, flags); } void print(raw_ostream &os, AsmState &asmState) { diff --git a/mlir/include/mlir/IR/Operation.h b/mlir/include/mlir/IR/Operation.h index 1c2c04e718bf7..fa8a4873572ce 100644 --- a/mlir/include/mlir/IR/Operation.h +++ b/mlir/include/mlir/IR/Operation.h @@ -318,7 +318,7 @@ class alignas(8) Operation final /// take O(N) where N is the number of operations within the parent block. 
bool isBeforeInBlock(Operation *other); - void print(raw_ostream &os, const OpPrintingFlags &flags = std::nullopt); + void print(raw_ostream &os, const OpPrintingFlags &flags = {}); void print(raw_ostream &os, AsmState &state); void dump(); diff --git a/mlir/include/mlir/IR/OperationSupport.h b/mlir/include/mlir/IR/OperationSupport.h index 5282c47dd7036..1ff7c56ddca38 100644 --- a/mlir/include/mlir/IR/OperationSupport.h +++ b/mlir/include/mlir/IR/OperationSupport.h @@ -1176,6 +1176,7 @@ class alignas(8) OperandStorage { class OpPrintingFlags { public: OpPrintingFlags(); + LLVM_DEPRECATED("Use OpPrintingFlags() instead", "OpPrintingFlags()") OpPrintingFlags(std::nullopt_t) : OpPrintingFlags() {} /// Enables the elision of large elements attributes by printing a lexically From c5f0c4ad378803f449f37730601b7d95059600a4 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 18 Jul 2025 13:28:33 -0700 Subject: [PATCH 393/813] [RISCV][IA] Add test coverage for vp.store of interleaveN with one active --- .../rvv/fixed-vectors-interleaved-access.ll | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index dbc8e891ab5f7..bdf344d4d16ae 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -1754,6 +1754,17 @@ define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) { ret void } +define void @vpstore_factor4_one_active(ptr %ptr, <4 x i32> %v) { +; CHECK-LABEL: vpstore_factor4_one_active: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> + tail call void @llvm.vp.store.v16i32.p0(<16 x i32> %v0, ptr %ptr, <16 x i1> splat (i1 true), i32 16) + ret void +} + define void 
@store_factor4_one_active_idx1(ptr %ptr, <4 x i32> %v) { ; CHECK-LABEL: store_factor4_one_active_idx1: ; CHECK: # %bb.0: @@ -1828,8 +1839,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_mask(ptr %ptr) { ; RV32-NEXT: vle32.v v12, (a0), v0.t ; RV32-NEXT: li a0, 36 ; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: lui a1, %hi(.LCPI53_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI53_0) +; RV32-NEXT: lui a1, %hi(.LCPI54_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI54_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v21, (a1) ; RV32-NEXT: vcompress.vm v8, v12, v11 @@ -1904,8 +1915,8 @@ define {<4 x i32>, <4 x i32>, <4 x i32>} @invalid_vp_evl(ptr %ptr) { ; RV32-NEXT: vmv.s.x v10, a0 ; RV32-NEXT: li a0, 146 ; RV32-NEXT: vmv.s.x v11, a0 -; RV32-NEXT: lui a0, %hi(.LCPI54_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI54_0) +; RV32-NEXT: lui a0, %hi(.LCPI55_0) +; RV32-NEXT: addi a0, a0, %lo(.LCPI55_0) ; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; RV32-NEXT: vle16.v v20, (a0) ; RV32-NEXT: li a0, 36 From 9878ef3abd2a48fcfb81357d581dac292b52ddb3 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 18 Jul 2025 13:33:46 -0700 Subject: [PATCH 394/813] CodeGen: Respect function align attribute if less than preferred alignment. 
Reviewers: arsenm, efriedma-quic Reviewed By: arsenm Pull Request: https://github.com/llvm/llvm-project/pull/149444 --- llvm/lib/CodeGen/MachineFunction.cpp | 3 +-- llvm/test/CodeGen/X86/function-align.ll | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/X86/function-align.ll diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 38ad582ba923c..429a17a9113d3 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -211,9 +211,8 @@ void MachineFunction::init() { ConstantPool = new (Allocator) MachineConstantPool(getDataLayout()); Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); - // FIXME: Shouldn't use pref alignment if explicit alignment is set on F. // FIXME: Use Function::hasOptSize(). - if (!F.hasFnAttribute(Attribute::OptimizeForSize)) + if (!F.getAlign() && !F.hasFnAttribute(Attribute::OptimizeForSize)) Alignment = std::max(Alignment, STI->getTargetLowering()->getPrefFunctionAlignment()); diff --git a/llvm/test/CodeGen/X86/function-align.ll b/llvm/test/CodeGen/X86/function-align.ll new file mode 100644 index 0000000000000..11d0e99929927 --- /dev/null +++ b/llvm/test/CodeGen/X86/function-align.ll @@ -0,0 +1,18 @@ +; RUN: llc -function-sections < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: .section .text.f1 +; CHECK-NOT: .p2align +; CHECK: f1: +define void @f1() align 1 { + ret void +} + +; CHECK: .section .text.f2 +; CHECK-NEXT: .globl f2 +; CHECK-NEXT: .p2align 1 +define void @f2() align 2 { + ret void +} From 97a8476068bad449c0340021398b0356a44857aa Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 18 Jul 2025 13:44:25 -0700 Subject: [PATCH 395/813] [flang][runtime] Further work on speeding up work queue operations (#149189) This patch avoids a trip through the work queue engine for cases on a CPU where 
finalization and destruction actions during assignment were handled without enqueueing another task. --- flang-rt/lib/runtime/assign.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp index d642ed578b061..7cf4147a94a95 100644 --- a/flang-rt/lib/runtime/assign.cpp +++ b/flang-rt/lib/runtime/assign.cpp @@ -279,13 +279,15 @@ RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) { if (mustDeallocateLHS) { // Convert the LHS into a temporary, then make it look deallocated. toDeallocate_ = &tempDescriptor_.descriptor(); - persist_ = true; // tempDescriptor_ state must outlive child tickets std::memcpy( reinterpret_cast(toDeallocate_), &to_, to_.SizeInBytes()); to_.set_base_addr(nullptr); if (toDerived_ && (flags_ & NeedFinalization)) { - if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)}; - status != StatOk && status != StatContinue) { + int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)}; + if (status == StatContinue) { + // tempDescriptor_ state must outlive pending child ticket + persist_ = true; + } else if (status != StatOk) { return status; } flags_ &= ~NeedFinalization; @@ -304,6 +306,9 @@ RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) { if (int stat{ReturnError( workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))}; stat != StatOk) { + if (stat == StatContinue) { + persist_ = true; + } return stat; } if (HasDynamicComponent(*from_)) { @@ -507,6 +512,7 @@ RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) { } } if (persist_) { + // tempDescriptor_ must outlive pending child ticket(s) done_ = true; return StatContinue; } else { From 680b8dd7073cce6606006ae723899444521aa496 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 18 Jul 2025 13:44:44 -0700 Subject: [PATCH 396/813] =?UTF-8?q?[flang][runtime]=20Handle=20spaces=20be?= =?UTF-8?q?fore=20')'=20in=20alternative=20list-directe=E2=80=A6=20(#14938?= 
=?UTF-8?q?4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …d complex input List-directed reads of complex values that can't go through the usual fast path (as in this bug's test case, which uses DECIMAL='COMMA') didn't skip spaces before the closing right parenthesis correctly. Fixes https://github.com/llvm/llvm-project/issues/149164. --- flang-rt/lib/runtime/edit-input.cpp | 41 +++++-------------- .../unittests/Runtime/NumericalFormatTest.cpp | 31 ++++++++++++++ 2 files changed, 41 insertions(+), 31 deletions(-) diff --git a/flang-rt/lib/runtime/edit-input.cpp b/flang-rt/lib/runtime/edit-input.cpp index 0cc287aa3b47e..13557678f6057 100644 --- a/flang-rt/lib/runtime/edit-input.cpp +++ b/flang-rt/lib/runtime/edit-input.cpp @@ -19,16 +19,19 @@ namespace Fortran::runtime::io { RT_OFFLOAD_API_GROUP_BEGIN -// Checks that a list-directed input value has been entirely consumed and -// doesn't contain unparsed characters before the next value separator. +// Handle DC or DECIMAL='COMMA' and determine the active separator character +static inline RT_API_ATTRS char32_t GetSeparatorChar(const DataEdit &edit) { + return edit.modes.editingFlags & decimalComma ? char32_t{';'} : char32_t{','}; +} + static inline RT_API_ATTRS bool IsCharValueSeparator( const DataEdit &edit, char32_t ch) { - char32_t comma{ - edit.modes.editingFlags & decimalComma ? char32_t{';'} : char32_t{','}}; - return ch == ' ' || ch == '\t' || ch == comma || ch == '/' || + return ch == ' ' || ch == '\t' || ch == '/' || ch == GetSeparatorChar(edit) || (edit.IsNamelist() && (ch == '&' || ch == '$')); } +// Checks that a list-directed input value has been entirely consumed and +// doesn't contain unparsed characters before the next value separator. 
static RT_API_ATTRS bool CheckCompleteListDirectedField( IoStatementState &io, const DataEdit &edit) { if (edit.IsListDirected()) { @@ -54,10 +57,6 @@ static RT_API_ATTRS bool CheckCompleteListDirectedField( } } -static inline RT_API_ATTRS char32_t GetSeparatorChar(const DataEdit &edit) { - return edit.modes.editingFlags & decimalComma ? char32_t{';'} : char32_t{','}; -} - template static RT_API_ATTRS bool EditBOZInput( IoStatementState &io, const DataEdit &edit, void *n, std::size_t bytes) { @@ -518,7 +517,7 @@ static RT_API_ATTRS ScannedRealInput ScanRealInput( // Consume the trailing ')' of a list-directed or NAMELIST complex // input value. if (edit.descriptor == DataEdit::ListDirectedImaginaryPart) { - if (next && (*next == ' ' || *next == '\t')) { + if (!next || *next == ' ' || *next == '\t') { io.SkipSpaces(remaining); next = io.NextInField(remaining, edit); } @@ -1006,27 +1005,7 @@ static RT_API_ATTRS bool EditListDirectedCharacterInput( // Undelimited list-directed character input: stop at a value separator // or the end of the current record. 
while (auto ch{io.GetCurrentChar(byteCount)}) { - bool isSep{false}; - switch (*ch) { - case ' ': - case '\t': - case '/': - isSep = true; - break; - case '&': - case '$': - isSep = edit.IsNamelist(); - break; - case ',': - isSep = !(edit.modes.editingFlags & decimalComma); - break; - case ';': - isSep = !!(edit.modes.editingFlags & decimalComma); - break; - default: - break; - } - if (isSep) { + if (IsCharValueSeparator(edit, *ch)) { break; } if (length > 0) { diff --git a/flang-rt/unittests/Runtime/NumericalFormatTest.cpp b/flang-rt/unittests/Runtime/NumericalFormatTest.cpp index f1492d0e39fec..73245dca13bc0 100644 --- a/flang-rt/unittests/Runtime/NumericalFormatTest.cpp +++ b/flang-rt/unittests/Runtime/NumericalFormatTest.cpp @@ -213,6 +213,37 @@ TEST(IOApiTests, ListInputTest) { << "', but got '" << output << "'"; } +TEST(IOApiTests, ListInputComplexRegressionTest) { + static const char input[]{"(1,;2, );(3,;4,)"}; + auto cookie{IONAME(BeginInternalListInput)(input, sizeof input - 1)}; + static constexpr int numRealValues{4}; + float z[numRealValues]; + ASSERT_TRUE(IONAME(SetDecimal)(cookie, "COMMA", 5)); + for (int j{0}; j < numRealValues; j += 2) { + ASSERT_TRUE(IONAME(InputComplex32)(cookie, &z[j])) + << "InputComplex32 failed with value " << z[j]; + } + auto status{IONAME(EndIoStatement)(cookie)}; + ASSERT_EQ(status, 0) << "Failed complex list-directed input, status " + << static_cast(status); + static constexpr int bufferSize{18}; + char output[bufferSize]; + output[bufferSize - 1] = '\0'; + cookie = IONAME(BeginInternalListOutput)(output, bufferSize - 1); + for (int j{0}; j < numRealValues; j += 2) { + ASSERT_TRUE(IONAME(OutputComplex32)(cookie, z[j], z[j + 1])) + << "OutputComplex32 failed when outputting value " << z[j] << ", " + << z[j + 1]; + } + status = IONAME(EndIoStatement)(cookie); + ASSERT_EQ(status, 0) << "Failed complex list-directed output, status " + << static_cast(status); + static const char expect[bufferSize]{" (1.,2.) (3.,4.) 
"}; + ASSERT_EQ(std::strncmp(output, expect, bufferSize), 0) + << "Failed complex list-directed output, expected '" << expect + << "', but got '" << output << "'"; +} + TEST(IOApiTests, DescriptorOutputTest) { static constexpr int bufferSize{10}; char buffer[bufferSize]; From 9e5b2fbe86ed9b303eff779fff012d6a96574f3d Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 18 Jul 2025 13:45:05 -0700 Subject: [PATCH 397/813] [flang][runtime] Preserve type when remapping monomorphic pointers (#149427) Pointer remappings unconditionally update the element byte size and derived type of the pointer's descriptor. This is okay when the pointer is polymorphic, but not when a pointer is associated with an extended type. To communicate this monomorphic case to the runtime, add a new entry point so as to not break forward binary compatibility. --- .../include/flang-rt/runtime/descriptor.h | 3 ++- flang-rt/lib/runtime/descriptor.cpp | 19 +++++++++++-------- flang-rt/lib/runtime/pointer.cpp | 19 ++++++++++++++++--- flang/include/flang/Lower/Runtime.h | 2 +- .../Optimizer/Builder/Runtime/Intrinsics.h | 2 +- flang/include/flang/Runtime/pointer.h | 5 +++++ flang/lib/Lower/Bridge.cpp | 6 ++++-- flang/lib/Lower/Runtime.cpp | 15 ++++++++------- flang/test/Lower/polymorphic.f90 | 11 +++++++++++ 9 files changed, 59 insertions(+), 23 deletions(-) diff --git a/flang-rt/include/flang-rt/runtime/descriptor.h b/flang-rt/include/flang-rt/runtime/descriptor.h index 68106f3462c9b..bc5a5b5f14697 100644 --- a/flang-rt/include/flang-rt/runtime/descriptor.h +++ b/flang-rt/include/flang-rt/runtime/descriptor.h @@ -478,7 +478,8 @@ class Descriptor { const SubscriptValue *upper = nullptr, const SubscriptValue *stride = nullptr); - RT_API_ATTRS void ApplyMold(const Descriptor &, int rank); + RT_API_ATTRS void ApplyMold( + const Descriptor &, int rank, bool isMonomorphic = false); RT_API_ATTRS void Check() const; diff --git a/flang-rt/lib/runtime/descriptor.cpp b/flang-rt/lib/runtime/descriptor.cpp 
index e9301bd0307d2..021440cbdd0f6 100644 --- a/flang-rt/lib/runtime/descriptor.cpp +++ b/flang-rt/lib/runtime/descriptor.cpp @@ -252,18 +252,21 @@ RT_API_ATTRS bool Descriptor::EstablishPointerSection(const Descriptor &source, return CFI_section(&raw_, &source.raw_, lower, upper, stride) == CFI_SUCCESS; } -RT_API_ATTRS void Descriptor::ApplyMold(const Descriptor &mold, int rank) { - raw_.elem_len = mold.raw_.elem_len; +RT_API_ATTRS void Descriptor::ApplyMold( + const Descriptor &mold, int rank, bool isMonomorphic) { raw_.rank = rank; - raw_.type = mold.raw_.type; for (int j{0}; j < rank && j < mold.raw_.rank; ++j) { GetDimension(j) = mold.GetDimension(j); } - if (auto *addendum{Addendum()}) { - if (auto *moldAddendum{mold.Addendum()}) { - *addendum = *moldAddendum; - } else { - INTERNAL_CHECK(!addendum->derivedType()); + if (!isMonomorphic) { + raw_.elem_len = mold.raw_.elem_len; + raw_.type = mold.raw_.type; + if (auto *addendum{Addendum()}) { + if (auto *moldAddendum{mold.Addendum()}) { + *addendum = *moldAddendum; + } else { + INTERNAL_CHECK(!addendum->derivedType()); + } } } } diff --git a/flang-rt/lib/runtime/pointer.cpp b/flang-rt/lib/runtime/pointer.cpp index 04487abd3272e..68db2594acdd4 100644 --- a/flang-rt/lib/runtime/pointer.cpp +++ b/flang-rt/lib/runtime/pointer.cpp @@ -87,9 +87,9 @@ void RTDEF(PointerAssociateLowerBounds)(Descriptor &pointer, } } -void RTDEF(PointerAssociateRemapping)(Descriptor &pointer, +static void RT_API_ATTRS PointerRemapping(Descriptor &pointer, const Descriptor &target, const Descriptor &bounds, const char *sourceFile, - int sourceLine) { + int sourceLine, bool isMonomorphic) { Terminator terminator{sourceFile, sourceLine}; SubscriptValue byteStride{/*captured from first dimension*/}; std::size_t boundElementBytes{bounds.ElementBytes()}; @@ -99,7 +99,7 @@ void RTDEF(PointerAssociateRemapping)(Descriptor &pointer, // the ranks may mismatch. Use target as a mold for initializing // the pointer descriptor. 
INTERNAL_CHECK(static_cast(pointer.rank()) == boundsRank); - pointer.ApplyMold(target, boundsRank); + pointer.ApplyMold(target, boundsRank, isMonomorphic); pointer.set_base_addr(target.raw().base_addr); pointer.raw().attribute = CFI_attribute_pointer; for (unsigned j{0}; j < boundsRank; ++j) { @@ -124,6 +124,19 @@ void RTDEF(PointerAssociateRemapping)(Descriptor &pointer, } } +void RTDEF(PointerAssociateRemapping)(Descriptor &pointer, + const Descriptor &target, const Descriptor &bounds, const char *sourceFile, + int sourceLine) { + PointerRemapping( + pointer, target, bounds, sourceFile, sourceLine, /*isMonomorphic=*/false); +} +void RTDEF(PointerAssociateRemappingMonomorphic)(Descriptor &pointer, + const Descriptor &target, const Descriptor &bounds, const char *sourceFile, + int sourceLine) { + PointerRemapping( + pointer, target, bounds, sourceFile, sourceLine, /*isMonomorphic=*/true); +} + RT_API_ATTRS void *AllocateValidatedPointerPayload( std::size_t byteSize, int allocatorIdx) { // Add space for a footer to validate during deallocation. 
diff --git a/flang/include/flang/Lower/Runtime.h b/flang/include/flang/Lower/Runtime.h index 77e98a1e019e7..f76f398569b54 100644 --- a/flang/include/flang/Lower/Runtime.h +++ b/flang/include/flang/Lower/Runtime.h @@ -70,7 +70,7 @@ void genPointerAssociate(fir::FirOpBuilder &, mlir::Location, mlir::Value pointer, mlir::Value target); void genPointerAssociateRemapping(fir::FirOpBuilder &, mlir::Location, mlir::Value pointer, mlir::Value target, - mlir::Value bounds); + mlir::Value bounds, bool isMonomorphic); void genPointerAssociateLowerBounds(fir::FirOpBuilder &, mlir::Location, mlir::Value pointer, mlir::Value target, mlir::Value lbounds); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h index 9ca4b2baeaa65..145ea04e56484 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Intrinsics.h @@ -37,7 +37,7 @@ void genPointerAssociate(fir::FirOpBuilder &, mlir::Location, mlir::Value pointer, mlir::Value target); void genPointerAssociateRemapping(fir::FirOpBuilder &, mlir::Location, mlir::Value pointer, mlir::Value target, - mlir::Value bounds); + mlir::Value bounds, bool isMonomorphic); mlir::Value genCpuTime(fir::FirOpBuilder &, mlir::Location); void genDateAndTime(fir::FirOpBuilder &, mlir::Location, diff --git a/flang/include/flang/Runtime/pointer.h b/flang/include/flang/Runtime/pointer.h index 83472ee59d2ab..6787ef3ece232 100644 --- a/flang/include/flang/Runtime/pointer.h +++ b/flang/include/flang/Runtime/pointer.h @@ -59,9 +59,14 @@ void RTDECL(PointerAssociateLowerBounds)( // Associates a pointer with a target with bounds remapping. The target must be // simply contiguous &/or of rank 1. The bounds constitute a [2,newRank] // integer array whose columns are [lower bound, upper bound] on each dimension. +// Use the Monomorphic form if the pointer's type shouldn't change and +// the target is polymorphic. 
void RTDECL(PointerAssociateRemapping)(Descriptor &, const Descriptor &target, const Descriptor &bounds, const char *sourceFile = nullptr, int sourceLine = 0); +void RTDECL(PointerAssociateRemappingMonomorphic)(Descriptor &, + const Descriptor &target, const Descriptor &bounds, + const char *sourceFile = nullptr, int sourceLine = 0); // Data pointer allocation and deallocation diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 5f0783f869bf6..7ce397a11861b 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -4703,8 +4703,10 @@ class FirConverter : public Fortran::lower::AbstractConverter { mlir::Value lhs = lhsMutableBox.getAddr(); mlir::Value rhs = fir::getBase(genExprBox(loc, assign.rhs, stmtCtx)); mlir::Value boundsDesc = createBoundArray(lbounds, ubounds, loc); - Fortran::lower::genPointerAssociateRemapping(*builder, loc, lhs, rhs, - boundsDesc); + Fortran::lower::genPointerAssociateRemapping( + *builder, loc, lhs, rhs, boundsDesc, + lhsType && rhsType && !lhsType->IsPolymorphic() && + rhsType->IsPolymorphic()); return; } if (!lowerToHighLevelFIR() && explicitIterationSpace()) { diff --git a/flang/lib/Lower/Runtime.cpp b/flang/lib/Lower/Runtime.cpp index 5f73335242336..ae8bf0e1630aa 100644 --- a/flang/lib/Lower/Runtime.cpp +++ b/flang/lib/Lower/Runtime.cpp @@ -213,14 +213,15 @@ void Fortran::lower::genPointerAssociate(fir::FirOpBuilder &builder, builder.create(loc, func, args); } -void Fortran::lower::genPointerAssociateRemapping(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value pointer, - mlir::Value target, - mlir::Value bounds) { +void Fortran::lower::genPointerAssociateRemapping( + fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value pointer, + mlir::Value target, mlir::Value bounds, bool isMonomorphic) { mlir::func::FuncOp func = - fir::runtime::getRuntimeFunc(loc, - builder); + isMonomorphic + ? 
fir::runtime::getRuntimeFunc(loc, builder) + : fir::runtime::getRuntimeFunc( + loc, builder); auto fTy = func.getFunctionType(); auto sourceFile = fir::factory::locationToFilename(builder, loc); auto sourceLine = diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index a84b495dd09d0..f586380e653a0 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -178,6 +178,17 @@ subroutine polymorphic_to_nonpolymorphic(p) ! CHECK-LABEL: func.func @_QMpolymorphic_testPpolymorphic_to_nonpolymorphic ! Just checking that FIR is generated without error. + subroutine nonpolymorphic_to_polymorphic(p, t) + type p1 + end type + type(p1), pointer :: p(:) + class(p1), target :: t(:) + p(0:1) => t + end subroutine + +! CHECK-LABEL: func.func @_QMpolymorphic_testPnonpolymorphic_to_polymorphic +! CHECK: fir.call @_FortranAPointerAssociateRemappingMonomorphic + ! Test that lowering does not crash for function return with unlimited ! polymoprhic value. From b6ea04a37b2a41e24bb999e5a9b6a7bd2b576085 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Fri, 18 Jul 2025 13:45:25 -0700 Subject: [PATCH 398/813] [flang][NFC] Fix build-time warning (#149549) Don't increment the LHS variable of an assignment that also uses that variable on the RHS. --- flang/lib/Semantics/resolve-labels.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/flang/lib/Semantics/resolve-labels.cpp b/flang/lib/Semantics/resolve-labels.cpp index 27e259fab3873..9454ef9fe928a 100644 --- a/flang/lib/Semantics/resolve-labels.cpp +++ b/flang/lib/Semantics/resolve-labels.cpp @@ -492,10 +492,9 @@ class ParseTreeAnalyzer { // Uppercase the name of the main program, so that its symbol name // would be unique from similarly named non-main-program symbols. 
auto upperCaseCharBlock = [](const parser::CharBlock &cb) { - char *ch{const_cast(cb.begin())}; - char *endCh{ch + cb.size()}; - while (ch != endCh) { - *ch++ = parser::ToUpperCaseLetter(*ch); + auto ch{const_cast(cb.begin())}; + for (char *endCh{ch + cb.size()}; ch != endCh; ++ch) { + *ch = parser::ToUpperCaseLetter(*ch); } }; const parser::CharBlock *progName{nullptr}; From abdd4536ce0fc75c7a4ddcc1da5913ec5e028091 Mon Sep 17 00:00:00 2001 From: Andre Kuhlenschmidt Date: Fri, 18 Jul 2025 13:50:09 -0700 Subject: [PATCH 399/813] [flang][openacc] fix bugs with default(none) checking (#149220) A report of the following code not generating an error led to fixing two bugs in directive checking. - We should treat CombinedConstructs as OpenACC Constructs - We should treat DoConstruct index variables as private. ```fortran subroutine sub(nn) integer :: nn, ii !$acc serial loop default(none) do ii = 1, nn end do !$acc end serial loop end subroutine ``` Here `nn` should be flagged as needing a data clause while `ii` should still get one implicitly. 
--- flang/lib/Semantics/resolve-directives.cpp | 15 +++++++++++++++ flang/test/Lower/OpenACC/acc-loop.f90 | 17 ++++++++++------- .../test/Semantics/OpenACC/acc-kernels-loop.f90 | 7 +++++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 151f4ccae634e..521c7432d9fbb 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -138,6 +138,9 @@ class AccAttributeVisitor : DirectiveAttributeVisitor { void Post(const parser::OpenACCBlockConstruct &) { PopContext(); } bool Pre(const parser::OpenACCCombinedConstruct &); void Post(const parser::OpenACCCombinedConstruct &) { PopContext(); } + void Post(const parser::AccBeginCombinedDirective &) { + GetContext().withinConstruct = true; + } bool Pre(const parser::OpenACCDeclarativeConstruct &); void Post(const parser::OpenACCDeclarativeConstruct &) { PopContext(); } @@ -160,6 +163,18 @@ class AccAttributeVisitor : DirectiveAttributeVisitor { GetContext().withinConstruct = true; } + // TODO: We should probably also privatize ConcurrentBounds. + template + bool Pre(const parser::LoopBounds &x) { + if (!dirContext_.empty() && GetContext().withinConstruct) { + if (auto *symbol{ResolveAcc( + x.name.thing, Symbol::Flag::AccPrivate, currScope())}) { + AddToContextObjectWithDSA(*symbol, Symbol::Flag::AccPrivate); + } + } + return true; + } + bool Pre(const parser::OpenACCStandaloneConstruct &); void Post(const parser::OpenACCStandaloneConstruct &) { PopContext(); } void Post(const parser::AccStandaloneDirective &) { diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90 index c6df28ec5e000..f9f5e8c2165d5 100644 --- a/flang/test/Lower/OpenACC/acc-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-loop.f90 @@ -372,12 +372,15 @@ subroutine sub1(i, j, k) end subroutine ! CHECK: func.func @_QPsub1 -! CHECK: acc.parallel -! 
CHECK: %[[DC_K:.*]] = fir.alloca i32 {bindc_name = "k"} -! CHECK: %[[DC_J:.*]] = fir.alloca i32 {bindc_name = "j"} -! CHECK: %[[DC_I:.*]] = fir.alloca i32 {bindc_name = "i"} -! CHECK: %[[P_I:.*]] = acc.private varPtr(%[[DC_I]] : !fir.ref) -> !fir.ref {implicit = true, name = "i"} -! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]] : !fir.ref) -> !fir.ref {implicit = true, name = "j"} -! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]] : !fir.ref) -> !fir.ref {implicit = true, name = "k"} +! CHECK-SAME: %[[ARG_I:.*]]: !fir.ref {fir.bindc_name = "i"} +! CHECK-SAME: %[[ARG_J:.*]]: !fir.ref {fir.bindc_name = "j"} +! CHECK-SAME: %[[ARG_K:.*]]: !fir.ref {fir.bindc_name = "k"} +! CHECK: %[[DC_I:.*]]:2 = hlfir.declare %[[ARG_I]] dummy_scope %0 +! CHECK: %[[DC_J:.*]]:2 = hlfir.declare %[[ARG_J]] dummy_scope %0 +! CHECK: %[[DC_K:.*]]:2 = hlfir.declare %[[ARG_K]] dummy_scope %0 +! CHECK: acc.parallel combined(loop) +! CHECK: %[[P_I:.*]] = acc.private varPtr(%[[DC_I]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "i"} +! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "j"} +! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]]#0 : !fir.ref) -> !fir.ref {implicit = true, name = "k"} ! CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[P_I]] : !fir.ref, @privatization_ref_i32 -> %[[P_J]] : !fir.ref, @privatization_ref_i32 -> %[[P_K]] : !fir.ref) control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%c10{{.*}}, %c100{{.*}}, %c200{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) ! 
CHECK: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} diff --git a/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 b/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 index 29985a02eb6ef..cfe27e4f8fca1 100644 --- a/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Semantics/OpenACC/acc-kernels-loop.f90 @@ -243,8 +243,15 @@ program openacc_kernels_loop_validity a(i) = 3.14 end do + !$acc kernels loop default(none) private(N, a) + do i = 1, N + a(i) = 3.14 + end do + !$acc kernels loop default(none) + !ERROR: The DEFAULT(NONE) clause requires that 'n' must be listed in a data-mapping clause do i = 1, N + !ERROR: The DEFAULT(NONE) clause requires that 'a' must be listed in a data-mapping clause a(i) = 3.14 end do From 695660cdfd1ca65cd6e02e6950d10c990dfa0036 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 18 Jul 2025 13:53:17 -0700 Subject: [PATCH 400/813] [AMDGPU] Provide control to force VGPR MFMA form (#148079) This gives an override to the user to force select VGPR form of MFMA. Eventually we will drop this in favor of compiler making better decisions, but this provides a mechanism for users to address the cases where MayNeedAGPRs favors the AGPR form and performance is degraded due to poor RA. 
--- .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 14 +- .../CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll | 76 + .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll | 3906 +++++++++++++++++ 3 files changed, 3994 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 75ce67c00228d..8c2e9b620ad16 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -29,6 +29,16 @@ enum { MAX_LANES = 64 }; using namespace llvm; +// TODO -- delete this flag once we have more robust mechanisms to allocate the +// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases +// where it is better to produce the VGPR form (e.g. if there are VGPR users +// of the MFMA result). +cl::opt MFMAVGPRForm( + "amdgpu-mfma-vgpr-form", cl::Hidden, + cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If " + "unspecified, default to compiler heuristics"), + cl::init(false)); + const GCNTargetMachine &getTM(const GCNSubtarget *STI) { const SITargetLowering *TLI = STI->getTargetLowering(); return static_cast(TLI->getTargetMachine()); @@ -69,8 +79,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F, PSInputAddr = AMDGPU::getInitialPSInputAddr(F); } - MayNeedAGPRs = ST.hasMAIInsts(); - if (ST.hasGFX90AInsts() && + MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm; + if (!MFMAVGPRForm && ST.hasGFX90AInsts() && ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() && !mayUseAGPRs(F)) MayNeedAGPRs = false; // We will select all MAI with VGPR operands. 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll new file mode 100644 index 0000000000000..87a7c2ef6c95c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s + +declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) + +define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) { +; HEURRC-LABEL: default: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: default: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 { +; HEURRC-LABEL: request_agpr: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: request_agpr: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +define <4 x float> @request_no_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #1 { +; HEURRC-LABEL: request_no_agpr: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: request_no_agpr: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: s_setpc_b64 s[30:31] + %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) + ret <4 x float> %result +} + +attributes #0 = { "amdgpu-agpr-alloc"="32,256" } +attributes #1 = { "amdgpu-agpr-alloc"="0,0" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 4628a9c15391b..866dba7746565 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) @@ -25,6 +27,48 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; HEURRC-LABEL: test_mfma_f32_16x16x32_f16: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 
v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: s_setpc_b64 s[30:31] +; AGPR-LABEL: test_mfma_f32_16x16x32_f16: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; VGPR-LABEL: test_mfma_f32_16x16x32_f16: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -45,6 +89,48 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; HEURRC-LABEL: test_mfma_f32_16x16x32_f16__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1 +; VGPRRC-NEXT: s_setpc_b64 s[30:31] +; AGPR-LABEL: test_mfma_f32_16x16x32_f16__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; VGPR-LABEL: test_mfma_f32_16x16x32_f16__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1 +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1) ret <4 x float> %result } @@ -91,6 +177,84 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; 
HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; AGPR-NEXT: v_mov_b32_e32 v8, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; AGPR-NEXT: v_accvgpr_write_b32 a0, s0 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; AGPR-NEXT: v_accvgpr_write_b32 a1, s1 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s2 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s3 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd: +; VGPR: ; %bb.0: +; VGPR-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPR-NEXT: v_mov_b32_e32 v12, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPR-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) store <4 x float> %result, ptr addrspace(1) %out ret void @@ -138,6 +302,84 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: 
test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; AGPR-NEXT: v_mov_b32_e32 v8, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; AGPR-NEXT: v_accvgpr_write_b32 a0, s0 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; AGPR-NEXT: v_accvgpr_write_b32 a1, s1 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s2 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s3 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPR-NEXT: v_mov_b32_e32 v12, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 
v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPR-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) store <4 x float> %result, ptr addrspace(1) %out ret void @@ -271,6 +513,258 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_32x32x16_f16: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48 +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32 +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: 
v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, s16 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: v_mov_b32_e32 v9, s17 +; HEURRC-NEXT: v_mov_b32_e32 v10, s18 +; HEURRC-NEXT: v_mov_b32_e32 v11, s19 +; HEURRC-NEXT: s_nop 4 +; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s12 +; HEURRC-NEXT: v_mov_b32_e32 v1, s13 +; HEURRC-NEXT: v_mov_b32_e32 v2, s14 +; HEURRC-NEXT: v_mov_b32_e32 v3, s15 +; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48 +; VGPRRC-NEXT: 
v_mov_b64_e32 v[46:47], 32 +; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0 +; VGPRRC-NEXT: v_mov_b32_e32 v40, s16 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] +; VGPRRC-NEXT: v_mov_b32_e32 v41, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s19 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v0, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s23 +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: global_store_dwordx4 
v[50:51], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 +; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_f32_32x32x16_f16: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: v_mov_b64_e32 v[12:13], 48 +; AGPR-NEXT: v_mov_b64_e32 v[14:15], 32 +; AGPR-NEXT: v_mov_b64_e32 v[16:17], 16 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; AGPR-NEXT: v_accvgpr_write_b32 a0, s8 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; AGPR-NEXT: v_accvgpr_write_b32 a1, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, s23 +; AGPR-NEXT: v_mov_b64_e32 v[18:19], 0 +; AGPR-NEXT: v_mov_b32_e32 v8, s16 +; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] +; AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 v3, s23 +; AGPR-NEXT: v_mov_b32_e32 v9, s17 +; AGPR-NEXT: v_mov_b32_e32 v10, s18 +; AGPR-NEXT: v_mov_b32_e32 v11, s19 +; AGPR-NEXT: s_nop 4 +; AGPR-NEXT: 
global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s12 +; AGPR-NEXT: v_mov_b32_e32 v1, s13 +; AGPR-NEXT: v_mov_b32_e32 v2, s14 +; AGPR-NEXT: v_mov_b32_e32 v3, s15 +; AGPR-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_f32_32x32x16_f16: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: v_mov_b64_e32 v[44:45], 48 +; VGPR-NEXT: v_mov_b64_e32 v[46:47], 32 +; VGPR-NEXT: v_mov_b64_e32 v[48:49], 16 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], 
s[22:23] +; VGPR-NEXT: v_mov_b64_e32 v[50:51], 0 +; VGPR-NEXT: v_mov_b32_e32 v40, s16 +; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] +; VGPR-NEXT: v_mov_b32_e32 v41, s17 +; VGPR-NEXT: v_mov_b32_e32 v42, s18 +; VGPR-NEXT: v_mov_b32_e32 v43, s19 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v0, s20 +; VGPR-NEXT: v_mov_b32_e32 v1, s21 +; VGPR-NEXT: v_mov_b32_e32 v2, s22 +; VGPR-NEXT: v_mov_b32_e32 v3, s23 +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s8 +; VGPR-NEXT: v_mov_b32_e32 v1, s9 +; VGPR-NEXT: v_mov_b32_e32 v2, s10 +; VGPR-NEXT: v_mov_b32_e32 v3, s11 +; VGPR-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s12 +; VGPR-NEXT: v_mov_b32_e32 v1, s13 +; VGPR-NEXT: v_mov_b32_e32 v2, s14 +; VGPR-NEXT: v_mov_b32_e32 v3, s15 +; VGPR-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store volatile <16 x float> %result, ptr addrspace(1) null store volatile <16 x float> %arg2, ptr addrspace(1) null @@ -401,6 +895,258 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: 
global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 48 +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 32 +; HEURRC-NEXT: v_mov_b64_e32 v[16:17], 16 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[18:19], 0 +; HEURRC-NEXT: v_mov_b32_e32 v8, s16 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: v_mov_b32_e32 v9, s17 +; HEURRC-NEXT: v_mov_b32_e32 v10, s18 +; HEURRC-NEXT: v_mov_b32_e32 v11, s19 +; HEURRC-NEXT: s_nop 4 +; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; 
HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s12 +; HEURRC-NEXT: v_mov_b32_e32 v1, s13 +; HEURRC-NEXT: v_mov_b32_e32 v2, s14 +; HEURRC-NEXT: v_mov_b32_e32 v3, s15 +; HEURRC-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 48 +; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 32 +; VGPRRC-NEXT: v_mov_b64_e32 v[48:49], 16 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPRRC-NEXT: v_mov_b64_e32 v[50:51], 0 +; 
VGPRRC-NEXT: v_mov_b32_e32 v40, s16 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s19 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[16:19], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v0, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s23 +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 +; VGPRRC-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_f32_32x32x16_f16__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: v_mov_b64_e32 v[12:13], 48 +; AGPR-NEXT: v_mov_b64_e32 v[14:15], 32 +; AGPR-NEXT: v_mov_b64_e32 v[16:17], 16 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: 
v_mov_b64_e32 v[0:1], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; AGPR-NEXT: v_accvgpr_write_b32 a0, s8 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; AGPR-NEXT: v_accvgpr_write_b32 a1, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, s23 +; AGPR-NEXT: v_mov_b64_e32 v[18:19], 0 +; AGPR-NEXT: v_mov_b32_e32 v8, s16 +; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 v3, s23 +; AGPR-NEXT: v_mov_b32_e32 v9, s17 +; AGPR-NEXT: v_mov_b32_e32 v10, s18 +; AGPR-NEXT: v_mov_b32_e32 v11, s19 +; AGPR-NEXT: s_nop 4 +; AGPR-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[16:17], a[20:23], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[18:19], a[16:19], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[14:15], v[8:11], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: 
v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: global_store_dwordx4 v[18:19], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s12 +; AGPR-NEXT: v_mov_b32_e32 v1, s13 +; AGPR-NEXT: v_mov_b32_e32 v2, s14 +; AGPR-NEXT: v_mov_b32_e32 v3, s15 +; AGPR-NEXT: global_store_dwordx4 v[16:17], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_f32_32x32x16_f16__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: v_mov_b64_e32 v[44:45], 48 +; VGPR-NEXT: v_mov_b64_e32 v[46:47], 32 +; VGPR-NEXT: v_mov_b64_e32 v[48:49], 16 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: v_mov_b64_e32 v[50:51], 0 +; VGPR-NEXT: v_mov_b32_e32 v40, s16 +; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 +; VGPR-NEXT: v_mov_b32_e32 v41, s17 +; VGPR-NEXT: v_mov_b32_e32 v42, s18 +; VGPR-NEXT: v_mov_b32_e32 v43, s19 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[28:31], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[24:27], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[48:49], v[20:23], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[50:51], 
v[16:19], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[40:43], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v0, s20 +; VGPR-NEXT: v_mov_b32_e32 v1, s21 +; VGPR-NEXT: v_mov_b32_e32 v2, s22 +; VGPR-NEXT: v_mov_b32_e32 v3, s23 +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s8 +; VGPR-NEXT: v_mov_b32_e32 v1, s9 +; VGPR-NEXT: v_mov_b32_e32 v2, s10 +; VGPR-NEXT: v_mov_b32_e32 v3, s11 +; VGPR-NEXT: global_store_dwordx4 v[50:51], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s12 +; VGPR-NEXT: v_mov_b32_e32 v1, s13 +; VGPR-NEXT: v_mov_b32_e32 v2, s14 +; VGPR-NEXT: v_mov_b32_e32 v3, s15 +; VGPR-NEXT: global_store_dwordx4 v[48:49], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 2, i32 3, i32 1) store volatile <16 x float> %result, ptr addrspace(1) null store volatile <16 x float> %arg2, ptr addrspace(1) null @@ -448,6 +1194,134 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; 
HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: v_accvgpr_read_b32 v4, a4 +; HEURRC-NEXT: v_accvgpr_read_b32 v5, a5 +; HEURRC-NEXT: v_accvgpr_read_b32 v6, a6 +; HEURRC-NEXT: v_accvgpr_read_b32 v7, a7 +; HEURRC-NEXT: v_accvgpr_read_b32 v8, a8 +; HEURRC-NEXT: v_accvgpr_read_b32 v9, a9 +; HEURRC-NEXT: v_accvgpr_read_b32 v10, a10 +; HEURRC-NEXT: v_accvgpr_read_b32 v11, a11 +; HEURRC-NEXT: v_accvgpr_read_b32 v12, a12 +; HEURRC-NEXT: v_accvgpr_read_b32 v13, a13 +; HEURRC-NEXT: v_accvgpr_read_b32 v14, a14 +; HEURRC-NEXT: v_accvgpr_read_b32 v15, a15 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__mac: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, v11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, v12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, v13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, v14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, v15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, v16 +; VGPRRC-NEXT: v_mov_b32_e32 v9, v17 +; VGPRRC-NEXT: v_mov_b32_e32 v10, v18 +; VGPRRC-NEXT: v_mov_b32_e32 v11, v19 +; VGPRRC-NEXT: v_mov_b32_e32 v12, v20 +; VGPRRC-NEXT: v_mov_b32_e32 v13, v21 +; VGPRRC-NEXT: v_mov_b32_e32 v14, v22 +; VGPRRC-NEXT: v_mov_b32_e32 v15, v23 +; 
VGPRRC-NEXT: s_setpc_b64 s[30:31] +; AGPR-LABEL: test_mfma_f32_32x32x16_f16__mac: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, v12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, v13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, v14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, v15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, v16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, v17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, v18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, v19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, v20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, v21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, v22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, v23 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 3 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: v_accvgpr_read_b32 v4, a4 +; AGPR-NEXT: v_accvgpr_read_b32 v5, a5 +; AGPR-NEXT: v_accvgpr_read_b32 v6, a6 +; AGPR-NEXT: v_accvgpr_read_b32 v7, a7 +; AGPR-NEXT: v_accvgpr_read_b32 v8, a8 +; AGPR-NEXT: v_accvgpr_read_b32 v9, a9 +; AGPR-NEXT: v_accvgpr_read_b32 v10, a10 +; AGPR-NEXT: v_accvgpr_read_b32 v11, a11 +; AGPR-NEXT: v_accvgpr_read_b32 v12, a12 +; AGPR-NEXT: v_accvgpr_read_b32 v13, a13 +; AGPR-NEXT: v_accvgpr_read_b32 v14, a14 +; AGPR-NEXT: v_accvgpr_read_b32 v15, a15 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; VGPR-LABEL: test_mfma_f32_32x32x16_f16__mac: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 3 +; VGPR-NEXT: v_mov_b32_e32 v0, v8 +; VGPR-NEXT: v_mov_b32_e32 v1, v9 +; VGPR-NEXT: v_mov_b32_e32 v2, v10 +; 
VGPR-NEXT: v_mov_b32_e32 v3, v11 +; VGPR-NEXT: v_mov_b32_e32 v4, v12 +; VGPR-NEXT: v_mov_b32_e32 v5, v13 +; VGPR-NEXT: v_mov_b32_e32 v6, v14 +; VGPR-NEXT: v_mov_b32_e32 v7, v15 +; VGPR-NEXT: v_mov_b32_e32 v8, v16 +; VGPR-NEXT: v_mov_b32_e32 v9, v17 +; VGPR-NEXT: v_mov_b32_e32 v10, v18 +; VGPR-NEXT: v_mov_b32_e32 v11, v19 +; VGPR-NEXT: v_mov_b32_e32 v12, v20 +; VGPR-NEXT: v_mov_b32_e32 v13, v21 +; VGPR-NEXT: v_mov_b32_e32 v14, v22 +; VGPR-NEXT: v_mov_b32_e32 v15, v23 +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) ret <16 x float> %result } @@ -493,6 +1367,134 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: 
v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: v_accvgpr_read_b32 v4, a4 +; HEURRC-NEXT: v_accvgpr_read_b32 v5, a5 +; HEURRC-NEXT: v_accvgpr_read_b32 v6, a6 +; HEURRC-NEXT: v_accvgpr_read_b32 v7, a7 +; HEURRC-NEXT: v_accvgpr_read_b32 v8, a8 +; HEURRC-NEXT: v_accvgpr_read_b32 v9, a9 +; HEURRC-NEXT: v_accvgpr_read_b32 v10, a10 +; HEURRC-NEXT: v_accvgpr_read_b32 v11, a11 +; HEURRC-NEXT: v_accvgpr_read_b32 v12, a12 +; HEURRC-NEXT: v_accvgpr_read_b32 v13, a13 +; HEURRC-NEXT: v_accvgpr_read_b32 v14, a14 +; HEURRC-NEXT: v_accvgpr_read_b32 v15, a15 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, v11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, v12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, v13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, v14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, v15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, v16 +; VGPRRC-NEXT: v_mov_b32_e32 v9, v17 +; VGPRRC-NEXT: v_mov_b32_e32 v10, v18 +; VGPRRC-NEXT: v_mov_b32_e32 v11, v19 +; VGPRRC-NEXT: v_mov_b32_e32 v12, v20 +; VGPRRC-NEXT: v_mov_b32_e32 v13, v21 +; VGPRRC-NEXT: v_mov_b32_e32 v14, v22 +; VGPRRC-NEXT: v_mov_b32_e32 v15, v23 +; VGPRRC-NEXT: s_setpc_b64 s[30:31] +; AGPR-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, v12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, v13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, v14 +; AGPR-NEXT: 
v_accvgpr_write_b32 a7, v15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, v16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, v17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, v18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, v19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, v20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, v21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, v22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, v23 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 3 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: v_accvgpr_read_b32 v4, a4 +; AGPR-NEXT: v_accvgpr_read_b32 v5, a5 +; AGPR-NEXT: v_accvgpr_read_b32 v6, a6 +; AGPR-NEXT: v_accvgpr_read_b32 v7, a7 +; AGPR-NEXT: v_accvgpr_read_b32 v8, a8 +; AGPR-NEXT: v_accvgpr_read_b32 v9, a9 +; AGPR-NEXT: v_accvgpr_read_b32 v10, a10 +; AGPR-NEXT: v_accvgpr_read_b32 v11, a11 +; AGPR-NEXT: v_accvgpr_read_b32 v12, a12 +; AGPR-NEXT: v_accvgpr_read_b32 v13, a13 +; AGPR-NEXT: v_accvgpr_read_b32 v14, a14 +; AGPR-NEXT: v_accvgpr_read_b32 v15, a15 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; VGPR-LABEL: test_mfma_f32_32x32x16_f16__mac__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 3 +; VGPR-NEXT: v_mov_b32_e32 v0, v8 +; VGPR-NEXT: v_mov_b32_e32 v1, v9 +; VGPR-NEXT: v_mov_b32_e32 v2, v10 +; VGPR-NEXT: v_mov_b32_e32 v3, v11 +; VGPR-NEXT: v_mov_b32_e32 v4, v12 +; VGPR-NEXT: v_mov_b32_e32 v5, v13 +; VGPR-NEXT: v_mov_b32_e32 v6, v14 +; VGPR-NEXT: v_mov_b32_e32 v7, v15 +; VGPR-NEXT: v_mov_b32_e32 v8, v16 +; VGPR-NEXT: v_mov_b32_e32 v9, v17 +; VGPR-NEXT: v_mov_b32_e32 v10, v18 +; VGPR-NEXT: v_mov_b32_e32 v11, v19 +; VGPR-NEXT: v_mov_b32_e32 v12, v20 +; VGPR-NEXT: v_mov_b32_e32 v13, v21 +; VGPR-NEXT: 
v_mov_b32_e32 v14, v22 +; VGPR-NEXT: v_mov_b32_e32 v15, v23 +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 1, i32 1) ret <16 x float> %result } @@ -615,6 +1617,246 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd(<8 x half> %arg0, ; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v8, s20 +; HEURRC-NEXT: v_mov_b32_e32 v9, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] +; HEURRC-NEXT: v_mov_b32_e32 v10, s22 +; HEURRC-NEXT: v_mov_b32_e32 v11, s23 +; HEURRC-NEXT: v_mov_b32_e32 v0, s16 +; HEURRC-NEXT: 
v_mov_b32_e32 v1, s17 +; HEURRC-NEXT: v_mov_b32_e32 v2, s18 +; HEURRC-NEXT: v_mov_b32_e32 v3, s19 +; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s12 +; HEURRC-NEXT: v_mov_b32_e32 v1, s13 +; HEURRC-NEXT: v_mov_b32_e32 v2, s14 +; HEURRC-NEXT: v_mov_b32_e32 v3, s15 +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPRRC-NEXT: v_mov_b32_e32 v44, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; 
VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; VGPRRC-NEXT: v_mov_b32_e32 v40, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s21 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] +; VGPRRC-NEXT: v_mov_b32_e32 v42, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s23 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: +; AGPR: ; %bb.0: +; AGPR-NEXT: 
s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: v_mov_b32_e32 v12, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; AGPR-NEXT: v_accvgpr_write_b32 a31, s23 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; AGPR-NEXT: v_accvgpr_write_b32 a30, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a29, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a28, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a27, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a26, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a25, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a24, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a23, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a22, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a21, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a20, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a19, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a18, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a17, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a16, s8 +; AGPR-NEXT: v_mov_b32_e32 v8, s20 +; AGPR-NEXT: v_mov_b32_e32 v9, s21 +; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] +; AGPR-NEXT: v_mov_b32_e32 v10, s22 +; AGPR-NEXT: v_mov_b32_e32 v11, s23 +; AGPR-NEXT: v_mov_b32_e32 v0, s16 +; AGPR-NEXT: v_mov_b32_e32 v1, s17 +; AGPR-NEXT: v_mov_b32_e32 v2, s18 +; AGPR-NEXT: v_mov_b32_e32 v3, s19 +; AGPR-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s12 +; AGPR-NEXT: v_mov_b32_e32 v1, s13 +; AGPR-NEXT: v_mov_b32_e32 v2, s14 +; AGPR-NEXT: v_mov_b32_e32 v3, s15 +; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: 
v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: v_mov_b32_e32 v44, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; VGPR-NEXT: v_mov_b32_e32 v40, s20 +; VGPR-NEXT: v_mov_b32_e32 v41, s21 +; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] +; VGPR-NEXT: v_mov_b32_e32 v42, s22 +; VGPR-NEXT: v_mov_b32_e32 v43, s23 +; VGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 2 +; VGPR-NEXT: v_mov_b32_e32 v16, s16 +; VGPR-NEXT: v_mov_b32_e32 v17, s17 +; VGPR-NEXT: v_mov_b32_e32 v18, s18 +; VGPR-NEXT: v_mov_b32_e32 v19, s19 +; VGPR-NEXT: 
global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s12 +; VGPR-NEXT: v_mov_b32_e32 v17, s13 +; VGPR-NEXT: v_mov_b32_e32 v18, s14 +; VGPR-NEXT: v_mov_b32_e32 v19, s15 +; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s8 +; VGPR-NEXT: v_mov_b32_e32 v17, s9 +; VGPR-NEXT: v_mov_b32_e32 v18, s10 +; VGPR-NEXT: v_mov_b32_e32 v19, s11 +; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store volatile <16 x float> %arg2, ptr addrspace(1) %out store volatile <16 x float> %result, ptr addrspace(1) %out @@ -739,6 +1981,246 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd__flags(<8 x half> ; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; HEURRC-NEXT: v_mov_b32_e32 v12, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: 
v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: v_mov_b32_e32 v8, s20 +; HEURRC-NEXT: v_mov_b32_e32 v9, s21 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: v_mov_b32_e32 v10, s22 +; HEURRC-NEXT: v_mov_b32_e32 v11, s23 +; HEURRC-NEXT: v_mov_b32_e32 v0, s16 +; HEURRC-NEXT: v_mov_b32_e32 v1, s17 +; HEURRC-NEXT: v_mov_b32_e32 v2, s18 +; HEURRC-NEXT: v_mov_b32_e32 v3, s19 +; HEURRC-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s12 +; HEURRC-NEXT: v_mov_b32_e32 v1, s13 +; HEURRC-NEXT: v_mov_b32_e32 v2, s14 +; HEURRC-NEXT: v_mov_b32_e32 v3, s15 +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: 
global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPRRC-NEXT: v_mov_b32_e32 v44, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; VGPRRC-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; VGPRRC-NEXT: v_mov_b32_e32 v40, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v41, s21 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: v_mov_b32_e32 v42, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v43, s23 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt 
vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 +; VGPRRC-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: v_mov_b32_e32 v12, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; AGPR-NEXT: v_accvgpr_write_b32 a31, s23 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; AGPR-NEXT: v_accvgpr_write_b32 a30, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a29, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a28, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a27, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a26, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a25, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a24, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a23, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a22, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a21, s13 +; 
AGPR-NEXT: v_accvgpr_write_b32 a20, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a19, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a18, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a17, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a16, s8 +; AGPR-NEXT: v_mov_b32_e32 v8, s20 +; AGPR-NEXT: v_mov_b32_e32 v9, s21 +; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 +; AGPR-NEXT: v_mov_b32_e32 v10, s22 +; AGPR-NEXT: v_mov_b32_e32 v11, s23 +; AGPR-NEXT: v_mov_b32_e32 v0, s16 +; AGPR-NEXT: v_mov_b32_e32 v1, s17 +; AGPR-NEXT: v_mov_b32_e32 v2, s18 +; AGPR-NEXT: v_mov_b32_e32 v3, s19 +; AGPR-NEXT: global_store_dwordx4 v12, v[8:11], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s12 +; AGPR-NEXT: v_mov_b32_e32 v1, s13 +; AGPR-NEXT: v_mov_b32_e32 v2, s14 +; AGPR-NEXT: v_mov_b32_e32 v3, s15 +; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v12, a[8:11], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v12, a[12:15], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v12, a[0:3], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v12, a[4:7], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0xa4 +; VGPR-NEXT: v_mov_b32_e32 v44, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[34:35], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[32:33], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[38:39], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; VGPR-NEXT: v_mov_b64_e32 v[36:37], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; VGPR-NEXT: v_mov_b32_e32 v40, s20 +; VGPR-NEXT: v_mov_b32_e32 v41, s21 +; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPR-NEXT: v_mov_b32_e32 v42, s22 +; VGPR-NEXT: v_mov_b32_e32 v43, s23 +; VGPR-NEXT: global_store_dwordx4 v44, v[40:43], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 2 +; VGPR-NEXT: v_mov_b32_e32 v16, s16 +; VGPR-NEXT: v_mov_b32_e32 v17, s17 +; VGPR-NEXT: v_mov_b32_e32 v18, s18 +; VGPR-NEXT: v_mov_b32_e32 v19, s19 +; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s12 +; VGPR-NEXT: v_mov_b32_e32 v17, s13 +; VGPR-NEXT: v_mov_b32_e32 v18, s14 +; VGPR-NEXT: v_mov_b32_e32 v19, s15 +; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s8 +; VGPR-NEXT: v_mov_b32_e32 v17, s9 +; VGPR-NEXT: v_mov_b32_e32 v18, s10 +; VGPR-NEXT: v_mov_b32_e32 v19, s11 +; VGPR-NEXT: global_store_dwordx4 v44, v[16:19], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[12:15], s[0:1] offset:48 sc0 
sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[0:3], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v44, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 1, i32 2, i32 3) store volatile <16 x float> %arg2, ptr addrspace(1) %out store volatile <16 x float> %result, ptr addrspace(1) %out @@ -819,6 +2301,136 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: 
v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: s_waitcnt 
lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; AGPR-NEXT: v_accvgpr_write_b32 a0, s8 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; AGPR-NEXT: v_accvgpr_write_b32 a1, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, s23 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] +; AGPR-NEXT: v_mov_b32_e32 v0, 0 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 2 +; AGPR-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; AGPR-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: 
v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] +; VGPR-NEXT: v_mov_b32_e32 v16, 0 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 2 +; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 0, i32 0, i32 0) store <16 x float> %result, ptr addrspace(1) %out ret void @@ -898,6 +2510,136 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; 
HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[26:27] +; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; VGPRRC-NEXT: 
global_store_dwordx4 v16, v[0:3], s[0:1] +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[26:27] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[28:29] +; AGPR-NEXT: v_accvgpr_write_b32 a0, s8 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[30:31] +; AGPR-NEXT: v_accvgpr_write_b32 a1, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, s23 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; AGPR-NEXT: v_mov_b32_e32 v0, 0 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 2 +; AGPR-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; AGPR-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_f32_32x32x16_f16__vgprcd_mac_flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[24:25] +; VGPR-NEXT: v_mov_b64_e32 
v[18:19], s[26:27] +; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[28:29] +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[30:31] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_f32_32x32x16_f16 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; VGPR-NEXT: v_mov_b32_e32 v16, 0 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 2 +; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; VGPR-NEXT: s_endpgm %result = call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %arg0, <8 x half> %arg1, <16 x float> %arg2, i32 3, i32 2, i32 1) store <16 x float> %result, ptr addrspace(1) %out ret void @@ -925,6 +2667,48 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; HEURRC-LABEL: test_mfma_i32_16x16x64_i8: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; 
VGPRRC-LABEL: test_mfma_i32_16x16x64_i8: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: s_setpc_b64 s[30:31] +; AGPR-LABEL: test_mfma_i32_16x16x64_i8: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; VGPR-LABEL: test_mfma_i32_16x16x64_i8: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0) ret <4 x i32> %result } @@ -945,6 +2729,48 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; HEURRC-LABEL: test_mfma_i32_16x16x64_i8__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: 
v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1 +; VGPRRC-NEXT: s_setpc_b64 s[30:31] +; AGPR-LABEL: test_mfma_i32_16x16x64_i8__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; VGPR-LABEL: test_mfma_i32_16x16x64_i8__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1 +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 1, i32 1, i32 1) ret <4 x i32> %result } @@ -995,6 +2821,104 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, 
s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; AGPR-NEXT: v_mov_b32_e32 v8, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: v_mov_b32_e32 v4, s12 +; AGPR-NEXT: v_mov_b32_e32 v5, 
s13 +; AGPR-NEXT: v_mov_b32_e32 v6, s14 +; AGPR-NEXT: v_mov_b32_e32 v7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a0, s0 +; AGPR-NEXT: v_accvgpr_write_b32 a1, s1 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s2 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s3 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPR-NEXT: v_mov_b32_e32 v12, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v0, s8 +; VGPR-NEXT: v_mov_b32_e32 v1, s9 +; VGPR-NEXT: v_mov_b32_e32 v2, s10 +; VGPR-NEXT: v_mov_b32_e32 v3, s11 +; VGPR-NEXT: v_mov_b32_e32 v4, s12 +; VGPR-NEXT: v_mov_b32_e32 v5, s13 +; VGPR-NEXT: v_mov_b32_e32 v6, s14 +; VGPR-NEXT: v_mov_b32_e32 v7, s15 +; VGPR-NEXT: v_mov_b32_e32 v8, s0 +; VGPR-NEXT: v_mov_b32_e32 v9, s1 +; VGPR-NEXT: v_mov_b32_e32 v10, s2 +; VGPR-NEXT: v_mov_b32_e32 v11, s3 +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPR-NEXT: s_endpgm %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0) store <4 x i32> %result, ptr addrspace(1) %out ret void @@ -1046,6 +2970,104 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; HEURRC-NEXT: 
v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: v_mov_b32_e32 v4, s12 +; HEURRC-NEXT: v_mov_b32_e32 v5, s13 +; HEURRC-NEXT: v_mov_b32_e32 v6, s14 +; HEURRC-NEXT: v_mov_b32_e32 v7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, s15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, s0 +; VGPRRC-NEXT: v_mov_b32_e32 v9, s1 +; VGPRRC-NEXT: v_mov_b32_e32 v10, s2 +; VGPRRC-NEXT: v_mov_b32_e32 v11, s3 +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; AGPR-NEXT: 
v_mov_b32_e32 v8, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: v_mov_b32_e32 v4, s12 +; AGPR-NEXT: v_mov_b32_e32 v5, s13 +; AGPR-NEXT: v_mov_b32_e32 v6, s14 +; AGPR-NEXT: v_mov_b32_e32 v7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a0, s0 +; AGPR-NEXT: v_accvgpr_write_b32 a1, s1 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s2 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s3 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPR-NEXT: v_mov_b32_e32 v12, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v0, s8 +; VGPR-NEXT: v_mov_b32_e32 v1, s9 +; VGPR-NEXT: v_mov_b32_e32 v2, s10 +; VGPR-NEXT: v_mov_b32_e32 v3, s11 +; VGPR-NEXT: v_mov_b32_e32 v4, s12 +; VGPR-NEXT: v_mov_b32_e32 v5, s13 +; VGPR-NEXT: v_mov_b32_e32 v6, s14 +; VGPR-NEXT: v_mov_b32_e32 v7, s15 +; VGPR-NEXT: v_mov_b32_e32 v8, s0 +; VGPR-NEXT: v_mov_b32_e32 v9, s1 +; VGPR-NEXT: v_mov_b32_e32 v10, s2 +; VGPR-NEXT: v_mov_b32_e32 v11, s3 +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_i32_16x16x64_i8 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPR-NEXT: s_endpgm %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 3, i32 2, i32 1) store <4 x i32> %result, ptr addrspace(1) %out ret void @@ -1187,6 +3209,282 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: global_store_dwordx4 v[26:27], 
v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_i32_32x32x32_i8: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32 +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b32_e32 v0, s24 +; HEURRC-NEXT: v_mov_b32_e32 v1, s25 +; HEURRC-NEXT: v_mov_b32_e32 v2, s26 +; HEURRC-NEXT: v_mov_b32_e32 v3, s27 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v4, s28 +; HEURRC-NEXT: v_mov_b32_e32 v5, s29 +; HEURRC-NEXT: v_mov_b32_e32 v6, s30 +; HEURRC-NEXT: v_mov_b32_e32 v7, s31 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0 +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v0, s16 +; HEURRC-NEXT: v_mov_b32_e32 v1, s17 +; HEURRC-NEXT: v_mov_b32_e32 v2, s18 +; HEURRC-NEXT: v_mov_b32_e32 v3, s19 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[12:13], 
a[20:23], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s12 +; HEURRC-NEXT: v_mov_b32_e32 v1, s13 +; HEURRC-NEXT: v_mov_b32_e32 v2, s14 +; HEURRC-NEXT: v_mov_b32_e32 v3, s15 +; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48 +; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32 +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v32, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s27 +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b32_e32 v36, s28 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s29 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s30 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s31 +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], 
s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPRRC-NEXT: v_mov_b64_e32 v[46:47], 0 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s19 +; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s23 +; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_i32_32x32x32_i8: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: 
s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: v_mov_b64_e32 v[8:9], 48 +; AGPR-NEXT: v_mov_b64_e32 v[10:11], 32 +; AGPR-NEXT: v_mov_b64_e32 v[12:13], 16 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b32_e32 v0, s24 +; AGPR-NEXT: v_mov_b32_e32 v1, s25 +; AGPR-NEXT: v_mov_b32_e32 v2, s26 +; AGPR-NEXT: v_mov_b32_e32 v3, s27 +; AGPR-NEXT: v_accvgpr_write_b32 a0, s8 +; AGPR-NEXT: v_mov_b32_e32 v4, s28 +; AGPR-NEXT: v_mov_b32_e32 v5, s29 +; AGPR-NEXT: v_mov_b32_e32 v6, s30 +; AGPR-NEXT: v_mov_b32_e32 v7, s31 +; AGPR-NEXT: v_accvgpr_write_b32 a1, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, s23 +; AGPR-NEXT: v_mov_b64_e32 v[14:15], 0 +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] +; AGPR-NEXT: v_mov_b32_e32 v0, s16 +; AGPR-NEXT: v_mov_b32_e32 v1, s17 +; AGPR-NEXT: v_mov_b32_e32 v2, s18 +; AGPR-NEXT: v_mov_b32_e32 v3, s19 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: 
v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 v3, s23 +; AGPR-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s12 +; AGPR-NEXT: v_mov_b32_e32 v1, s13 +; AGPR-NEXT: v_mov_b32_e32 v2, s14 +; AGPR-NEXT: v_mov_b32_e32 v3, s15 +; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_i32_32x32x32_i8: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: v_mov_b64_e32 v[40:41], 48 +; VGPR-NEXT: v_mov_b64_e32 v[42:43], 32 +; VGPR-NEXT: v_mov_b64_e32 v[44:45], 16 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v32, s24 +; VGPR-NEXT: v_mov_b32_e32 v33, s25 +; VGPR-NEXT: v_mov_b32_e32 v34, s26 +; VGPR-NEXT: v_mov_b32_e32 v35, s27 +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b32_e32 v36, s28 +; VGPR-NEXT: v_mov_b32_e32 v37, s29 +; VGPR-NEXT: v_mov_b32_e32 v38, s30 +; VGPR-NEXT: v_mov_b32_e32 v39, s31 +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: v_mov_b64_e32 v[46:47], 0 +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 3 +; VGPR-NEXT: global_store_dwordx4 v[40:41], v[28:31], off 
sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v0, s16 +; VGPR-NEXT: v_mov_b32_e32 v1, s17 +; VGPR-NEXT: v_mov_b32_e32 v2, s18 +; VGPR-NEXT: v_mov_b32_e32 v3, s19 +; VGPR-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s20 +; VGPR-NEXT: v_mov_b32_e32 v1, s21 +; VGPR-NEXT: v_mov_b32_e32 v2, s22 +; VGPR-NEXT: v_mov_b32_e32 v3, s23 +; VGPR-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s8 +; VGPR-NEXT: v_mov_b32_e32 v1, s9 +; VGPR-NEXT: v_mov_b32_e32 v2, s10 +; VGPR-NEXT: v_mov_b32_e32 v3, s11 +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s12 +; VGPR-NEXT: v_mov_b32_e32 v1, s13 +; VGPR-NEXT: v_mov_b32_e32 v2, s14 +; VGPR-NEXT: v_mov_b32_e32 v3, s15 +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) store volatile <16 x i32> %result, ptr addrspace(1) null store volatile <16 x i32> %arg2, ptr addrspace(1) null @@ -1323,6 +3621,282 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: global_store_dwordx4 v[26:27], v[0:3], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; 
HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: v_mov_b64_e32 v[8:9], 48 +; HEURRC-NEXT: v_mov_b64_e32 v[10:11], 32 +; HEURRC-NEXT: v_mov_b64_e32 v[12:13], 16 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b32_e32 v0, s24 +; HEURRC-NEXT: v_mov_b32_e32 v1, s25 +; HEURRC-NEXT: v_mov_b32_e32 v2, s26 +; HEURRC-NEXT: v_mov_b32_e32 v3, s27 +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v4, s28 +; HEURRC-NEXT: v_mov_b32_e32 v5, s29 +; HEURRC-NEXT: v_mov_b32_e32 v6, s30 +; HEURRC-NEXT: v_mov_b32_e32 v7, s31 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: v_mov_b64_e32 v[14:15], 0 +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v0, s16 +; HEURRC-NEXT: v_mov_b32_e32 v1, s17 +; HEURRC-NEXT: v_mov_b32_e32 v2, s18 +; HEURRC-NEXT: v_mov_b32_e32 v3, s19 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: 
global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s12 +; HEURRC-NEXT: v_mov_b32_e32 v1, s13 +; HEURRC-NEXT: v_mov_b32_e32 v2, s14 +; HEURRC-NEXT: v_mov_b32_e32 v3, s15 +; HEURRC-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: v_mov_b64_e32 v[40:41], 48 +; VGPRRC-NEXT: v_mov_b64_e32 v[42:43], 32 +; VGPRRC-NEXT: v_mov_b64_e32 v[44:45], 16 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v32, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s27 +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b32_e32 v36, s28 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s29 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s30 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s31 +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPRRC-NEXT: 
v_mov_b64_e32 v[46:47], 0 +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[24:27], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v0, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s19 +; VGPRRC-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s23 +; VGPRRC-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s11 +; VGPRRC-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v0, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v1, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v2, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v3, s15 +; VGPRRC-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_i32_32x32x32_i8__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: v_mov_b64_e32 v[8:9], 48 +; AGPR-NEXT: v_mov_b64_e32 v[10:11], 32 +; AGPR-NEXT: 
v_mov_b64_e32 v[12:13], 16 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b32_e32 v0, s24 +; AGPR-NEXT: v_mov_b32_e32 v1, s25 +; AGPR-NEXT: v_mov_b32_e32 v2, s26 +; AGPR-NEXT: v_mov_b32_e32 v3, s27 +; AGPR-NEXT: v_accvgpr_write_b32 a0, s8 +; AGPR-NEXT: v_mov_b32_e32 v4, s28 +; AGPR-NEXT: v_mov_b32_e32 v5, s29 +; AGPR-NEXT: v_mov_b32_e32 v6, s30 +; AGPR-NEXT: v_mov_b32_e32 v7, s31 +; AGPR-NEXT: v_accvgpr_write_b32 a1, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, s23 +; AGPR-NEXT: v_mov_b64_e32 v[14:15], 0 +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[16:31], v[0:3], v[4:7], a[0:15] cbsz:2 abid:3 blgp:1 +; AGPR-NEXT: v_mov_b32_e32 v0, s16 +; AGPR-NEXT: v_mov_b32_e32 v1, s17 +; AGPR-NEXT: v_mov_b32_e32 v2, s18 +; AGPR-NEXT: v_mov_b32_e32 v3, s19 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[12:13], a[20:23], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[14:15], a[16:19], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v[10:11], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 
v3, s23 +; AGPR-NEXT: global_store_dwordx4 v[8:9], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: global_store_dwordx4 v[14:15], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s12 +; AGPR-NEXT: v_mov_b32_e32 v1, s13 +; AGPR-NEXT: v_mov_b32_e32 v2, s14 +; AGPR-NEXT: v_mov_b32_e32 v3, s15 +; AGPR-NEXT: global_store_dwordx4 v[12:13], v[0:3], off sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_i32_32x32x32_i8__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: v_mov_b64_e32 v[40:41], 48 +; VGPR-NEXT: v_mov_b64_e32 v[42:43], 32 +; VGPR-NEXT: v_mov_b64_e32 v[44:45], 16 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v32, s24 +; VGPR-NEXT: v_mov_b32_e32 v33, s25 +; VGPR-NEXT: v_mov_b32_e32 v34, s26 +; VGPR-NEXT: v_mov_b32_e32 v35, s27 +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b32_e32 v36, s28 +; VGPR-NEXT: v_mov_b32_e32 v37, s29 +; VGPR-NEXT: v_mov_b32_e32 v38, s30 +; VGPR-NEXT: v_mov_b32_e32 v39, s31 +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: v_mov_b64_e32 v[46:47], 0 +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[16:31], v[32:35], v[36:39], v[0:15] cbsz:2 abid:3 blgp:1 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 3 +; VGPR-NEXT: global_store_dwordx4 v[40:41], v[28:31], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[42:43], v[24:27], 
off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[20:23], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[16:19], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v0, s16 +; VGPR-NEXT: v_mov_b32_e32 v1, s17 +; VGPR-NEXT: v_mov_b32_e32 v2, s18 +; VGPR-NEXT: v_mov_b32_e32 v3, s19 +; VGPR-NEXT: global_store_dwordx4 v[42:43], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s20 +; VGPR-NEXT: v_mov_b32_e32 v1, s21 +; VGPR-NEXT: v_mov_b32_e32 v2, s22 +; VGPR-NEXT: v_mov_b32_e32 v3, s23 +; VGPR-NEXT: global_store_dwordx4 v[40:41], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s8 +; VGPR-NEXT: v_mov_b32_e32 v1, s9 +; VGPR-NEXT: v_mov_b32_e32 v2, s10 +; VGPR-NEXT: v_mov_b32_e32 v3, s11 +; VGPR-NEXT: global_store_dwordx4 v[46:47], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v0, s12 +; VGPR-NEXT: v_mov_b32_e32 v1, s13 +; VGPR-NEXT: v_mov_b32_e32 v2, s14 +; VGPR-NEXT: v_mov_b32_e32 v3, s15 +; VGPR-NEXT: global_store_dwordx4 v[44:45], v[0:3], off sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 2, i32 3, i32 1) store volatile <16 x i32> %result, ptr addrspace(1) null store volatile <16 x i32> %arg2, ptr addrspace(1) null @@ -1370,6 +3944,134 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: 
v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: v_accvgpr_read_b32 v4, a4 +; HEURRC-NEXT: v_accvgpr_read_b32 v5, a5 +; HEURRC-NEXT: v_accvgpr_read_b32 v6, a6 +; HEURRC-NEXT: v_accvgpr_read_b32 v7, a7 +; HEURRC-NEXT: v_accvgpr_read_b32 v8, a8 +; HEURRC-NEXT: v_accvgpr_read_b32 v9, a9 +; HEURRC-NEXT: v_accvgpr_read_b32 v10, a10 +; HEURRC-NEXT: v_accvgpr_read_b32 v11, a11 +; HEURRC-NEXT: v_accvgpr_read_b32 v12, a12 +; HEURRC-NEXT: v_accvgpr_read_b32 v13, a13 +; HEURRC-NEXT: v_accvgpr_read_b32 v14, a14 +; HEURRC-NEXT: v_accvgpr_read_b32 v15, a15 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__mac: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, v11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, v12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, v13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, v14 +; VGPRRC-NEXT: 
v_mov_b32_e32 v7, v15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, v16 +; VGPRRC-NEXT: v_mov_b32_e32 v9, v17 +; VGPRRC-NEXT: v_mov_b32_e32 v10, v18 +; VGPRRC-NEXT: v_mov_b32_e32 v11, v19 +; VGPRRC-NEXT: v_mov_b32_e32 v12, v20 +; VGPRRC-NEXT: v_mov_b32_e32 v13, v21 +; VGPRRC-NEXT: v_mov_b32_e32 v14, v22 +; VGPRRC-NEXT: v_mov_b32_e32 v15, v23 +; VGPRRC-NEXT: s_setpc_b64 s[30:31] +; AGPR-LABEL: test_mfma_i32_32x32x32_i8__mac: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, v12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, v13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, v14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, v15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, v16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, v17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, v18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, v19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, v20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, v21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, v22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, v23 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 3 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: v_accvgpr_read_b32 v4, a4 +; AGPR-NEXT: v_accvgpr_read_b32 v5, a5 +; AGPR-NEXT: v_accvgpr_read_b32 v6, a6 +; AGPR-NEXT: v_accvgpr_read_b32 v7, a7 +; AGPR-NEXT: v_accvgpr_read_b32 v8, a8 +; AGPR-NEXT: v_accvgpr_read_b32 v9, a9 +; AGPR-NEXT: v_accvgpr_read_b32 v10, a10 +; AGPR-NEXT: v_accvgpr_read_b32 v11, a11 +; AGPR-NEXT: v_accvgpr_read_b32 v12, a12 +; AGPR-NEXT: v_accvgpr_read_b32 v13, a13 +; AGPR-NEXT: v_accvgpr_read_b32 v14, a14 +; AGPR-NEXT: v_accvgpr_read_b32 v15, a15 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; 
VGPR-LABEL: test_mfma_i32_32x32x32_i8__mac: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 3 +; VGPR-NEXT: v_mov_b32_e32 v0, v8 +; VGPR-NEXT: v_mov_b32_e32 v1, v9 +; VGPR-NEXT: v_mov_b32_e32 v2, v10 +; VGPR-NEXT: v_mov_b32_e32 v3, v11 +; VGPR-NEXT: v_mov_b32_e32 v4, v12 +; VGPR-NEXT: v_mov_b32_e32 v5, v13 +; VGPR-NEXT: v_mov_b32_e32 v6, v14 +; VGPR-NEXT: v_mov_b32_e32 v7, v15 +; VGPR-NEXT: v_mov_b32_e32 v8, v16 +; VGPR-NEXT: v_mov_b32_e32 v9, v17 +; VGPR-NEXT: v_mov_b32_e32 v10, v18 +; VGPR-NEXT: v_mov_b32_e32 v11, v19 +; VGPR-NEXT: v_mov_b32_e32 v12, v20 +; VGPR-NEXT: v_mov_b32_e32 v13, v21 +; VGPR-NEXT: v_mov_b32_e32 v14, v22 +; VGPR-NEXT: v_mov_b32_e32 v15, v23 +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) ret <16 x i32> %result } @@ -1415,6 +4117,134 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; GCN-NEXT: v_accvgpr_read_b32 v14, a14 ; GCN-NEXT: v_accvgpr_read_b32 v15, a15 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, v12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, v13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, v14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, v15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, v16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, v17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, v18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, v19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, v20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, v21 +; HEURRC-NEXT: 
v_accvgpr_write_b32 a14, v22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, v23 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: s_nop 3 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: v_accvgpr_read_b32 v4, a4 +; HEURRC-NEXT: v_accvgpr_read_b32 v5, a5 +; HEURRC-NEXT: v_accvgpr_read_b32 v6, a6 +; HEURRC-NEXT: v_accvgpr_read_b32 v7, a7 +; HEURRC-NEXT: v_accvgpr_read_b32 v8, a8 +; HEURRC-NEXT: v_accvgpr_read_b32 v9, a9 +; HEURRC-NEXT: v_accvgpr_read_b32 v10, a10 +; HEURRC-NEXT: v_accvgpr_read_b32 v11, a11 +; HEURRC-NEXT: v_accvgpr_read_b32 v12, a12 +; HEURRC-NEXT: v_accvgpr_read_b32 v13, a13 +; HEURRC-NEXT: v_accvgpr_read_b32 v14, a14 +; HEURRC-NEXT: v_accvgpr_read_b32 v15, a15 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 3 +; VGPRRC-NEXT: v_mov_b32_e32 v0, v8 +; VGPRRC-NEXT: v_mov_b32_e32 v1, v9 +; VGPRRC-NEXT: v_mov_b32_e32 v2, v10 +; VGPRRC-NEXT: v_mov_b32_e32 v3, v11 +; VGPRRC-NEXT: v_mov_b32_e32 v4, v12 +; VGPRRC-NEXT: v_mov_b32_e32 v5, v13 +; VGPRRC-NEXT: v_mov_b32_e32 v6, v14 +; VGPRRC-NEXT: v_mov_b32_e32 v7, v15 +; VGPRRC-NEXT: v_mov_b32_e32 v8, v16 +; VGPRRC-NEXT: v_mov_b32_e32 v9, v17 +; VGPRRC-NEXT: v_mov_b32_e32 v10, v18 +; VGPRRC-NEXT: v_mov_b32_e32 v11, v19 +; VGPRRC-NEXT: v_mov_b32_e32 v12, v20 +; VGPRRC-NEXT: v_mov_b32_e32 v13, v21 +; VGPRRC-NEXT: v_mov_b32_e32 v14, v22 +; VGPRRC-NEXT: v_mov_b32_e32 v15, v23 +; VGPRRC-NEXT: s_setpc_b64 s[30:31] +; AGPR-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, v12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, v13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, v14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, v15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, v16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, v17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, v18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, v19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, v20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, v21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, v22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, v23 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 3 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: v_accvgpr_read_b32 v4, a4 +; AGPR-NEXT: v_accvgpr_read_b32 v5, a5 +; AGPR-NEXT: v_accvgpr_read_b32 v6, a6 +; AGPR-NEXT: v_accvgpr_read_b32 v7, a7 +; AGPR-NEXT: v_accvgpr_read_b32 v8, a8 +; AGPR-NEXT: v_accvgpr_read_b32 v9, a9 +; AGPR-NEXT: v_accvgpr_read_b32 v10, a10 +; AGPR-NEXT: v_accvgpr_read_b32 v11, a11 +; AGPR-NEXT: v_accvgpr_read_b32 v12, a12 +; AGPR-NEXT: v_accvgpr_read_b32 v13, a13 +; AGPR-NEXT: v_accvgpr_read_b32 v14, a14 +; AGPR-NEXT: v_accvgpr_read_b32 v15, a15 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; VGPR-LABEL: test_mfma_i32_32x32x32_i8__mac__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[8:23], v[0:3], v[4:7], v[8:23] cbsz:1 abid:1 blgp:1 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 3 +; VGPR-NEXT: v_mov_b32_e32 v0, v8 +; VGPR-NEXT: v_mov_b32_e32 v1, v9 +; VGPR-NEXT: v_mov_b32_e32 v2, v10 +; VGPR-NEXT: v_mov_b32_e32 v3, v11 +; VGPR-NEXT: v_mov_b32_e32 v4, v12 +; VGPR-NEXT: v_mov_b32_e32 
v5, v13 +; VGPR-NEXT: v_mov_b32_e32 v6, v14 +; VGPR-NEXT: v_mov_b32_e32 v7, v15 +; VGPR-NEXT: v_mov_b32_e32 v8, v16 +; VGPR-NEXT: v_mov_b32_e32 v9, v17 +; VGPR-NEXT: v_mov_b32_e32 v10, v18 +; VGPR-NEXT: v_mov_b32_e32 v11, v19 +; VGPR-NEXT: v_mov_b32_e32 v12, v20 +; VGPR-NEXT: v_mov_b32_e32 v13, v21 +; VGPR-NEXT: v_mov_b32_e32 v14, v22 +; VGPR-NEXT: v_mov_b32_e32 v15, v23 +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 1, i32 1) ret <16 x i32> %result } @@ -1544,6 +4374,274 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd(<4 x i32> %arg0, <4 ; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: v_mov_b32_e32 v4, s24 +; HEURRC-NEXT: v_mov_b32_e32 v5, s25 +; HEURRC-NEXT: v_mov_b32_e32 v6, s26 +; HEURRC-NEXT: v_mov_b32_e32 v7, s27 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; 
HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s16 +; HEURRC-NEXT: v_mov_b32_e32 v1, s17 +; HEURRC-NEXT: v_mov_b32_e32 v2, s18 +; HEURRC-NEXT: v_mov_b32_e32 v3, s19 +; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s12 +; HEURRC-NEXT: v_mov_b32_e32 v1, s13 +; HEURRC-NEXT: v_mov_b32_e32 v2, s14 +; HEURRC-NEXT: v_mov_b32_e32 v3, s15 +; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0xa4 +; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] +; VGPRRC-NEXT: s_nop 6 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 +; 
VGPRRC-NEXT: v_mov_b32_e32 v19, s11 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: v_mov_b32_e32 v8, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 v3, s23 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: v_mov_b32_e32 v4, s24 +; AGPR-NEXT: v_mov_b32_e32 v5, s25 +; AGPR-NEXT: v_mov_b32_e32 v6, s26 +; AGPR-NEXT: v_mov_b32_e32 v7, s27 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a31, s23 +; AGPR-NEXT: v_accvgpr_write_b32 a30, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a29, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a28, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a27, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a26, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a25, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a24, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a23, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a22, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a21, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a20, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a19, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a18, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a17, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a16, s8 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] +; 
AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 v3, s23 +; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s16 +; AGPR-NEXT: v_mov_b32_e32 v1, s17 +; AGPR-NEXT: v_mov_b32_e32 v2, s18 +; AGPR-NEXT: v_mov_b32_e32 v3, s19 +; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s12 +; AGPR-NEXT: v_mov_b32_e32 v1, s13 +; AGPR-NEXT: v_mov_b32_e32 v2, s14 +; AGPR-NEXT: v_mov_b32_e32 v3, s15 +; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: v_mov_b32_e32 v40, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v32, s20 +; VGPR-NEXT: v_mov_b32_e32 v33, s21 +; VGPR-NEXT: v_mov_b32_e32 v34, s22 +; VGPR-NEXT: v_mov_b32_e32 v35, s23 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: v_mov_b32_e32 v36, s24 +; VGPR-NEXT: v_mov_b32_e32 v37, s25 +; 
VGPR-NEXT: v_mov_b32_e32 v38, s26 +; VGPR-NEXT: v_mov_b32_e32 v39, s27 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] +; VGPR-NEXT: s_nop 6 +; VGPR-NEXT: v_mov_b32_e32 v16, s20 +; VGPR-NEXT: v_mov_b32_e32 v17, s21 +; VGPR-NEXT: v_mov_b32_e32 v18, s22 +; VGPR-NEXT: v_mov_b32_e32 v19, s23 +; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s16 +; VGPR-NEXT: v_mov_b32_e32 v17, s17 +; VGPR-NEXT: v_mov_b32_e32 v18, s18 +; VGPR-NEXT: v_mov_b32_e32 v19, s19 +; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s12 +; VGPR-NEXT: v_mov_b32_e32 v17, s13 +; VGPR-NEXT: v_mov_b32_e32 v18, s14 +; VGPR-NEXT: v_mov_b32_e32 v19, s15 +; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s8 +; VGPR-NEXT: v_mov_b32_e32 v17, s9 +; VGPR-NEXT: v_mov_b32_e32 v18, s10 +; VGPR-NEXT: v_mov_b32_e32 v19, s11 +; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; 
VGPR-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) store volatile <16 x i32> %arg2, ptr addrspace(1) %out store volatile <16 x i32> %result, ptr addrspace(1) %out @@ -1675,6 +4773,274 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd__flags(<4 x i32> %a ; GISEL-NEXT: global_store_dwordx4 v24, a[28:31], s[0:1] offset:48 sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: v_mov_b32_e32 v4, s24 +; HEURRC-NEXT: v_mov_b32_e32 v5, s25 +; HEURRC-NEXT: v_mov_b32_e32 v6, s26 +; HEURRC-NEXT: v_mov_b32_e32 v7, s27 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a31, s23 +; HEURRC-NEXT: v_accvgpr_write_b32 a30, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a29, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a28, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a27, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a26, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a25, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a24, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a23, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a22, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a21, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a20, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a19, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a18, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a17, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a16, s8 +; HEURRC-NEXT: 
s_nop 1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s16 +; HEURRC-NEXT: v_mov_b32_e32 v1, s17 +; HEURRC-NEXT: v_mov_b32_e32 v2, s18 +; HEURRC-NEXT: v_mov_b32_e32 v3, s19 +; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s12 +; HEURRC-NEXT: v_mov_b32_e32 v1, s13 +; HEURRC-NEXT: v_mov_b32_e32 v2, s14 +; HEURRC-NEXT: v_mov_b32_e32 v3, s15 +; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_nop 0 +; HEURRC-NEXT: v_mov_b32_e32 v0, s8 +; HEURRC-NEXT: v_mov_b32_e32 v1, s9 +; HEURRC-NEXT: v_mov_b32_e32 v2, s10 +; HEURRC-NEXT: v_mov_b32_e32 v3, s11 +; HEURRC-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; HEURRC-NEXT: s_waitcnt vmcnt(0) +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPRRC-NEXT: v_mov_b32_e32 v40, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v32, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v33, s21 
+; VGPRRC-NEXT: v_mov_b32_e32 v34, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v35, s23 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: v_mov_b32_e32 v36, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v37, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v38, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v39, s27 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; VGPRRC-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPRRC-NEXT: s_nop 6 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s16 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s17 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s18 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s19 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s12 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s13 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s14 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s15 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_nop 0 +; VGPRRC-NEXT: v_mov_b32_e32 v16, s8 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s9 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s10 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s11 +; VGPRRC-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; 
VGPRRC-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPRRC-NEXT: s_waitcnt vmcnt(0) +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: v_mov_b32_e32 v8, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 v3, s23 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: v_mov_b32_e32 v4, s24 +; AGPR-NEXT: v_mov_b32_e32 v5, s25 +; AGPR-NEXT: v_mov_b32_e32 v6, s26 +; AGPR-NEXT: v_mov_b32_e32 v7, s27 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a31, s23 +; AGPR-NEXT: v_accvgpr_write_b32 a30, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a29, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a28, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a27, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a26, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a25, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a24, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a23, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a22, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a21, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a20, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a19, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a18, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a17, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a16, s8 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[16:31] cbsz:1 abid:2 blgp:3 +; AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: 
v_mov_b32_e32 v3, s23 +; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s16 +; AGPR-NEXT: v_mov_b32_e32 v1, s17 +; AGPR-NEXT: v_mov_b32_e32 v2, s18 +; AGPR-NEXT: v_mov_b32_e32 v3, s19 +; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s12 +; AGPR-NEXT: v_mov_b32_e32 v1, s13 +; AGPR-NEXT: v_mov_b32_e32 v2, s14 +; AGPR-NEXT: v_mov_b32_e32 v3, s15 +; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_nop 0 +; AGPR-NEXT: v_mov_b32_e32 v0, s8 +; AGPR-NEXT: v_mov_b32_e32 v1, s9 +; AGPR-NEXT: v_mov_b32_e32 v2, s10 +; AGPR-NEXT: v_mov_b32_e32 v3, s11 +; AGPR-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v8, a[8:11], s[0:1] offset:32 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v8, a[12:15], s[0:1] offset:48 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[0:1] sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: global_store_dwordx4 v8, a[4:7], s[0:1] offset:16 sc0 sc1 +; AGPR-NEXT: s_waitcnt vmcnt(0) +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: v_mov_b32_e32 v40, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v32, s20 +; VGPR-NEXT: v_mov_b32_e32 v33, s21 +; VGPR-NEXT: v_mov_b32_e32 v34, s22 +; VGPR-NEXT: v_mov_b32_e32 v35, s23 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: v_mov_b32_e32 v36, s24 +; VGPR-NEXT: v_mov_b32_e32 v37, s25 +; VGPR-NEXT: v_mov_b32_e32 v38, s26 +; VGPR-NEXT: v_mov_b32_e32 v39, s27 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; 
VGPR-NEXT: v_mov_b64_e32 v[30:31], s[22:23] +; VGPR-NEXT: v_mov_b64_e32 v[28:29], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[26:27], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[24:25], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[32:35], v[36:39], v[16:31] cbsz:1 abid:2 blgp:3 +; VGPR-NEXT: s_nop 6 +; VGPR-NEXT: v_mov_b32_e32 v16, s20 +; VGPR-NEXT: v_mov_b32_e32 v17, s21 +; VGPR-NEXT: v_mov_b32_e32 v18, s22 +; VGPR-NEXT: v_mov_b32_e32 v19, s23 +; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s16 +; VGPR-NEXT: v_mov_b32_e32 v17, s17 +; VGPR-NEXT: v_mov_b32_e32 v18, s18 +; VGPR-NEXT: v_mov_b32_e32 v19, s19 +; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s12 +; VGPR-NEXT: v_mov_b32_e32 v17, s13 +; VGPR-NEXT: v_mov_b32_e32 v18, s14 +; VGPR-NEXT: v_mov_b32_e32 v19, s15 +; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: s_nop 0 +; VGPR-NEXT: v_mov_b32_e32 v16, s8 +; VGPR-NEXT: v_mov_b32_e32 v17, s9 +; VGPR-NEXT: v_mov_b32_e32 v18, s10 +; VGPR-NEXT: v_mov_b32_e32 v19, s11 +; VGPR-NEXT: global_store_dwordx4 v40, v[16:19], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v40, v[8:11], s[0:1] offset:32 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v40, v[12:15], s[0:1] offset:48 sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v40, v[0:3], s[0:1] sc0 sc1 +; VGPR-NEXT: s_waitcnt vmcnt(0) +; VGPR-NEXT: global_store_dwordx4 v40, v[4:7], s[0:1] offset:16 sc0 sc1 +; VGPR-NEXT: s_waitcnt 
vmcnt(0) +; VGPR-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 1, i32 2, i32 3) store volatile <16 x i32> %arg2, ptr addrspace(1) %out store volatile <16 x i32> %result, ptr addrspace(1) %out @@ -1760,6 +5126,156 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: v_mov_b32_e32 v4, s24 +; HEURRC-NEXT: v_mov_b32_e32 v5, s25 +; HEURRC-NEXT: v_mov_b32_e32 v6, s26 +; HEURRC-NEXT: v_mov_b32_e32 v7, s27 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; 
HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: v_mov_b32_e32 v20, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v21, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v22, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v23, s27 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: 
s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 v3, s23 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: v_mov_b32_e32 v4, s24 +; AGPR-NEXT: v_mov_b32_e32 v5, s25 +; AGPR-NEXT: v_mov_b32_e32 v6, s26 +; AGPR-NEXT: v_mov_b32_e32 v7, s27 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, s8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, s23 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] +; AGPR-NEXT: v_mov_b32_e32 v0, 0 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 2 +; AGPR-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; AGPR-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v16, s20 +; VGPR-NEXT: v_mov_b32_e32 v17, s21 +; VGPR-NEXT: v_mov_b32_e32 v18, s22 +; VGPR-NEXT: v_mov_b32_e32 v19, s23 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: v_mov_b32_e32 v20, s24 +; VGPR-NEXT: v_mov_b32_e32 v21, s25 +; 
VGPR-NEXT: v_mov_b32_e32 v22, s26 +; VGPR-NEXT: v_mov_b32_e32 v23, s27 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] +; VGPR-NEXT: v_mov_b32_e32 v16, 0 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 2 +; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; VGPR-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 0, i32 0, i32 0) store <16 x i32> %result, ptr addrspace(1) %out ret void @@ -1844,6 +5360,156 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GISEL-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GISEL-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; HEURRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b32_e32 v0, s20 +; HEURRC-NEXT: v_mov_b32_e32 v1, s21 +; HEURRC-NEXT: v_mov_b32_e32 v2, s22 +; HEURRC-NEXT: v_mov_b32_e32 v3, s23 +; HEURRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; HEURRC-NEXT: v_mov_b32_e32 v4, s24 +; HEURRC-NEXT: v_mov_b32_e32 v5, s25 +; HEURRC-NEXT: v_mov_b32_e32 v6, s26 +; HEURRC-NEXT: v_mov_b32_e32 v7, s27 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; 
HEURRC-NEXT: v_accvgpr_write_b32 a0, s8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s11 +; HEURRC-NEXT: v_accvgpr_write_b32 a4, s12 +; HEURRC-NEXT: v_accvgpr_write_b32 a5, s13 +; HEURRC-NEXT: v_accvgpr_write_b32 a6, s14 +; HEURRC-NEXT: v_accvgpr_write_b32 a7, s15 +; HEURRC-NEXT: v_accvgpr_write_b32 a8, s16 +; HEURRC-NEXT: v_accvgpr_write_b32 a9, s17 +; HEURRC-NEXT: v_accvgpr_write_b32 a10, s18 +; HEURRC-NEXT: v_accvgpr_write_b32 a11, s19 +; HEURRC-NEXT: v_accvgpr_write_b32 a12, s20 +; HEURRC-NEXT: v_accvgpr_write_b32 a13, s21 +; HEURRC-NEXT: v_accvgpr_write_b32 a14, s22 +; HEURRC-NEXT: v_accvgpr_write_b32 a15, s23 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: v_mov_b32_e32 v0, 0 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: s_nop 2 +; HEURRC-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; HEURRC-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; HEURRC-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; HEURRC-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; VGPRRC-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b32_e32 v16, s20 +; VGPRRC-NEXT: v_mov_b32_e32 v17, s21 +; VGPRRC-NEXT: v_mov_b32_e32 v18, s22 +; VGPRRC-NEXT: v_mov_b32_e32 v19, s23 +; VGPRRC-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPRRC-NEXT: v_mov_b32_e32 v20, s24 +; VGPRRC-NEXT: v_mov_b32_e32 v21, s25 +; VGPRRC-NEXT: v_mov_b32_e32 v22, s26 +; VGPRRC-NEXT: v_mov_b32_e32 v23, s27 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], 
s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPRRC-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPRRC-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: v_mov_b32_e32 v16, 0 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: s_nop 2 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; VGPRRC-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; AGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b32_e32 v0, s20 +; AGPR-NEXT: v_mov_b32_e32 v1, s21 +; AGPR-NEXT: v_mov_b32_e32 v2, s22 +; AGPR-NEXT: v_mov_b32_e32 v3, s23 +; AGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; AGPR-NEXT: v_mov_b32_e32 v4, s24 +; AGPR-NEXT: v_mov_b32_e32 v5, s25 +; AGPR-NEXT: v_mov_b32_e32 v6, s26 +; AGPR-NEXT: v_mov_b32_e32 v7, s27 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, s8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, s9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s11 +; AGPR-NEXT: v_accvgpr_write_b32 a4, s12 +; AGPR-NEXT: v_accvgpr_write_b32 a5, s13 +; AGPR-NEXT: v_accvgpr_write_b32 a6, s14 +; AGPR-NEXT: v_accvgpr_write_b32 a7, s15 +; AGPR-NEXT: v_accvgpr_write_b32 a8, s16 +; AGPR-NEXT: v_accvgpr_write_b32 a9, s17 +; AGPR-NEXT: v_accvgpr_write_b32 a10, s18 +; AGPR-NEXT: v_accvgpr_write_b32 a11, s19 +; AGPR-NEXT: v_accvgpr_write_b32 a12, s20 +; AGPR-NEXT: v_accvgpr_write_b32 a13, s21 +; AGPR-NEXT: v_accvgpr_write_b32 a14, s22 +; AGPR-NEXT: v_accvgpr_write_b32 a15, s23 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: 
v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 +; AGPR-NEXT: v_mov_b32_e32 v0, 0 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: s_nop 2 +; AGPR-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 +; AGPR-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 +; AGPR-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 +; AGPR-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_i32_32x32x32_i8__vgprcd_mac_flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[20:27], s[4:5], 0x24 +; VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xa4 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b32_e32 v16, s20 +; VGPR-NEXT: v_mov_b32_e32 v17, s21 +; VGPR-NEXT: v_mov_b32_e32 v18, s22 +; VGPR-NEXT: v_mov_b32_e32 v19, s23 +; VGPR-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x64 +; VGPR-NEXT: v_mov_b32_e32 v20, s24 +; VGPR-NEXT: v_mov_b32_e32 v21, s25 +; VGPR-NEXT: v_mov_b32_e32 v22, s26 +; VGPR-NEXT: v_mov_b32_e32 v23, s27 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[16:17] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[18:19] +; VGPR-NEXT: v_mov_b64_e32 v[12:13], s[20:21] +; VGPR-NEXT: v_mov_b64_e32 v[14:15], s[22:23] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_i32_32x32x32_i8 v[0:15], v[16:19], v[20:23], v[0:15] cbsz:3 abid:2 blgp:1 +; VGPR-NEXT: v_mov_b32_e32 v16, 0 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: s_nop 2 +; VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48 +; VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32 +; VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16 +; VGPR-NEXT: global_store_dwordx4 v16, v[0:3], s[0:1] +; VGPR-NEXT: s_endpgm %result = call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x32.i8(<4 x i32> %arg0, <4 x i32> %arg1, <16 x i32> %arg2, i32 3, i32 2, i32 1) store 
<16 x i32> %result, ptr addrspace(1) %out ret void @@ -1871,6 +5537,48 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: s_setpc_b64 s[30:31] +; AGPR-LABEL: test_mfma_f32_16x16x32_bf16: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; VGPR-LABEL: test_mfma_f32_16x16x32_bf16: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPR-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> 
@llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) ret <4 x float> %result } @@ -1891,6 +5599,48 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 ; GCN-NEXT: v_accvgpr_read_b32 v3, a3 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8 +; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0 +; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1 +; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2 +; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3 +; HEURRC-NEXT: s_setpc_b64 s[30:31] +; +; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1 +; VGPRRC-NEXT: s_setpc_b64 s[30:31] +; AGPR-LABEL: test_mfma_f32_16x16x32_bf16__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; AGPR-NEXT: v_accvgpr_write_b32 a0, v8 +; AGPR-NEXT: v_accvgpr_write_b32 a1, v9 +; AGPR-NEXT: v_accvgpr_write_b32 a2, v10 +; AGPR-NEXT: v_accvgpr_write_b32 a3, v11 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: v_accvgpr_read_b32 v0, a0 +; AGPR-NEXT: v_accvgpr_read_b32 v1, a1 +; AGPR-NEXT: v_accvgpr_read_b32 v2, a2 +; AGPR-NEXT: v_accvgpr_read_b32 v3, a3 +; AGPR-NEXT: s_setpc_b64 s[30:31] +; VGPR-LABEL: test_mfma_f32_16x16x32_bf16__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; VGPR-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:1 abid:1 blgp:1 +; VGPR-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1) ret <4 x float> %result } @@ -1916,6 +5666,84 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPRRC-NEXT: s_nop 1 +; 
VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: +; AGPR: ; %bb.0: +; AGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; AGPR-NEXT: v_mov_b32_e32 v8, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; AGPR-NEXT: v_accvgpr_write_b32 a0, s0 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; AGPR-NEXT: v_accvgpr_write_b32 a1, s1 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s2 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s3 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPR-NEXT: v_mov_b32_e32 v12, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPR-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) store <4 x float> %result, ptr addrspace(1) %out ret void @@ -1942,6 +5770,84 @@ define 
amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(pt ; GCN-NEXT: s_nop 7 ; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; GCN-NEXT: s_endpgm +; +; HEURRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: +; HEURRC: ; %bb.0: +; HEURRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; HEURRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; HEURRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; HEURRC-NEXT: v_mov_b32_e32 v8, 0 +; HEURRC-NEXT: s_waitcnt lgkmcnt(0) +; HEURRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; HEURRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; HEURRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; HEURRC-NEXT: v_accvgpr_write_b32 a0, s0 +; HEURRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; HEURRC-NEXT: v_accvgpr_write_b32 a1, s1 +; HEURRC-NEXT: v_accvgpr_write_b32 a2, s2 +; HEURRC-NEXT: v_accvgpr_write_b32 a3, s3 +; HEURRC-NEXT: s_nop 1 +; HEURRC-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; HEURRC-NEXT: s_nop 7 +; HEURRC-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; HEURRC-NEXT: s_endpgm +; +; VGPRRC-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: +; VGPRRC: ; %bb.0: +; VGPRRC-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPRRC-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPRRC-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPRRC-NEXT: v_mov_b32_e32 v12, 0 +; VGPRRC-NEXT: s_waitcnt lgkmcnt(0) +; VGPRRC-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPRRC-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPRRC-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPRRC-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; VGPRRC-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPRRC-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPRRC-NEXT: s_nop 1 +; VGPRRC-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPRRC-NEXT: s_nop 7 +; VGPRRC-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPRRC-NEXT: s_endpgm +; AGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: +; AGPR: ; %bb.0: +; AGPR-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0x34 +; AGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; AGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; AGPR-NEXT: v_mov_b32_e32 v8, 0 +; AGPR-NEXT: s_waitcnt lgkmcnt(0) +; AGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; AGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; AGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; AGPR-NEXT: v_accvgpr_write_b32 a0, s0 +; AGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; AGPR-NEXT: v_accvgpr_write_b32 a1, s1 +; AGPR-NEXT: v_accvgpr_write_b32 a2, s2 +; AGPR-NEXT: v_accvgpr_write_b32 a3, s3 +; AGPR-NEXT: s_nop 1 +; AGPR-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; AGPR-NEXT: s_nop 7 +; AGPR-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; AGPR-NEXT: s_endpgm +; VGPR-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: +; VGPR: ; %bb.0: +; VGPR-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; VGPR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; VGPR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; VGPR-NEXT: v_mov_b32_e32 v12, 0 +; VGPR-NEXT: s_waitcnt lgkmcnt(0) +; VGPR-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; VGPR-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; VGPR-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; VGPR-NEXT: v_mov_b64_e32 v[10:11], s[2:3] +; VGPR-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; VGPR-NEXT: v_mov_b64_e32 v[8:9], s[0:1] +; VGPR-NEXT: s_nop 1 +; VGPR-NEXT: v_mfma_f32_16x16x32_bf16 v[0:3], v[0:3], v[4:7], v[8:11] cbsz:3 abid:2 blgp:1 +; VGPR-NEXT: s_nop 7 +; VGPR-NEXT: global_store_dwordx4 v12, v[0:3], s[6:7] +; VGPR-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) store <4 x float> %result, ptr addrspace(1) %out ret void From 004c67ea257039e4e98abc26dd4ac6e8f3d7a171 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 18 Jul 2025 22:58:19 +0200 Subject: [PATCH 401/813] [LV] Vectorize maxnum/minnum w/o fast-math flags. 
(#148239) Update LV to vectorize maxnum/minnum reductions without fast-math flags, by adding an extra check in the loop if any inputs to maxnum/minnum are NaN, due to maxnum/minnum behavior w.r.t to signaling NaNs. Signed-zeros are already handled consistently by maxnum/minnum. If any input is NaN, *exit the vector loop, *compute the reduction result up to the vector iteration that contained NaN inputs and * resume in the scalar loop New recurrence kinds are added for reductions using maxnum/minnum without fast-math flags. PR: https://github.com/llvm/llvm-project/pull/148239 --- llvm/include/llvm/Analysis/IVDescriptors.h | 3 + llvm/lib/Analysis/IVDescriptors.cpp | 26 +- llvm/lib/Transforms/Utils/LoopUtils.cpp | 10 +- .../Vectorize/LoopVectorizationPlanner.h | 12 +- .../Transforms/Vectorize/LoopVectorize.cpp | 18 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 6 + .../Transforms/Vectorize/VPlanAnalysis.cpp | 1 + .../Vectorize/VPlanConstruction.cpp | 160 +++++++++++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 +- .../Transforms/Vectorize/VPlanTransforms.h | 6 + .../AArch64/fmax-without-fast-math-flags.ll | 55 +++- .../AArch64/fmin-without-fast-math-flags.ll | 55 +++- ...fmax-without-fast-math-flags-interleave.ll | 55 +++- .../fmax-without-fast-math-flags.ll | 272 +++++++++++++++++- .../fmin-without-fast-math-flags.ll | 94 +++++- .../LoopVectorize/minmax_reduction.ll | 8 +- 16 files changed, 731 insertions(+), 58 deletions(-) diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h index b985292ccee40..1dc73205a0ebb 100644 --- a/llvm/include/llvm/Analysis/IVDescriptors.h +++ b/llvm/include/llvm/Analysis/IVDescriptors.h @@ -47,6 +47,8 @@ enum class RecurKind { FMul, ///< Product of floats. FMin, ///< FP min implemented in terms of select(cmp()). FMax, ///< FP max implemented in terms of select(cmp()). + FMinNum, ///< FP min with llvm.minnum semantics including NaNs. 
+ FMaxNum, ///< FP max with llvm.maxnum semantics including NaNs. FMinimum, ///< FP min with llvm.minimum semantics FMaximum, ///< FP max with llvm.maximum semantics FMinimumNum, ///< FP min with llvm.minimumnum semantics @@ -250,6 +252,7 @@ class RecurrenceDescriptor { /// Returns true if the recurrence kind is a floating-point min/max kind. static bool isFPMinMaxRecurrenceKind(RecurKind Kind) { return Kind == RecurKind::FMin || Kind == RecurKind::FMax || + Kind == RecurKind::FMinNum || Kind == RecurKind::FMaxNum || Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimumNum || Kind == RecurKind::FMaximumNum; } diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp index 39f74beca082f..8be5de3bf356f 100644 --- a/llvm/lib/Analysis/IVDescriptors.cpp +++ b/llvm/lib/Analysis/IVDescriptors.cpp @@ -941,10 +941,30 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr( m_Intrinsic(m_Value(), m_Value())) || match(I, m_Intrinsic(m_Value(), m_Value())); }; - if (isIntMinMaxRecurrenceKind(Kind) || - (HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind))) + if (isIntMinMaxRecurrenceKind(Kind)) return isMinMaxPattern(I, Kind, Prev); - else if (isFMulAddIntrinsic(I)) + if (isFPMinMaxRecurrenceKind(Kind)) { + InstDesc Res = isMinMaxPattern(I, Kind, Prev); + if (!Res.isRecurrence()) + return InstDesc(false, I); + if (HasRequiredFMF()) + return Res; + // We may be able to vectorize FMax/FMin reductions using maxnum/minnum + // intrinsics with extra checks ensuring the vector loop handles only + // non-NaN inputs. 
+ if (match(I, m_Intrinsic(m_Value(), m_Value()))) { + assert(Kind == RecurKind::FMax && + "unexpected recurrence kind for maxnum"); + return InstDesc(I, RecurKind::FMaxNum); + } + if (match(I, m_Intrinsic(m_Value(), m_Value()))) { + assert(Kind == RecurKind::FMin && + "unexpected recurrence kind for minnum"); + return InstDesc(I, RecurKind::FMinNum); + } + return InstDesc(false, I); + } + if (isFMulAddIntrinsic(I)) return InstDesc(Kind == RecurKind::FMulAdd, I, I->hasAllowReassoc() ? nullptr : I); return InstDesc(false, I); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 200d1fb854155..e7623aaff105d 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -938,8 +938,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) { case RecurKind::UMin: return Intrinsic::vector_reduce_umin; case RecurKind::FMax: + case RecurKind::FMaxNum: return Intrinsic::vector_reduce_fmax; case RecurKind::FMin: + case RecurKind::FMinNum: return Intrinsic::vector_reduce_fmin; case RecurKind::FMaximum: return Intrinsic::vector_reduce_fmaximum; @@ -1037,8 +1039,10 @@ Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) { case RecurKind::SMax: return Intrinsic::smax; case RecurKind::FMin: + case RecurKind::FMinNum: return Intrinsic::minnum; case RecurKind::FMax: + case RecurKind::FMaxNum: return Intrinsic::maxnum; case RecurKind::FMinimum: return Intrinsic::minimum; @@ -1096,9 +1100,9 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right) { Type *Ty = Left->getType(); if (Ty->isIntOrIntVectorTy() || - (RK == RecurKind::FMinimum || RK == RecurKind::FMaximum || + (RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum || + RK == RecurKind::FMinimum || RK == RecurKind::FMaximum || RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum)) { - // TODO: Add float minnum/maxnum support when FMF nnan is set. 
Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK); return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr, "rdx.minmax"); @@ -1308,6 +1312,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src, case RecurKind::UMin: case RecurKind::FMax: case RecurKind::FMin: + case RecurKind::FMinNum: + case RecurKind::FMaxNum: case RecurKind::FMinimum: case RecurKind::FMaximum: case RecurKind::FMinimumNum: diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 11853859484e3..f57ce0c3ccb4d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -230,7 +230,6 @@ class VPBuilder { /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A /// and \p B. - /// TODO: add createFCmp when needed. VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { @@ -240,6 +239,17 @@ class VPBuilder { new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name)); } + /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A + /// and \p B. 
+ VPInstruction *createFCmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { + assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE && + Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate"); + return tryInsertInstruction( + new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name)); + } + VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f142e0796b52a..6e420632d83e5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4361,10 +4361,14 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { - // Cross iteration phis such as reductions need special handling and are - // currently unsupported. - if (any_of(OrigLoop->getHeader()->phis(), - [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) + // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum + // reductions need special handling and are currently unsupported. + if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { + if (!Legal->isReductionVariable(&Phi)) + return Legal->isFixedOrderRecurrence(&Phi); + RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind(); + return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum; + })) return false; // Phis with uses outside of the loop require special handling and are @@ -8787,6 +8791,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Adjust the recipes for any inloop reductions. 
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + // Apply mandatory transformation to handle FP maxnum/minnum reduction with + // NaNs if possible, bail out otherwise. + if (!VPlanTransforms::runPass( + VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath, *Plan)) + return nullptr; + // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 6ad5c60105a28..0d0b342505214 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -23202,6 +23202,8 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FMaxNum: + case RecurKind::FMinNum: case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: case RecurKind::None: @@ -23339,6 +23341,8 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FMaxNum: + case RecurKind::FMinNum: case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: case RecurKind::None: @@ -23441,6 +23445,8 @@ class HorizontalReduction { case RecurKind::FindFirstIVUMin: case RecurKind::FindLastIVSMax: case RecurKind::FindLastIVUMax: + case RecurKind::FMaxNum: + case RecurKind::FMinNum: case RecurKind::FMaximumNum: case RecurKind::FMinimumNum: case RecurKind::None: diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index ca8729ae2e00e..3499e650ae853 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -84,6 +84,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { return ResTy; } case 
Instruction::ICmp: + case Instruction::FCmp: case VPInstruction::ActiveLaneMask: assert(inferScalarType(R->getOperand(0)) == inferScalarType(R->getOperand(1)) && diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 7fb5e82f9d32b..1a614c3c12119 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -652,3 +652,163 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, Term->addMetadata(LLVMContext::MD_prof, BranchWeights); } } + +bool VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan) { + auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * { + auto *MinMaxR = dyn_cast( + RedPhiR->getBackedgeValue()->getDefiningRecipe()); + if (!MinMaxR) + return nullptr; + + auto *RepR = dyn_cast(MinMaxR); + if (!isa(MinMaxR) && + !(RepR && isa(RepR->getUnderlyingInstr()))) + return nullptr; + +#ifndef NDEBUG + Intrinsic::ID RdxIntrinsicId = + RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? Intrinsic::maxnum + : Intrinsic::minnum; + assert((isa(MinMaxR) && + cast(MinMaxR)->getVectorIntrinsicID() == + RdxIntrinsicId) || + (RepR && + cast(RepR->getUnderlyingInstr())->getIntrinsicID() == + RdxIntrinsicId) && + "Intrinsic did not match recurrence kind"); +#endif + + if (MinMaxR->getOperand(0) == RedPhiR) + return MinMaxR->getOperand(1); + + assert(MinMaxR->getOperand(1) == RedPhiR && + "Reduction phi operand expected"); + return MinMaxR->getOperand(0); + }; + + VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); + VPReductionPHIRecipe *RedPhiR = nullptr; + bool HasUnsupportedPhi = false; + for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) { + if (isa(&R)) + continue; + auto *Cur = dyn_cast(&R); + if (!Cur) { + // TODO: Also support fixed-order recurrence phis. + HasUnsupportedPhi = true; + continue; + } + // For now, only a single reduction is supported. 
+ // TODO: Support multiple MaxNum/MinNum reductions and other reductions. + if (RedPhiR) + return false; + if (Cur->getRecurrenceKind() != RecurKind::FMaxNum && + Cur->getRecurrenceKind() != RecurKind::FMinNum) { + HasUnsupportedPhi = true; + continue; + } + RedPhiR = Cur; + } + + if (!RedPhiR) + return true; + + // We won't be able to resume execution in the scalar tail, if there are + // unsupported header phis or there is no scalar tail at all, due to + // tail-folding. + if (HasUnsupportedPhi || !Plan.hasScalarTail()) + return false; + + VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR); + if (!MinMaxOp) + return false; + + RecurKind RedPhiRK = RedPhiR->getRecurrenceKind(); + assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) && + "unsupported reduction"); + + /// Check if the vector loop of \p Plan can early exit and restart + /// execution of last vector iteration in the scalar loop. This requires all + /// recipes up to early exit point be side-effect free as they are + /// re-executed. Currently we check that the loop is free of any recipe that + /// may write to memory. Expected to operate on an early VPlan w/o nested + /// regions. 
+ for (VPBlockBase *VPB : vp_depth_first_shallow( + Plan.getVectorLoopRegion()->getEntryBasicBlock())) { + auto *VPBB = cast(VPB); + for (auto &R : *VPBB) { + if (R.mayWriteToMemory() && + !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue()))) + return false; + } + } + + VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock(); + VPBuilder Builder(LatchVPBB->getTerminator()); + auto *LatchExitingBranch = cast(LatchVPBB->getTerminator()); + assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount && + "Unexpected terminator"); + auto *IsLatchExitTaken = + Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0), + LatchExitingBranch->getOperand(1)); + + VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp); + VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN}); + auto *AnyExitTaken = + Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken}); + Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken); + LatchExitingBranch->eraseFromParent(); + + // If we exit early due to NaNs, compute the final reduction result based on + // the reduction phi at the beginning of the last vector iteration. + auto *RdxResult = find_singleton( + RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * { + auto *VPI = dyn_cast(U); + if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult) + return VPI; + return nullptr; + }); + + auto *MiddleVPBB = Plan.getMiddleBlock(); + Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin()); + auto *NewSel = + Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1)); + RdxResult->setOperand(1, NewSel); + + auto *ScalarPH = Plan.getScalarPreheader(); + // Update resume phis for inductions in the scalar preheader. If AnyNaN is + // true, the resume from the start of the last vector iteration via the + // canonical IV, otherwise from the original value. 
+ for (auto &R : ScalarPH->phis()) { + auto *ResumeR = cast(&R); + VPValue *VecV = ResumeR->getOperand(0); + if (VecV == RdxResult) + continue; + if (auto *DerivedIV = dyn_cast(VecV)) { + if (DerivedIV->getNumUsers() == 1 && + DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) { + auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), + &Plan.getVectorTripCount()); + DerivedIV->moveAfter(&*Builder.getInsertPoint()); + DerivedIV->setOperand(1, NewSel); + continue; + } + } + // Bail out and abandon the current, partially modified, VPlan if we + // encounter resume phi that cannot be updated yet. + if (VecV != &Plan.getVectorTripCount()) { + LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with " + "FMaxNum/FMinNum reduction.\n"); + return false; + } + auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV); + ResumeR->setOperand(0, NewSel); + } + + auto *MiddleTerm = MiddleVPBB->getTerminator(); + Builder.setInsertPoint(MiddleTerm); + VPValue *MiddleCond = MiddleTerm->getOperand(0); + VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN)); + MiddleTerm->setOperand(0, NewCond); + return true; +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1664bcc3881aa..57b713d3dfcb9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -587,6 +587,7 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this)); return Builder.CreateFreeze(Op, Name); } + case Instruction::FCmp: case Instruction::ICmp: { bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this); Value *A = State.get(getOperand(0), OnlyFirstLaneUsed); @@ -860,7 +861,7 @@ Value *VPInstruction::generate(VPTransformState &State) { Value *Res = State.get(getOperand(0)); for (VPValue *Op : drop_begin(operands())) Res = Builder.CreateOr(Res, 
State.get(Op)); - return Builder.CreateOrReduce(Res); + return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res); } case VPInstruction::FirstActiveLane: { if (getNumOperands() == 1) { @@ -1033,6 +1034,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const { switch (getOpcode()) { case Instruction::ExtractElement: case Instruction::Freeze: + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: case VPInstruction::AnyOf: @@ -1068,6 +1070,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const { return Op == getOperand(1); case Instruction::PHI: return true; + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: case Instruction::Or: @@ -1100,6 +1103,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const { switch (getOpcode()) { default: return false; + case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: return vputils::onlyFirstPartUsed(this); @@ -1786,7 +1790,7 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const { return Opcode == Instruction::ZExt; break; case OperationType::Cmp: - return Opcode == Instruction::ICmp; + return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp; case OperationType::Other: return true; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 84a12470f45e4..04cb7a7a5c19b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -103,6 +103,12 @@ struct VPlanTransforms { /// not valid. static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder); + /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If they do, + /// try to update the vector loop to exit early if any input is NaN and resume + /// executing in the scalar loop to handle the NaNs there. Return false if + /// this attempt was unsuccessful. 
+ static bool handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan); + /// Clear NSW/NUW flags from reduction instructions if necessary. static void clearReductionWrapFlags(VPlan &Plan); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll index 451574a258c2b..427a05cc1c843 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll @@ -42,18 +42,59 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; 
CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll index e93ee5563b057..1a8e5940d88e7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll @@ -42,18 +42,59 @@ define float @fminnum(ptr %src, i64 %n) { ; CHECK-LABEL: define float 
@fminnum( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 +; 
CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ 
[[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll index b2e080fef2e57..a2eddad179216 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll @@ -42,18 +42,59 @@ define float @fmaxnum(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), 
%[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]] +; 
CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; 
CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll index 5661406b88a5a..1ca5586942d7c 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll @@ -192,18 +192,51 @@ define float @fmaxnum_1(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum_1( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 
@llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr 
inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: @@ -227,18 +260,234 @@ define float @fmaxnum_2(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fmaxnum_2( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: 
[[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 +; CHECK-NEXT: [[MAX_NEXT]] = call float 
@llvm.maxnum.f32(float [[MAX]], float [[L]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %max.next = call float @llvm.maxnum.f32(float %max, float %l) + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %max.next +} + +define float @fmaxnum_induction_starts_at_10(ptr %src, i64 %n) { +; CHECK-LABEL: define float @fmaxnum_induction_starts_at_10( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -10 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = add i64 10, [[INDEX]] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: 
[[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = add i64 10, [[TMP9]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = and i1 [[CMP_N]], [[TMP13]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 10, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: 
[[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 +; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 10, %entry ], [ %iv.next, %loop ] + %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %max.next = call float @llvm.maxnum.f32(float %l, float %max) + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %max.next +} + +define float @fmaxnum_induction_starts_at_value(ptr %src, i64 %start, i64 %n) { +; CHECK-LABEL: define float @fmaxnum_induction_starts_at_value( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[START:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[START]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi 
<4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = add i64 [[START]], [[INDEX]] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]]) +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[START]], [[TMP9]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = and i1 [[CMP_N]], [[TMP13]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 +; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ %start, %entry ], [ %iv.next, %loop ] + %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ] + %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv + %l = load float, ptr %gep.src, align 4 + %max.next = call float @llvm.maxnum.f32(float %l, float %max) + %iv.next = add nuw nsw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret float %max.next +} + +define float @fmaxnum_with_additional_add(ptr noalias %src, ptr noalias %src.2, i64 %n) { +; CHECK-LABEL: define float @fmaxnum_with_additional_add( +; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[MAX:%.*]] = phi float [ 
-1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUM_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw i32, ptr [[SRC_2]], i64 [[IV]] +; CHECK-NEXT: [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4 +; CHECK-NEXT: [[SUM_NEXT]] = add i32 [[SUM]], [[L_SRC_2]] ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 -; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]]) +; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]]) ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] ; CHECK: [[EXIT]]: +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], %[[LOOP]] ] ; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: store i32 [[SUM_NEXT_LCSSA]], ptr [[SRC_2]], align 4 ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: @@ -247,14 +496,19 @@ entry: loop: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ] + %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ] + %gep.src.2 = getelementptr inbounds nuw i32, ptr %src.2, i64 %iv + %l.src.2 = load i32, ptr %gep.src.2, align 4 + %sum.next = add i32 %sum, %l.src.2 %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv %l = load float, ptr %gep.src, align 4 - %max.next = call float @llvm.maxnum.f32(float %max, float %l) + %max.next = call float @llvm.maxnum.f32(float %l, float %max) %iv.next = add nuw nsw i64 %iv, 1 %ec = icmp eq i64 %iv.next, %n br i1 %ec, label %exit, label %loop exit: + store i32 %sum.next, ptr %src.2 ret float %max.next } diff --git a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll 
b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll index 148beb64a3609..68bc8d0640a3f 100644 --- a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll @@ -192,18 +192,51 @@ define float @fminnum_1(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fminnum_1( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 
[[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[L]], float [[MAX]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 
[[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: @@ -227,18 +260,51 @@ define float @fminnum_2(ptr %src, i64 %n) { ; CHECK-LABEL: define float @fminnum_2( ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0 +; CHECK-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4 +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]] +; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4 ; CHECK-NEXT: [[MAX_NEXT]] = call float 
@llvm.minnum.f32(float [[MAX]], float [[L]]) -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[EXIT]]: -; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[MAX_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll index 85a90f2e04c5e..e7ab02cd98a5e 100644 --- a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll @@ -1001,8 +1001,10 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +; This can be vectorized with additional runtime checks for NaNs. ; CHECK-LABEL: @fmin_intrinsic_nofast( -; CHECK-NOT: <2 x float> @llvm.minnum.v2f32 +; CHECK: <2 x float> @llvm.minnum.v2f32 +; CHECK: fcmp uno <2 x float> [[OP:.+]], [[OP]] define float @fmin_intrinsic_nofast(ptr nocapture readonly %x) { entry: br label %for.body @@ -1021,8 +1023,10 @@ for.body: ; preds = %entry, %for.body br i1 %exitcond.not, label %for.cond.cleanup, label %for.body } +; This can be vectorized with additional runtime checks for NaNs. 
; CHECK-LABEL: @fmax_intrinsic_nofast( -; CHECK-NOT: <2 x float> @llvm.maxnum.v2f32 +; CHECK: <2 x float> @llvm.maxnum.v2f32 +; CHECK: fcmp uno <2 x float> [[OP:.+]], [[OP]] define float @fmax_intrinsic_nofast(ptr nocapture readonly %x) { entry: br label %for.body From fb5c94e712e683e8b7c3cd04b3e47584c226d751 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Fri, 18 Jul 2025 14:00:32 -0700 Subject: [PATCH 402/813] [profdata] Use --hot-func-list to show all hot functions (#149428) The `--hot-func-list` flag is used for sample profiles to dump the list of hot functions. Add support to dump hot functions for IRPGO profiles as well. This also removes a `priority_queue` used for `--topn`. We can instead store all functions and sort at the end before dumping. Since we are storing `StringRef`s, I believe this won't consume too much memory. --- llvm/test/tools/llvm-profdata/c-general.test | 6 +-- .../tools/llvm-profdata/show-hot.proftext | 35 +++++++++++++ llvm/tools/llvm-profdata/llvm-profdata.cpp | 50 ++++++++----------- 3 files changed, 60 insertions(+), 31 deletions(-) create mode 100644 llvm/test/tools/llvm-profdata/show-hot.proftext diff --git a/llvm/test/tools/llvm-profdata/c-general.test b/llvm/test/tools/llvm-profdata/c-general.test index 7c48f7b04a05c..ab4849fac034f 100644 --- a/llvm/test/tools/llvm-profdata/c-general.test +++ b/llvm/test/tools/llvm-profdata/c-general.test @@ -22,6 +22,6 @@ SWITCHES-LABEL: Functions shown: 1 CHECK-LABEL: Total functions: 12 CHECK-NEXT: Maximum function count: 1 CHECK-NEXT: Maximum internal block count: 100 -TOPN: boolean_operators, max count = 100 -TOPN-NEXT: simple_loops, max count = 100 -TOPN-NEXT: conditionals, max count = 100 +TOPN: simple_loops, max count = 100 +TOPN-NEXT: conditionals, max count = 100 +TOPN-NEXT: boolean_operators, max count = 100 diff --git a/llvm/test/tools/llvm-profdata/show-hot.proftext b/llvm/test/tools/llvm-profdata/show-hot.proftext new file mode 100644 index 0000000000000..5c9bd61c20d28 --- /dev/null 
+++ b/llvm/test/tools/llvm-profdata/show-hot.proftext @@ -0,0 +1,35 @@ +# RUN: llvm-profdata show %s --hot-func-list | FileCheck %s + +# CHECK: # Hot count threshold: 101 +# CHECK: hot_b +# CHECK: hot_a +# CHECK: hot_c + +:ir +hot_a +# Func Hash: +0x1234 +# Num Counters: +1 +# Counter Values: +101 + +hot_b +0x5678 +1 +202 + +hot_c +0x5678 +1 +101 + +cold_d +0xabcd +1 +1 + +cold_e +0xefff +1 +0 diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 5efabd5f2a7c6..96d135b9746ff 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -46,7 +46,6 @@ #include #include #include -#include using namespace llvm; using ProfCorrelatorKind = InstrProfCorrelator::ProfCorrelatorKind; @@ -2846,9 +2845,8 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { auto FS = vfs::getRealFileSystem(); auto ReaderOrErr = InstrProfReader::create(Filename, *FS); std::vector Cutoffs = std::move(DetailedSummaryCutoffs); - if (ShowDetailedSummary && Cutoffs.empty()) { + if (Cutoffs.empty() && (ShowDetailedSummary || ShowHotFuncList)) Cutoffs = ProfileSummaryBuilder::DefaultCutoffs; - } InstrProfSummaryBuilder Builder(std::move(Cutoffs)); if (Error E = ReaderOrErr.takeError()) exitWithError(std::move(E), Filename); @@ -2860,15 +2858,7 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { int NumVPKind = IPVK_Last - IPVK_First + 1; std::vector VPStats(NumVPKind); - auto MinCmp = [](const std::pair &v1, - const std::pair &v2) { - return v1.second > v2.second; - }; - - std::priority_queue, - std::vector>, - decltype(MinCmp)> - HottestFuncs(MinCmp); + std::vector> NameAndMaxCount; if (!TextFormat && OnlyListBelow) { OS << "The list of functions with the maximum counter less than " @@ -2942,15 +2932,8 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { if (OnlyListBelow) continue; - if (TopNFunctions) { - if (HottestFuncs.size() == 
TopNFunctions) { - if (HottestFuncs.top().second < FuncMax) { - HottestFuncs.pop(); - HottestFuncs.emplace(std::make_pair(std::string(Func.Name), FuncMax)); - } - } else - HottestFuncs.emplace(std::make_pair(std::string(Func.Name), FuncMax)); - } + if (TopNFunctions || ShowHotFuncList) + NameAndMaxCount.emplace_back(Func.Name, FuncMax); if (Show) { if (!ShownFunctions) @@ -3029,16 +3012,27 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { << "): " << PS->getNumFunctions() - BelowCutoffFunctions << "\n"; } + // Sort by MaxCount in decreasing order + llvm::stable_sort(NameAndMaxCount, [](const auto &L, const auto &R) { + return L.second > R.second; + }); if (TopNFunctions) { - std::vector> SortedHottestFuncs; - while (!HottestFuncs.empty()) { - SortedHottestFuncs.emplace_back(HottestFuncs.top()); - HottestFuncs.pop(); - } OS << "Top " << TopNFunctions << " functions with the largest internal block counts: \n"; - for (auto &hotfunc : llvm::reverse(SortedHottestFuncs)) - OS << " " << hotfunc.first << ", max count = " << hotfunc.second << "\n"; + auto TopFuncs = ArrayRef(NameAndMaxCount).take_front(TopNFunctions); + for (auto [Name, MaxCount] : TopFuncs) + OS << " " << Name << ", max count = " << MaxCount << "\n"; + } + + if (ShowHotFuncList) { + auto HotCountThreshold = + ProfileSummaryBuilder::getHotCountThreshold(PS->getDetailedSummary()); + OS << "# Hot count threshold: " << HotCountThreshold << "\n"; + for (auto [Name, MaxCount] : NameAndMaxCount) { + if (MaxCount < HotCountThreshold) + break; + OS << Name << "\n"; + } } if (ShownFunctions && ShowIndirectCallTargets) { From ffb453989b0e95d85b6cfa543b65fec23b65649d Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 18 Jul 2025 17:03:26 -0400 Subject: [PATCH 403/813] [NFC][AMDGPU] Align all gfx1250 VOP1 MC tests with downstream (#149567) This PR adds all VOP1 tests that haven't yet been upstreamed by copying the relevant test files directly from downstream. 
Afterward, the auto-generation script is run with the `--unique` option to deduplicate any redundant tests that may have been introduced during the downstream merge. Co-authored-by: Mekhanoshin, Stanislav Co-authored-by: Mekhanoshin, Stanislav --- llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s | 39 + .../gfx1250_asm_vop3_from_vop1-fake16.s | 3789 ++++++++++++++- .../MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s | 3913 ++++++++++++++- .../gfx1250_asm_vop3_from_vop1_dpp8-fake16.s | 4 - .../AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s | 20 +- .../Disassembler/AMDGPU/gfx1250_dasm_vop1.txt | 53 +- .../AMDGPU/gfx1250_dasm_vop3_from_vop1.txt | 4261 ++++++++++++++++- .../gfx1250_dasm_vop3_from_vop1_dpp16.txt | 360 +- .../gfx1250_dasm_vop3_from_vop1_dpp8.txt | 120 +- 9 files changed, 11938 insertions(+), 621 deletions(-) diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s index 3ddbc365224db..a313741ffe22d 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s @@ -664,6 +664,45 @@ v_cvt_f32_fp8_e32 v1, 3 v_cvt_f32_fp8_e32 v1, v3 // GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] +v_cvt_pk_f32_bf8_e32 v[2:3], s3 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[4:5], s5 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], s5 ; encoding: [0x05,0xde,0x08,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], 3 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[4:5], 3 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], 3 ; encoding: [0x83,0xde,0x08,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], v3 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[4:5], v3 +// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], v3 ; encoding: [0x03,0xdf,0x08,0x7e] + +v_cvt_pk_f32_bf8_e32 v[4:5], v127.h +// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], v127.h ; encoding: [0xff,0xdf,0x08,0x7e] + +v_cvt_pk_f32_bf8_e32 
v[4:5], v127.l +// GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], v127.l ; encoding: [0x7f,0xdf,0x08,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], s3 +// GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], 3 +// GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], v3 +// GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[4:5], v127.h +// GFX1250: v_cvt_pk_f32_fp8_e32 v[4:5], v127.h ; encoding: [0xff,0xdd,0x08,0x7e] + +v_cvt_pk_f32_fp8_e32 v[4:5], v127.l +// GFX1250: v_cvt_pk_f32_fp8_e32 v[4:5], v127.l ; encoding: [0x7f,0xdd,0x08,0x7e] + v_sat_pk4_i4_i8 v1, v2 // GFX1250: v_sat_pk4_i4_i8_e32 v1, v2 ; encoding: [0x02,0xe7,0x02,0x7e] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s index aa4e49d85f1ff..0931523bbf40c 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1-fake16.s @@ -1,167 +1,3659 @@ // NOTE: Assertions have been autogenerated by utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s -v_tanh_bf16_e64 v5, v1 -// GFX1250: v_tanh_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x01,0x00,0x00] +v_bfrev_b32_e64 v5, v1 +// GFX1250: v_bfrev_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00] + +v_bfrev_b32_e64 v5, v255 +// GFX1250: v_bfrev_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xb8,0xd5,0xff,0x01,0x00,0x00] + +v_bfrev_b32_e64 v5, s1 +// GFX1250: v_bfrev_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, s105 +// GFX1250: v_bfrev_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xb8,0xd5,0x69,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, vcc_lo +// GFX1250: v_bfrev_b32_e64 v5, vcc_lo ; encoding: 
[0x05,0x00,0xb8,0xd5,0x6a,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, vcc_hi +// GFX1250: v_bfrev_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb8,0xd5,0x6b,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, ttmp15 +// GFX1250: v_bfrev_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb8,0xd5,0x7b,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, m0 +// GFX1250: v_bfrev_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xb8,0xd5,0x7d,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, exec_lo +// GFX1250: v_bfrev_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb8,0xd5,0x7e,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, exec_hi +// GFX1250: v_bfrev_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb8,0xd5,0x7f,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, null +// GFX1250: v_bfrev_b32_e64 v5, null ; encoding: [0x05,0x00,0xb8,0xd5,0x7c,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, -1 +// GFX1250: v_bfrev_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xb8,0xd5,0xc1,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, 0.5 +// GFX1250: v_bfrev_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb8,0xd5,0xf0,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, src_scc +// GFX1250: v_bfrev_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb8,0xd5,0xfd,0x00,0x00,0x00] + +v_bfrev_b32_e64 v255, 0xaf123456 +// GFX1250: v_bfrev_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb8,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_ceil_f16_e64 v5, v1 +// GFX1250: v_ceil_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00] + +v_ceil_f16_e64 v5, v255 +// GFX1250: v_ceil_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00] + +v_ceil_f16_e64 v5, s1 +// GFX1250: v_ceil_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, s105 +// GFX1250: v_ceil_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, vcc_lo +// GFX1250: v_ceil_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, vcc_hi +// GFX1250: v_ceil_f16_e64 v5, vcc_hi ; encoding: 
[0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, ttmp15 +// GFX1250: v_ceil_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, m0 +// GFX1250: v_ceil_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, exec_lo +// GFX1250: v_ceil_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, exec_hi +// GFX1250: v_ceil_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, null +// GFX1250: v_ceil_f16_e64 v5, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, -1 +// GFX1250: v_ceil_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_ceil_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08] + +v_ceil_f16_e64 v5, src_scc mul:4 +// GFX1250: v_ceil_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10] + +v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_ceil_f32_e64 v5, v1 +// GFX1250: v_ceil_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa2,0xd5,0x01,0x01,0x00,0x00] + +v_ceil_f32_e64 v5, v255 +// GFX1250: v_ceil_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa2,0xd5,0xff,0x01,0x00,0x00] + +v_ceil_f32_e64 v5, s1 +// GFX1250: v_ceil_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa2,0xd5,0x01,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, s105 +// GFX1250: v_ceil_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa2,0xd5,0x69,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, vcc_lo +// GFX1250: v_ceil_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa2,0xd5,0x6a,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, vcc_hi +// GFX1250: v_ceil_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa2,0xd5,0x6b,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, ttmp15 +// GFX1250: v_ceil_f32_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xa2,0xd5,0x7b,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, m0 +// GFX1250: v_ceil_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa2,0xd5,0x7d,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, exec_lo +// GFX1250: v_ceil_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa2,0xd5,0x7e,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, exec_hi +// GFX1250: v_ceil_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa2,0xd5,0x7f,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, null +// GFX1250: v_ceil_f32_e64 v5, null ; encoding: [0x05,0x00,0xa2,0xd5,0x7c,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, -1 +// GFX1250: v_ceil_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa2,0xd5,0xc1,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_ceil_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa2,0xd5,0xf0,0x00,0x00,0x08] + +v_ceil_f32_e64 v5, src_scc mul:4 +// GFX1250: v_ceil_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa2,0xd5,0xfd,0x00,0x00,0x10] + +v_ceil_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_ceil_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa2,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_ceil_f64_e64 v[6:7], v[2:3] +// GFX1250: v_ceil_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x98,0xd5,0x02,0x01,0x00,0x00] + +v_ceil_f64_e64 v[6:7], v[254:255] +// GFX1250: v_ceil_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x98,0xd5,0xfe,0x01,0x00,0x00] + +v_ceil_f64_e64 v[6:7], s[2:3] +// GFX1250: v_ceil_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x98,0xd5,0x02,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], s[104:105] +// GFX1250: v_ceil_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x98,0xd5,0x68,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], vcc +// GFX1250: v_ceil_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x98,0xd5,0x6a,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_ceil_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x98,0xd5,0x7a,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], exec +// GFX1250: v_ceil_f64_e64 v[6:7], exec ; encoding: 
[0x06,0x00,0x98,0xd5,0x7e,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], null +// GFX1250: v_ceil_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x98,0xd5,0x7c,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], -1 +// GFX1250: v_ceil_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x98,0xd5,0xc1,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_ceil_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x98,0xd5,0xf0,0x00,0x00,0x08] + +v_ceil_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_ceil_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x98,0xd5,0xfd,0x00,0x00,0x30] + +v_ceil_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_ceil_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x98,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cls_i32_e64 v5, v1 +// GFX1250: v_cls_i32_e64 v5, v1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x01,0x00,0x00] + +v_cls_i32_e64 v5, v255 +// GFX1250: v_cls_i32_e64 v5, v255 ; encoding: [0x05,0x00,0xbb,0xd5,0xff,0x01,0x00,0x00] + +v_cls_i32_e64 v5, s1 +// GFX1250: v_cls_i32_e64 v5, s1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x00,0x00,0x00] + +v_cls_i32_e64 v5, s105 +// GFX1250: v_cls_i32_e64 v5, s105 ; encoding: [0x05,0x00,0xbb,0xd5,0x69,0x00,0x00,0x00] + +v_cls_i32_e64 v5, vcc_lo +// GFX1250: v_cls_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x6a,0x00,0x00,0x00] + +v_cls_i32_e64 v5, vcc_hi +// GFX1250: v_cls_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x6b,0x00,0x00,0x00] + +v_cls_i32_e64 v5, ttmp15 +// GFX1250: v_cls_i32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xbb,0xd5,0x7b,0x00,0x00,0x00] + +v_cls_i32_e64 v5, m0 +// GFX1250: v_cls_i32_e64 v5, m0 ; encoding: [0x05,0x00,0xbb,0xd5,0x7d,0x00,0x00,0x00] + +v_cls_i32_e64 v5, exec_lo +// GFX1250: v_cls_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x7e,0x00,0x00,0x00] + +v_cls_i32_e64 v5, exec_hi +// GFX1250: v_cls_i32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x7f,0x00,0x00,0x00] + +v_cls_i32_e64 v5, null +// GFX1250: v_cls_i32_e64 v5, null 
; encoding: [0x05,0x00,0xbb,0xd5,0x7c,0x00,0x00,0x00] + +v_cls_i32_e64 v5, -1 +// GFX1250: v_cls_i32_e64 v5, -1 ; encoding: [0x05,0x00,0xbb,0xd5,0xc1,0x00,0x00,0x00] + +v_cls_i32_e64 v5, 0.5 +// GFX1250: v_cls_i32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbb,0xd5,0xf0,0x00,0x00,0x00] + +v_cls_i32_e64 v5, src_scc +// GFX1250: v_cls_i32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbb,0xd5,0xfd,0x00,0x00,0x00] + +v_cls_i32_e64 v255, 0xaf123456 +// GFX1250: v_cls_i32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbb,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_clz_i32_u32_e64 v5, v1 +// GFX1250: v_clz_i32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x01,0x00,0x00] + +v_clz_i32_u32_e64 v5, v255 +// GFX1250: v_clz_i32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0xb9,0xd5,0xff,0x01,0x00,0x00] + +v_clz_i32_u32_e64 v5, s1 +// GFX1250: v_clz_i32_u32_e64 v5, s1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, s105 +// GFX1250: v_clz_i32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0xb9,0xd5,0x69,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, vcc_lo +// GFX1250: v_clz_i32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x6a,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, vcc_hi +// GFX1250: v_clz_i32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x6b,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, ttmp15 +// GFX1250: v_clz_i32_u32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb9,0xd5,0x7b,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, m0 +// GFX1250: v_clz_i32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0xb9,0xd5,0x7d,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, exec_lo +// GFX1250: v_clz_i32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x7e,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, exec_hi +// GFX1250: v_clz_i32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x7f,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, null +// GFX1250: v_clz_i32_u32_e64 v5, null ; encoding: [0x05,0x00,0xb9,0xd5,0x7c,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, -1 +// GFX1250: v_clz_i32_u32_e64 v5, -1 ; 
encoding: [0x05,0x00,0xb9,0xd5,0xc1,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, 0.5 +// GFX1250: v_clz_i32_u32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb9,0xd5,0xf0,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, src_scc +// GFX1250: v_clz_i32_u32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb9,0xd5,0xfd,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v255, 0xaf123456 +// GFX1250: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_cos_f16_e64 v5, v1 +// GFX1250: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f16_e64 v5, v255 +// GFX1250: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] + +v_cos_f16_e64 v5, s1 +// GFX1250: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] + +v_cos_f16_e64 v5, s105 +// GFX1250: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] + +v_cos_f16_e64 v5, vcc_lo +// GFX1250: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] + +v_cos_f16_e64 v5, vcc_hi +// GFX1250: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] + +v_cos_f16_e64 v5, ttmp15 +// GFX1250: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] + +v_cos_f16_e64 v5, m0 +// GFX1250: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] + +v_cos_f16_e64 v5, exec_lo +// GFX1250: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] + +v_cos_f16_e64 v5, exec_hi +// GFX1250: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] + +v_cos_f16_e64 v5, null +// GFX1250: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] + +v_cos_f16_e64 v5, -1 +// GFX1250: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] + +v_cos_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] + 
+v_cos_f16_e64 v5, src_scc mul:4 +// GFX1250: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] + +v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_cos_f32_e64 v5, v1 +// GFX1250: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f32_e64 v5, v255 +// GFX1250: v_cos_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb6,0xd5,0xff,0x01,0x00,0x00] + +v_cos_f32_e64 v5, s1 +// GFX1250: v_cos_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x00,0x00,0x00] + +v_cos_f32_e64 v5, s105 +// GFX1250: v_cos_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb6,0xd5,0x69,0x00,0x00,0x00] + +v_cos_f32_e64 v5, vcc_lo +// GFX1250: v_cos_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb6,0xd5,0x6a,0x00,0x00,0x00] + +v_cos_f32_e64 v5, vcc_hi +// GFX1250: v_cos_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb6,0xd5,0x6b,0x00,0x00,0x00] + +v_cos_f32_e64 v5, ttmp15 +// GFX1250: v_cos_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb6,0xd5,0x7b,0x00,0x00,0x00] + +v_cos_f32_e64 v5, m0 +// GFX1250: v_cos_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb6,0xd5,0x7d,0x00,0x00,0x00] + +v_cos_f32_e64 v5, exec_lo +// GFX1250: v_cos_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb6,0xd5,0x7e,0x00,0x00,0x00] + +v_cos_f32_e64 v5, exec_hi +// GFX1250: v_cos_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb6,0xd5,0x7f,0x00,0x00,0x00] + +v_cos_f32_e64 v5, null +// GFX1250: v_cos_f32_e64 v5, null ; encoding: [0x05,0x00,0xb6,0xd5,0x7c,0x00,0x00,0x00] + +v_cos_f32_e64 v5, -1 +// GFX1250: v_cos_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb6,0xd5,0xc1,0x00,0x00,0x00] + +v_cos_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_cos_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb6,0xd5,0xf0,0x00,0x00,0x08] + +v_cos_f32_e64 v5, src_scc mul:4 +// GFX1250: v_cos_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb6,0xd5,0xfd,0x00,0x00,0x10] + +v_cos_f32_e64 v255, 
-|0xaf123456| clamp div:2 +// GFX1250: v_cos_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb6,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_ctz_i32_b32_e64 v5, v1 +// GFX1250: v_ctz_i32_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x01,0x00,0x00] + +v_ctz_i32_b32_e64 v5, v255 +// GFX1250: v_ctz_i32_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xba,0xd5,0xff,0x01,0x00,0x00] + +v_ctz_i32_b32_e64 v5, s1 +// GFX1250: v_ctz_i32_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, s105 +// GFX1250: v_ctz_i32_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xba,0xd5,0x69,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, vcc_lo +// GFX1250: v_ctz_i32_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xba,0xd5,0x6a,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, vcc_hi +// GFX1250: v_ctz_i32_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xba,0xd5,0x6b,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, ttmp15 +// GFX1250: v_ctz_i32_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xba,0xd5,0x7b,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, m0 +// GFX1250: v_ctz_i32_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xba,0xd5,0x7d,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, exec_lo +// GFX1250: v_ctz_i32_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xba,0xd5,0x7e,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, exec_hi +// GFX1250: v_ctz_i32_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xba,0xd5,0x7f,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, null +// GFX1250: v_ctz_i32_b32_e64 v5, null ; encoding: [0x05,0x00,0xba,0xd5,0x7c,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, -1 +// GFX1250: v_ctz_i32_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xba,0xd5,0xc1,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, 0.5 +// GFX1250: v_ctz_i32_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xba,0xd5,0xf0,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, src_scc +// GFX1250: v_ctz_i32_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xba,0xd5,0xfd,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v255, 0xaf123456 +// GFX1250: v_ctz_i32_b32_e64 v255, 0xaf123456 ; 
encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_cvt_f32_bf8_e64 v1, s3 +// GFX1250: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 +// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 +// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 +// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 +// GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 +// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 +// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 +// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 +// GFX1250: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 +// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 +// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 +// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 +// GFX1250: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 +// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 
byte_sel:2 +// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 +// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 +// GFX1250: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 +// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 +// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 +// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 clamp ; encoding: [0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00] +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 clamp ; encoding: [0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8 v1, v3 byte_sel:1 clamp +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 clamp ; encoding: [0x01,0x90,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8 v1, v3 byte_sel:2 clamp +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 
op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[4:5], s3 +// GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], s3 ; encoding: [0x04,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[4:5], s3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], s3 op_sel:[1,0] ; encoding: [0x04,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[4:5], 3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], 3 op_sel:[1,0] ; encoding: 
[0x04,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[4:5], v3 +// GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], v3 ; encoding: [0x04,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[4:5], v3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], v3 op_sel:[1,0] ; encoding: [0x04,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[4:5], s3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], s3 ; encoding: [0x04,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[4:5], 3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], 3 ; encoding: [0x04,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[4:5], 3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], 3 op_sel:[1,0] ; encoding: [0x04,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[4:5], v3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], v3 ; encoding: [0x04,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[4:5], v3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], v3 op_sel:[1,0] ; encoding: [0x04,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f16_f32_e64 v5, v1 +// GFX1250: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f16_f32_e64 v5, v255 +// GFX1250: v_cvt_f16_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8a,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f16_f32_e64 v5, s1 +// GFX1250: v_cvt_f16_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, s105 +// GFX1250: v_cvt_f16_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8a,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_f16_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_f16_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_f16_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8a,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, m0 +// GFX1250: v_cvt_f16_f32_e64 v5, m0 ; 
encoding: [0x05,0x00,0x8a,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, exec_lo +// GFX1250: v_cvt_f16_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, exec_hi +// GFX1250: v_cvt_f16_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, null +// GFX1250: v_cvt_f16_f32_e64 v5, null ; encoding: [0x05,0x00,0x8a,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, -1 +// GFX1250: v_cvt_f16_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8a,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f16_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8a,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f16_f32_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f16_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8a,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f16_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_cvt_f16_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x8a,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_cvt_f16_i16_e64 v5, v1 +// GFX1250: v_cvt_f16_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f16_i16_e64 v5, v255 +// GFX1250: v_cvt_f16_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f16_i16_e64 v5, s1 +// GFX1250: v_cvt_f16_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, s105 +// GFX1250: v_cvt_f16_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xd1,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, vcc_lo +// GFX1250: v_cvt_f16_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, vcc_hi +// GFX1250: v_cvt_f16_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, ttmp15 +// GFX1250: v_cvt_f16_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd1,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, m0 +// GFX1250: v_cvt_f16_i16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xd1,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, exec_lo +// GFX1250: v_cvt_f16_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, exec_hi +// GFX1250: v_cvt_f16_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, null +// GFX1250: v_cvt_f16_i16_e64 v5, null ; encoding: [0x05,0x00,0xd1,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, -1 +// GFX1250: v_cvt_f16_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f16_i16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd1,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f16_i16_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f16_i16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd1,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f16_i16_e64 v255, 0xfe0b clamp div:2 +// GFX1250: v_cvt_f16_i16_e64 v255, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd1,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00] + +v_cvt_f16_u16_e64 v5, v1 +// GFX1250: v_cvt_f16_u16_e64 v5, v1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f16_u16_e64 v5, v255 +// GFX1250: v_cvt_f16_u16_e64 v5, v255 ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f16_u16_e64 v5, s1 +// GFX1250: v_cvt_f16_u16_e64 v5, s1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, s105 +// GFX1250: v_cvt_f16_u16_e64 v5, s105 ; encoding: [0x05,0x00,0xd0,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, vcc_lo +// GFX1250: v_cvt_f16_u16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, vcc_hi +// GFX1250: v_cvt_f16_u16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, ttmp15 +// GFX1250: v_cvt_f16_u16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd0,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, m0 +// GFX1250: v_cvt_f16_u16_e64 v5, m0 ; encoding: [0x05,0x00,0xd0,0xd5,0x7d,0x00,0x00,0x00] + 
+v_cvt_f16_u16_e64 v5, exec_lo +// GFX1250: v_cvt_f16_u16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, exec_hi +// GFX1250: v_cvt_f16_u16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, null +// GFX1250: v_cvt_f16_u16_e64 v5, null ; encoding: [0x05,0x00,0xd0,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, -1 +// GFX1250: v_cvt_f16_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f16_u16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd0,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f16_u16_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f16_u16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd0,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f16_u16_e64 v255, 0xfe0b clamp div:2 +// GFX1250: v_cvt_f16_u16_e64 v255, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd0,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00] + +v_cvt_f32_f16_e64 v5, v1 +// GFX1250: v_cvt_f32_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_f16_e64 v5, v255 +// GFX1250: v_cvt_f32_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x8b,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_f16_e64 v5, s1 +// GFX1250: v_cvt_f32_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, s105 +// GFX1250: v_cvt_f32_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x8b,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8b,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8b,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8b,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, m0 +// GFX1250: v_cvt_f32_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x8b,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, exec_lo +// GFX1250: 
v_cvt_f32_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8b,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, exec_hi +// GFX1250: v_cvt_f32_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8b,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, null +// GFX1250: v_cvt_f32_f16_e64 v5, null ; encoding: [0x05,0x00,0x8b,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, -1 +// GFX1250: v_cvt_f32_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x8b,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8b,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_f16_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8b,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_cvt_f32_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0x8b,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_cvt_f32_f64_e64 v5, v[2:3] +// GFX1250: v_cvt_f32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0x8f,0xd5,0x02,0x01,0x00,0x00] + +v_cvt_f32_f64_e64 v5, v[254:255] +// GFX1250: v_cvt_f32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x8f,0xd5,0xfe,0x01,0x00,0x00] + +v_cvt_f32_f64_e64 v5, s[2:3] +// GFX1250: v_cvt_f32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x8f,0xd5,0x02,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, s[104:105] +// GFX1250: v_cvt_f32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x8f,0xd5,0x68,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, vcc +// GFX1250: v_cvt_f32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x8f,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, ttmp[14:15] +// GFX1250: v_cvt_f32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x8f,0xd5,0x7a,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, exec +// GFX1250: v_cvt_f32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x8f,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, null +// GFX1250: v_cvt_f32_f64_e64 v5, null ; encoding: [0x05,0x00,0x8f,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, -1 +// GFX1250: 
v_cvt_f32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x8f,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_f64_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8f,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_f64_e64 v5, -|src_scc| mul:4 +// GFX1250: v_cvt_f32_f64_e64 v5, -|src_scc| mul:4 ; encoding: [0x05,0x01,0x8f,0xd5,0xfd,0x00,0x00,0x30] + +v_cvt_f32_f64_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_f64_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x8f,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_i32_e64 v5, v1 +// GFX1250: v_cvt_f32_i32_e64 v5, v1 ; encoding: [0x05,0x00,0x85,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_i32_e64 v5, v255 +// GFX1250: v_cvt_f32_i32_e64 v5, v255 ; encoding: [0x05,0x00,0x85,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_i32_e64 v5, s1 +// GFX1250: v_cvt_f32_i32_e64 v5, s1 ; encoding: [0x05,0x00,0x85,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, s105 +// GFX1250: v_cvt_f32_i32_e64 v5, s105 ; encoding: [0x05,0x00,0x85,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x85,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x85,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_i32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x85,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, m0 +// GFX1250: v_cvt_f32_i32_e64 v5, m0 ; encoding: [0x05,0x00,0x85,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, exec_lo +// GFX1250: v_cvt_f32_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x85,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, exec_hi +// GFX1250: v_cvt_f32_i32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x85,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, null +// GFX1250: v_cvt_f32_i32_e64 v5, null ; encoding: [0x05,0x00,0x85,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, -1 +// GFX1250: v_cvt_f32_i32_e64 v5, -1 ; encoding: 
[0x05,0x00,0x85,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_i32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x85,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_i32_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_i32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x85,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_i32_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_i32_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x85,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_u32_e64 v5, v1 +// GFX1250: v_cvt_f32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0x86,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_u32_e64 v5, v255 +// GFX1250: v_cvt_f32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0x86,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_u32_e64 v5, s1 +// GFX1250: v_cvt_f32_u32_e64 v5, s1 ; encoding: [0x05,0x00,0x86,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, s105 +// GFX1250: v_cvt_f32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0x86,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x86,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x86,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_u32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x86,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, m0 +// GFX1250: v_cvt_f32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0x86,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, exec_lo +// GFX1250: v_cvt_f32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x86,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, exec_hi +// GFX1250: v_cvt_f32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x86,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, null +// GFX1250: v_cvt_f32_u32_e64 v5, null ; encoding: [0x05,0x00,0x86,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, -1 +// GFX1250: v_cvt_f32_u32_e64 v5, -1 ; encoding: [0x05,0x00,0x86,0xd5,0xc1,0x00,0x00,0x00] + 
+v_cvt_f32_u32_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_u32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x86,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_u32_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_u32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x86,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_u32_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_u32_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x86,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte0_e64 v5, v1 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, v1 ; encoding: [0x05,0x00,0x91,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, v255 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, v255 ; encoding: [0x05,0x00,0x91,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, s1 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, s1 ; encoding: [0x05,0x00,0x91,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, s105 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, s105 ; encoding: [0x05,0x00,0x91,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_ubyte0_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x91,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_ubyte0_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x91,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x91,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, m0 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, m0 ; encoding: [0x05,0x00,0x91,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, exec_lo +// GFX1250: v_cvt_f32_ubyte0_e64 v5, exec_lo ; encoding: [0x05,0x00,0x91,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, exec_hi +// GFX1250: v_cvt_f32_ubyte0_e64 v5, exec_hi ; encoding: [0x05,0x00,0x91,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, null +// GFX1250: v_cvt_f32_ubyte0_e64 v5, null ; encoding: [0x05,0x00,0x91,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, -1 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, -1 ; encoding: 
[0x05,0x00,0x91,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x91,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_ubyte0_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x91,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_ubyte0_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_ubyte0_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x91,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte1_e64 v5, v1 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, v1 ; encoding: [0x05,0x00,0x92,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, v255 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, v255 ; encoding: [0x05,0x00,0x92,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, s1 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, s1 ; encoding: [0x05,0x00,0x92,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, s105 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, s105 ; encoding: [0x05,0x00,0x92,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_ubyte1_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x92,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_ubyte1_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x92,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x92,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, m0 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, m0 ; encoding: [0x05,0x00,0x92,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, exec_lo +// GFX1250: v_cvt_f32_ubyte1_e64 v5, exec_lo ; encoding: [0x05,0x00,0x92,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, exec_hi +// GFX1250: v_cvt_f32_ubyte1_e64 v5, exec_hi ; encoding: [0x05,0x00,0x92,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, null +// GFX1250: v_cvt_f32_ubyte1_e64 v5, null ; encoding: [0x05,0x00,0x92,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, -1 +// 
GFX1250: v_cvt_f32_ubyte1_e64 v5, -1 ; encoding: [0x05,0x00,0x92,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x92,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_ubyte1_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x92,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_ubyte1_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_ubyte1_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x92,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte2_e64 v5, v1 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, v1 ; encoding: [0x05,0x00,0x93,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, v255 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, v255 ; encoding: [0x05,0x00,0x93,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, s1 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, s1 ; encoding: [0x05,0x00,0x93,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, s105 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, s105 ; encoding: [0x05,0x00,0x93,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_ubyte2_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x93,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_ubyte2_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x93,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x93,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, m0 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, m0 ; encoding: [0x05,0x00,0x93,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, exec_lo +// GFX1250: v_cvt_f32_ubyte2_e64 v5, exec_lo ; encoding: [0x05,0x00,0x93,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, exec_hi +// GFX1250: v_cvt_f32_ubyte2_e64 v5, exec_hi ; encoding: [0x05,0x00,0x93,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, null +// GFX1250: v_cvt_f32_ubyte2_e64 v5, null ; encoding: 
[0x05,0x00,0x93,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, -1 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, -1 ; encoding: [0x05,0x00,0x93,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x93,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_ubyte2_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x93,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_ubyte2_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_ubyte2_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x93,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte3_e64 v5, v1 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, v1 ; encoding: [0x05,0x00,0x94,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, v255 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, v255 ; encoding: [0x05,0x00,0x94,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, s1 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, s1 ; encoding: [0x05,0x00,0x94,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, s105 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, s105 ; encoding: [0x05,0x00,0x94,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_ubyte3_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x94,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_ubyte3_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x94,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x94,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, m0 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, m0 ; encoding: [0x05,0x00,0x94,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, exec_lo +// GFX1250: v_cvt_f32_ubyte3_e64 v5, exec_lo ; encoding: [0x05,0x00,0x94,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, exec_hi +// GFX1250: v_cvt_f32_ubyte3_e64 v5, exec_hi ; encoding: [0x05,0x00,0x94,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, null +// 
GFX1250: v_cvt_f32_ubyte3_e64 v5, null ; encoding: [0x05,0x00,0x94,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, -1 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, -1 ; encoding: [0x05,0x00,0x94,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x94,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_ubyte3_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x94,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_ubyte3_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_ubyte3_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x94,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f64_f32_e64 v[6:7], v1 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], v1 ; encoding: [0x06,0x00,0x90,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], v255 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], v255 ; encoding: [0x06,0x00,0x90,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], s1 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], s1 ; encoding: [0x06,0x00,0x90,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], s105 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], s105 ; encoding: [0x06,0x00,0x90,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], vcc_lo +// GFX1250: v_cvt_f64_f32_e64 v[6:7], vcc_lo ; encoding: [0x06,0x00,0x90,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], vcc_hi +// GFX1250: v_cvt_f64_f32_e64 v[6:7], vcc_hi ; encoding: [0x06,0x00,0x90,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], ttmp15 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], ttmp15 ; encoding: [0x06,0x00,0x90,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], m0 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], m0 ; encoding: [0x06,0x00,0x90,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], exec_lo +// GFX1250: v_cvt_f64_f32_e64 v[6:7], exec_lo ; encoding: [0x06,0x00,0x90,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], exec_hi +// GFX1250: v_cvt_f64_f32_e64 v[6:7], exec_hi ; encoding: 
[0x06,0x00,0x90,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], null +// GFX1250: v_cvt_f64_f32_e64 v[6:7], null ; encoding: [0x06,0x00,0x90,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], -1 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x90,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x90,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f64_f32_e64 v[6:7], src_scc mul:4 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], src_scc mul:4 ; encoding: [0x06,0x00,0x90,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f64_f32_e64 v[254:255], -|0xaf123456| clamp div:2 +// GFX1250: v_cvt_f64_f32_e64 v[254:255], -|0xaf123456| clamp div:2 ; encoding: [0xfe,0x81,0x90,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_cvt_f64_i32_e64 v[6:7], v1 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], v1 ; encoding: [0x06,0x00,0x84,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], v255 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], v255 ; encoding: [0x06,0x00,0x84,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], s1 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], s1 ; encoding: [0x06,0x00,0x84,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], s105 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], s105 ; encoding: [0x06,0x00,0x84,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], vcc_lo +// GFX1250: v_cvt_f64_i32_e64 v[6:7], vcc_lo ; encoding: [0x06,0x00,0x84,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], vcc_hi +// GFX1250: v_cvt_f64_i32_e64 v[6:7], vcc_hi ; encoding: [0x06,0x00,0x84,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], ttmp15 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], ttmp15 ; encoding: [0x06,0x00,0x84,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], m0 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], m0 ; encoding: [0x06,0x00,0x84,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], exec_lo +// GFX1250: v_cvt_f64_i32_e64 v[6:7], exec_lo ; encoding: [0x06,0x00,0x84,0xd5,0x7e,0x00,0x00,0x00] + 
+v_cvt_f64_i32_e64 v[6:7], exec_hi +// GFX1250: v_cvt_f64_i32_e64 v[6:7], exec_hi ; encoding: [0x06,0x00,0x84,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], null +// GFX1250: v_cvt_f64_i32_e64 v[6:7], null ; encoding: [0x06,0x00,0x84,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], -1 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x84,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x84,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f64_i32_e64 v[6:7], src_scc mul:4 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], src_scc mul:4 ; encoding: [0x06,0x00,0x84,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f64_i32_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f64_i32_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x84,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f64_u32_e64 v[6:7], v1 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], v1 ; encoding: [0x06,0x00,0x96,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], v255 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], v255 ; encoding: [0x06,0x00,0x96,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], s1 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], s1 ; encoding: [0x06,0x00,0x96,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], s105 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], s105 ; encoding: [0x06,0x00,0x96,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], vcc_lo +// GFX1250: v_cvt_f64_u32_e64 v[6:7], vcc_lo ; encoding: [0x06,0x00,0x96,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], vcc_hi +// GFX1250: v_cvt_f64_u32_e64 v[6:7], vcc_hi ; encoding: [0x06,0x00,0x96,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], ttmp15 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], ttmp15 ; encoding: [0x06,0x00,0x96,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], m0 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], m0 ; encoding: [0x06,0x00,0x96,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], exec_lo +// GFX1250: 
v_cvt_f64_u32_e64 v[6:7], exec_lo ; encoding: [0x06,0x00,0x96,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], exec_hi +// GFX1250: v_cvt_f64_u32_e64 v[6:7], exec_hi ; encoding: [0x06,0x00,0x96,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], null +// GFX1250: v_cvt_f64_u32_e64 v[6:7], null ; encoding: [0x06,0x00,0x96,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], -1 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x96,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x96,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f64_u32_e64 v[6:7], src_scc mul:4 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], src_scc mul:4 ; encoding: [0x06,0x00,0x96,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f64_u32_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f64_u32_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x96,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_floor_i32_f32_e64 v5, v1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, v255 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8d,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, s1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, s105 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8d,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_floor_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_floor_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8d,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, m0 +// GFX1250: 
v_cvt_floor_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8d,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_floor_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_floor_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, null +// GFX1250: v_cvt_floor_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8d,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, -1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8d,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8d,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, src_scc +// GFX1250: v_cvt_floor_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8d,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v255, -|0xaf123456| +// GFX1250: v_cvt_floor_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8d,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_flr_i32_f32_e64 v5, v1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, v255 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8d,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, s1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, s105 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8d,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_floor_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_floor_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0x8d,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, m0 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8d,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_floor_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_floor_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, null +// GFX1250: v_cvt_floor_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8d,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, -1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8d,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8d,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, src_scc +// GFX1250: v_cvt_floor_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8d,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v255, -|0xaf123456| +// GFX1250: v_cvt_floor_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8d,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_i16_f16_e64 v5, v1 +// GFX1250: v_cvt_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_i16_f16_e64 v5, v255 +// GFX1250: v_cvt_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd3,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_i16_f16_e64 v5, s1 +// GFX1250: v_cvt_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, s105 +// GFX1250: v_cvt_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd3,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, vcc_lo +// GFX1250: v_cvt_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, vcc_hi +// GFX1250: v_cvt_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, ttmp15 +// GFX1250: v_cvt_i16_f16_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xd3,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, m0 +// GFX1250: v_cvt_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd3,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, exec_lo +// GFX1250: v_cvt_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, exec_hi +// GFX1250: v_cvt_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, null +// GFX1250: v_cvt_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xd3,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, -1 +// GFX1250: v_cvt_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd3,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, 0.5 +// GFX1250: v_cvt_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xd3,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, src_scc +// GFX1250: v_cvt_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xd3,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v255, -|0xfe0b| clamp +// GFX1250: v_cvt_i16_f16_e64 v255, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +v_cvt_i32_f32_e64 v5, v1 +// GFX1250: v_cvt_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x88,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_i32_f32_e64 v5, v255 +// GFX1250: v_cvt_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x88,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_i32_f32_e64 v5, s1 +// GFX1250: v_cvt_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x88,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, s105 +// GFX1250: v_cvt_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x88,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x88,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x88,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x88,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, m0 +// 
GFX1250: v_cvt_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x88,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x88,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x88,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, null +// GFX1250: v_cvt_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x88,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, -1 +// GFX1250: v_cvt_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x88,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x88,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, src_scc +// GFX1250: v_cvt_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x88,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v255, -|0xaf123456| clamp +// GFX1250: v_cvt_i32_f32_e64 v255, -|0xaf123456| clamp ; encoding: [0xff,0x81,0x88,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_i32_f64_e64 v5, v[2:3] +// GFX1250: v_cvt_i32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0x83,0xd5,0x02,0x01,0x00,0x00] + +v_cvt_i32_f64_e64 v5, v[254:255] +// GFX1250: v_cvt_i32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x83,0xd5,0xfe,0x01,0x00,0x00] + +v_cvt_i32_f64_e64 v5, s[2:3] +// GFX1250: v_cvt_i32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x83,0xd5,0x02,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, s[104:105] +// GFX1250: v_cvt_i32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x83,0xd5,0x68,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, vcc +// GFX1250: v_cvt_i32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x83,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, ttmp[14:15] +// GFX1250: v_cvt_i32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x83,0xd5,0x7a,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, exec +// GFX1250: v_cvt_i32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x83,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, null +// GFX1250: v_cvt_i32_f64_e64 v5, 
null ; encoding: [0x05,0x00,0x83,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, -1 +// GFX1250: v_cvt_i32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x83,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, 0.5 +// GFX1250: v_cvt_i32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0x83,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, -|src_scc| +// GFX1250: v_cvt_i32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0x83,0xd5,0xfd,0x00,0x00,0x20] + +v_cvt_i32_f64_e64 v255, 0xaf123456 clamp +// GFX1250: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_cvt_i32_i16_e64 v5, v1 +// GFX1250: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_i32_i16_e64 v5, v255 +// GFX1250: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_i32_i16_e64 v5, s1 +// GFX1250: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, s105 +// GFX1250: v_cvt_i32_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xea,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, vcc_lo +// GFX1250: v_cvt_i32_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xea,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, vcc_hi +// GFX1250: v_cvt_i32_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xea,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, ttmp15 +// GFX1250: v_cvt_i32_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xea,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, m0 +// GFX1250: v_cvt_i32_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xea,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, exec_lo +// GFX1250: v_cvt_i32_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xea,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, exec_hi +// GFX1250: v_cvt_i32_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xea,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, null +// GFX1250: v_cvt_i32_i16_e64 v5, null ; encoding: [0x05,0x00,0xea,0xd5,0x7c,0x00,0x00,0x00] + 
+v_cvt_i32_i16_e64 v5, -1 +// GFX1250: v_cvt_i32_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xea,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, 0.5 +// GFX1250: v_cvt_i32_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xea,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, src_scc +// GFX1250: v_cvt_i32_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xea,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v255, 0xfe0b +// GFX1250: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, v1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, v255 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8c,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, s1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, s105 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8c,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8c,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, m0 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8c,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, null +// GFX1250: 
v_cvt_nearest_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8c,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, -1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8c,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8c,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, src_scc +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8c,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| +// GFX1250: v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8c,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_norm_i16_f16_e64 v5, v1 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, v255 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe3,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, s1 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, s105 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe3,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, vcc_lo +// GFX1250: v_cvt_norm_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, vcc_hi +// GFX1250: v_cvt_norm_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, ttmp15 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe3,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, m0 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe3,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, exec_lo +// GFX1250: v_cvt_norm_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, exec_hi +// GFX1250: v_cvt_norm_i16_f16_e64 v5, exec_hi ; 
encoding: [0x05,0x00,0xe3,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, null +// GFX1250: v_cvt_norm_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xe3,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, -1 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe3,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, 0.5 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe3,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, src_scc +// GFX1250: v_cvt_norm_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe3,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v255, -|0xfe0b| +// GFX1250: v_cvt_norm_i16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xe3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, v1 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, v255 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe4,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, s1 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, s105 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe4,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, vcc_lo +// GFX1250: v_cvt_norm_u16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, vcc_hi +// GFX1250: v_cvt_norm_u16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, ttmp15 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe4,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, m0 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe4,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, exec_lo +// GFX1250: v_cvt_norm_u16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, 
exec_hi +// GFX1250: v_cvt_norm_u16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, null +// GFX1250: v_cvt_norm_u16_f16_e64 v5, null ; encoding: [0x05,0x00,0xe4,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, -1 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe4,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, 0.5 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe4,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, src_scc +// GFX1250: v_cvt_norm_u16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe4,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v255, -|0xfe0b| +// GFX1250: v_cvt_norm_u16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xe4,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, v1 +// GFX1250: v_cvt_off_f32_i4_e64 v5, v1 ; encoding: [0x05,0x00,0x8e,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, v255 +// GFX1250: v_cvt_off_f32_i4_e64 v5, v255 ; encoding: [0x05,0x00,0x8e,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, s1 +// GFX1250: v_cvt_off_f32_i4_e64 v5, s1 ; encoding: [0x05,0x00,0x8e,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, s105 +// GFX1250: v_cvt_off_f32_i4_e64 v5, s105 ; encoding: [0x05,0x00,0x8e,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, vcc_lo +// GFX1250: v_cvt_off_f32_i4_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8e,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, vcc_hi +// GFX1250: v_cvt_off_f32_i4_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8e,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, ttmp15 +// GFX1250: v_cvt_off_f32_i4_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8e,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, m0 +// GFX1250: v_cvt_off_f32_i4_e64 v5, m0 ; encoding: [0x05,0x00,0x8e,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, exec_lo +// GFX1250: v_cvt_off_f32_i4_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8e,0xd5,0x7e,0x00,0x00,0x00] + 
+v_cvt_off_f32_i4_e64 v5, exec_hi +// GFX1250: v_cvt_off_f32_i4_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8e,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, null +// GFX1250: v_cvt_off_f32_i4_e64 v5, null ; encoding: [0x05,0x00,0x8e,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, -1 +// GFX1250: v_cvt_off_f32_i4_e64 v5, -1 ; encoding: [0x05,0x00,0x8e,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_off_f32_i4_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8e,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_off_f32_i4_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_off_f32_i4_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8e,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_off_f32_i4_e64 v255, 0x4f clamp div:2 +// GFX1250: v_cvt_off_f32_i4_e64 v255, 0x4f clamp div:2 ; encoding: [0xff,0x80,0x8e,0xd5,0xff,0x00,0x00,0x18,0x4f,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, v1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, v255 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8c,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, s1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, s105 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8c,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8c,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, m0 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8c,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, exec_lo +// GFX1250: 
v_cvt_nearest_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, null +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8c,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, -1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8c,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8c,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, src_scc +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8c,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v255, -|0xaf123456| +// GFX1250: v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8c,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_u16_f16_e64 v5, v1 +// GFX1250: v_cvt_u16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_u16_f16_e64 v5, v255 +// GFX1250: v_cvt_u16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd2,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_u16_f16_e64 v5, s1 +// GFX1250: v_cvt_u16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, s105 +// GFX1250: v_cvt_u16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd2,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, vcc_lo +// GFX1250: v_cvt_u16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, vcc_hi +// GFX1250: v_cvt_u16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, ttmp15 +// GFX1250: v_cvt_u16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd2,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, m0 +// GFX1250: v_cvt_u16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd2,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, exec_lo +// GFX1250: 
v_cvt_u16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, exec_hi +// GFX1250: v_cvt_u16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, null +// GFX1250: v_cvt_u16_f16_e64 v5, null ; encoding: [0x05,0x00,0xd2,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, -1 +// GFX1250: v_cvt_u16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd2,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, 0.5 +// GFX1250: v_cvt_u16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xd2,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, src_scc +// GFX1250: v_cvt_u16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xd2,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v255, -|0xfe0b| clamp +// GFX1250: v_cvt_u16_f16_e64 v255, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd2,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +v_cvt_u32_f32_e64 v5, v1 +// GFX1250: v_cvt_u32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x87,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_u32_f32_e64 v5, v255 +// GFX1250: v_cvt_u32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x87,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_u32_f32_e64 v5, s1 +// GFX1250: v_cvt_u32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x87,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, s105 +// GFX1250: v_cvt_u32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x87,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_u32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x87,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_u32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x87,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_u32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x87,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, m0 +// GFX1250: v_cvt_u32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x87,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_u32_f32_e64 v5, exec_lo ; encoding: 
[0x05,0x00,0x87,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_u32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x87,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, null +// GFX1250: v_cvt_u32_f32_e64 v5, null ; encoding: [0x05,0x00,0x87,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, -1 +// GFX1250: v_cvt_u32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x87,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_u32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x87,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, src_scc +// GFX1250: v_cvt_u32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x87,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v255, -|0xaf123456| clamp +// GFX1250: v_cvt_u32_f32_e64 v255, -|0xaf123456| clamp ; encoding: [0xff,0x81,0x87,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_u32_f64_e64 v5, v[2:3] +// GFX1250: v_cvt_u32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0x95,0xd5,0x02,0x01,0x00,0x00] + +v_cvt_u32_f64_e64 v5, v[254:255] +// GFX1250: v_cvt_u32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x95,0xd5,0xfe,0x01,0x00,0x00] + +v_cvt_u32_f64_e64 v5, s[2:3] +// GFX1250: v_cvt_u32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x95,0xd5,0x02,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, s[104:105] +// GFX1250: v_cvt_u32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x95,0xd5,0x68,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, vcc +// GFX1250: v_cvt_u32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x95,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, ttmp[14:15] +// GFX1250: v_cvt_u32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x95,0xd5,0x7a,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, exec +// GFX1250: v_cvt_u32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x95,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, null +// GFX1250: v_cvt_u32_f64_e64 v5, null ; encoding: [0x05,0x00,0x95,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, -1 +// GFX1250: v_cvt_u32_f64_e64 v5, -1 ; encoding: 
[0x05,0x00,0x95,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, 0.5 +// GFX1250: v_cvt_u32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0x95,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, -|src_scc| +// GFX1250: v_cvt_u32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0x95,0xd5,0xfd,0x00,0x00,0x20] + +v_cvt_u32_f64_e64 v255, 0xaf123456 clamp +// GFX1250: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_cvt_u32_u16_e64 v5, v1 +// GFX1250: v_cvt_u32_u16_e64 v5, v1 ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_u32_u16_e64 v5, v255 +// GFX1250: v_cvt_u32_u16_e64 v5, v255 ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_u32_u16_e64 v5, s1 +// GFX1250: v_cvt_u32_u16_e64 v5, s1 ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, s105 +// GFX1250: v_cvt_u32_u16_e64 v5, s105 ; encoding: [0x05,0x00,0xeb,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, vcc_lo +// GFX1250: v_cvt_u32_u16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xeb,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, vcc_hi +// GFX1250: v_cvt_u32_u16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xeb,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, ttmp15 +// GFX1250: v_cvt_u32_u16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xeb,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, m0 +// GFX1250: v_cvt_u32_u16_e64 v5, m0 ; encoding: [0x05,0x00,0xeb,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, exec_lo +// GFX1250: v_cvt_u32_u16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xeb,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, exec_hi +// GFX1250: v_cvt_u32_u16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xeb,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, null +// GFX1250: v_cvt_u32_u16_e64 v5, null ; encoding: [0x05,0x00,0xeb,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, -1 +// GFX1250: v_cvt_u32_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xeb,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, 0.5 
+// GFX1250: v_cvt_u32_u16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xeb,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, src_scc +// GFX1250: v_cvt_u32_u16_e64 v5, src_scc ; encoding: [0x05,0x00,0xeb,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v255, 0xfe0b +// GFX1250: v_cvt_u32_u16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_exp_f16_e64 v5, v1 +// GFX1250: v_exp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] + +v_exp_f16_e64 v5, v255 +// GFX1250: v_exp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] + +v_exp_f16_e64 v5, s1 +// GFX1250: v_exp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] + +v_exp_f16_e64 v5, s105 +// GFX1250: v_exp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] + +v_exp_f16_e64 v5, vcc_lo +// GFX1250: v_exp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] + +v_exp_f16_e64 v5, vcc_hi +// GFX1250: v_exp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] + +v_exp_f16_e64 v5, ttmp15 +// GFX1250: v_exp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] + +v_exp_f16_e64 v5, m0 +// GFX1250: v_exp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] + +v_exp_f16_e64 v5, exec_lo +// GFX1250: v_exp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] + +v_exp_f16_e64 v5, exec_hi +// GFX1250: v_exp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] + +v_exp_f16_e64 v5, null +// GFX1250: v_exp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] + +v_exp_f16_e64 v5, -1 +// GFX1250: v_exp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] + +v_exp_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_exp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] + +v_exp_f16_e64 v5, src_scc mul:4 +// GFX1250: v_exp_f16_e64 v5, src_scc mul:4 ; encoding: 
[0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] + +v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_exp_f32_e64 v5, v1 +// GFX1250: v_exp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa5,0xd5,0x01,0x01,0x00,0x00] + +v_exp_f32_e64 v5, v255 +// GFX1250: v_exp_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa5,0xd5,0xff,0x01,0x00,0x00] + +v_exp_f32_e64 v5, s1 +// GFX1250: v_exp_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa5,0xd5,0x01,0x00,0x00,0x00] + +v_exp_f32_e64 v5, s105 +// GFX1250: v_exp_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa5,0xd5,0x69,0x00,0x00,0x00] + +v_exp_f32_e64 v5, vcc_lo +// GFX1250: v_exp_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa5,0xd5,0x6a,0x00,0x00,0x00] + +v_exp_f32_e64 v5, vcc_hi +// GFX1250: v_exp_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa5,0xd5,0x6b,0x00,0x00,0x00] + +v_exp_f32_e64 v5, ttmp15 +// GFX1250: v_exp_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa5,0xd5,0x7b,0x00,0x00,0x00] + +v_exp_f32_e64 v5, m0 +// GFX1250: v_exp_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa5,0xd5,0x7d,0x00,0x00,0x00] + +v_exp_f32_e64 v5, exec_lo +// GFX1250: v_exp_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa5,0xd5,0x7e,0x00,0x00,0x00] + +v_exp_f32_e64 v5, exec_hi +// GFX1250: v_exp_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa5,0xd5,0x7f,0x00,0x00,0x00] + +v_exp_f32_e64 v5, null +// GFX1250: v_exp_f32_e64 v5, null ; encoding: [0x05,0x00,0xa5,0xd5,0x7c,0x00,0x00,0x00] + +v_exp_f32_e64 v5, -1 +// GFX1250: v_exp_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa5,0xd5,0xc1,0x00,0x00,0x00] + +v_exp_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_exp_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa5,0xd5,0xf0,0x00,0x00,0x08] + +v_exp_f32_e64 v5, src_scc mul:4 +// GFX1250: v_exp_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa5,0xd5,0xfd,0x00,0x00,0x10] + +v_exp_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_exp_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: 
[0xff,0x81,0xa5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_ffbh_i32_e64 v5, v1 +// GFX1250: v_cls_i32_e64 v5, v1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x01,0x00,0x00] + +v_ffbh_i32_e64 v5, v255 +// GFX1250: v_cls_i32_e64 v5, v255 ; encoding: [0x05,0x00,0xbb,0xd5,0xff,0x01,0x00,0x00] + +v_ffbh_i32_e64 v5, s1 +// GFX1250: v_cls_i32_e64 v5, s1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, s105 +// GFX1250: v_cls_i32_e64 v5, s105 ; encoding: [0x05,0x00,0xbb,0xd5,0x69,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, vcc_lo +// GFX1250: v_cls_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x6a,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, vcc_hi +// GFX1250: v_cls_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x6b,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, ttmp15 +// GFX1250: v_cls_i32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xbb,0xd5,0x7b,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, m0 +// GFX1250: v_cls_i32_e64 v5, m0 ; encoding: [0x05,0x00,0xbb,0xd5,0x7d,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, exec_lo +// GFX1250: v_cls_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x7e,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, exec_hi +// GFX1250: v_cls_i32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x7f,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, null +// GFX1250: v_cls_i32_e64 v5, null ; encoding: [0x05,0x00,0xbb,0xd5,0x7c,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, -1 +// GFX1250: v_cls_i32_e64 v5, -1 ; encoding: [0x05,0x00,0xbb,0xd5,0xc1,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, 0.5 +// GFX1250: v_cls_i32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbb,0xd5,0xf0,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, src_scc +// GFX1250: v_cls_i32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbb,0xd5,0xfd,0x00,0x00,0x00] + +v_ffbh_i32_e64 v255, 0xaf123456 +// GFX1250: v_cls_i32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbb,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_ffbh_u32_e64 v5, v1 +// GFX1250: v_clz_i32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x01,0x00,0x00] + +v_ffbh_u32_e64 
v5, v255 +// GFX1250: v_clz_i32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0xb9,0xd5,0xff,0x01,0x00,0x00] + +v_ffbh_u32_e64 v5, s1 +// GFX1250: v_clz_i32_u32_e64 v5, s1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, s105 +// GFX1250: v_clz_i32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0xb9,0xd5,0x69,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, vcc_lo +// GFX1250: v_clz_i32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x6a,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, vcc_hi +// GFX1250: v_clz_i32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x6b,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, ttmp15 +// GFX1250: v_clz_i32_u32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb9,0xd5,0x7b,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, m0 +// GFX1250: v_clz_i32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0xb9,0xd5,0x7d,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, exec_lo +// GFX1250: v_clz_i32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x7e,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, exec_hi +// GFX1250: v_clz_i32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x7f,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, null +// GFX1250: v_clz_i32_u32_e64 v5, null ; encoding: [0x05,0x00,0xb9,0xd5,0x7c,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, -1 +// GFX1250: v_clz_i32_u32_e64 v5, -1 ; encoding: [0x05,0x00,0xb9,0xd5,0xc1,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, 0.5 +// GFX1250: v_clz_i32_u32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb9,0xd5,0xf0,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, src_scc +// GFX1250: v_clz_i32_u32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb9,0xd5,0xfd,0x00,0x00,0x00] + +v_ffbh_u32_e64 v255, 0xaf123456 +// GFX1250: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_ffbl_b32_e64 v5, v1 +// GFX1250: v_ctz_i32_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x01,0x00,0x00] + +v_ffbl_b32_e64 v5, v255 +// GFX1250: v_ctz_i32_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xba,0xd5,0xff,0x01,0x00,0x00] + +v_ffbl_b32_e64 v5, s1 +// 
GFX1250: v_ctz_i32_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, s105 +// GFX1250: v_ctz_i32_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xba,0xd5,0x69,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, vcc_lo +// GFX1250: v_ctz_i32_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xba,0xd5,0x6a,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, vcc_hi +// GFX1250: v_ctz_i32_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xba,0xd5,0x6b,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, ttmp15 +// GFX1250: v_ctz_i32_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xba,0xd5,0x7b,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, m0 +// GFX1250: v_ctz_i32_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xba,0xd5,0x7d,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, exec_lo +// GFX1250: v_ctz_i32_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xba,0xd5,0x7e,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, exec_hi +// GFX1250: v_ctz_i32_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xba,0xd5,0x7f,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, null +// GFX1250: v_ctz_i32_b32_e64 v5, null ; encoding: [0x05,0x00,0xba,0xd5,0x7c,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, -1 +// GFX1250: v_ctz_i32_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xba,0xd5,0xc1,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, 0.5 +// GFX1250: v_ctz_i32_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xba,0xd5,0xf0,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, src_scc +// GFX1250: v_ctz_i32_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xba,0xd5,0xfd,0x00,0x00,0x00] + +v_ffbl_b32_e64 v255, 0xaf123456 +// GFX1250: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_floor_f16_e64 v5, v1 +// GFX1250: v_floor_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] + +v_floor_f16_e64 v5, v255 +// GFX1250: v_floor_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] + +v_floor_f16_e64 v5, s1 +// GFX1250: v_floor_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] + +v_floor_f16_e64 v5, s105 +// GFX1250: 
v_floor_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] + +v_floor_f16_e64 v5, vcc_lo +// GFX1250: v_floor_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] + +v_floor_f16_e64 v5, vcc_hi +// GFX1250: v_floor_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] + +v_floor_f16_e64 v5, ttmp15 +// GFX1250: v_floor_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] + +v_floor_f16_e64 v5, m0 +// GFX1250: v_floor_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] + +v_floor_f16_e64 v5, exec_lo +// GFX1250: v_floor_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] + +v_floor_f16_e64 v5, exec_hi +// GFX1250: v_floor_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] + +v_floor_f16_e64 v5, null +// GFX1250: v_floor_f16_e64 v5, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] + +v_floor_f16_e64 v5, -1 +// GFX1250: v_floor_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] + +v_floor_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_floor_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] + +v_floor_f16_e64 v5, src_scc mul:4 +// GFX1250: v_floor_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] + +v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_floor_f32_e64 v5, v1 +// GFX1250: v_floor_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa4,0xd5,0x01,0x01,0x00,0x00] + +v_floor_f32_e64 v5, v255 +// GFX1250: v_floor_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa4,0xd5,0xff,0x01,0x00,0x00] + +v_floor_f32_e64 v5, s1 +// GFX1250: v_floor_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa4,0xd5,0x01,0x00,0x00,0x00] + +v_floor_f32_e64 v5, s105 +// GFX1250: v_floor_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa4,0xd5,0x69,0x00,0x00,0x00] + +v_floor_f32_e64 v5, 
vcc_lo +// GFX1250: v_floor_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa4,0xd5,0x6a,0x00,0x00,0x00] + +v_floor_f32_e64 v5, vcc_hi +// GFX1250: v_floor_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa4,0xd5,0x6b,0x00,0x00,0x00] + +v_floor_f32_e64 v5, ttmp15 +// GFX1250: v_floor_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa4,0xd5,0x7b,0x00,0x00,0x00] + +v_floor_f32_e64 v5, m0 +// GFX1250: v_floor_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa4,0xd5,0x7d,0x00,0x00,0x00] + +v_floor_f32_e64 v5, exec_lo +// GFX1250: v_floor_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa4,0xd5,0x7e,0x00,0x00,0x00] + +v_floor_f32_e64 v5, exec_hi +// GFX1250: v_floor_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa4,0xd5,0x7f,0x00,0x00,0x00] + +v_floor_f32_e64 v5, null +// GFX1250: v_floor_f32_e64 v5, null ; encoding: [0x05,0x00,0xa4,0xd5,0x7c,0x00,0x00,0x00] + +v_floor_f32_e64 v5, -1 +// GFX1250: v_floor_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa4,0xd5,0xc1,0x00,0x00,0x00] + +v_floor_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_floor_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa4,0xd5,0xf0,0x00,0x00,0x08] + +v_floor_f32_e64 v5, src_scc mul:4 +// GFX1250: v_floor_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa4,0xd5,0xfd,0x00,0x00,0x10] + +v_floor_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_floor_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa4,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_floor_f64_e64 v[6:7], v[2:3] +// GFX1250: v_floor_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x9a,0xd5,0x02,0x01,0x00,0x00] + +v_floor_f64_e64 v[6:7], v[254:255] +// GFX1250: v_floor_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x9a,0xd5,0xfe,0x01,0x00,0x00] + +v_floor_f64_e64 v[6:7], s[2:3] +// GFX1250: v_floor_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x9a,0xd5,0x02,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], s[104:105] +// GFX1250: v_floor_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x9a,0xd5,0x68,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], vcc +// GFX1250: 
v_floor_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x9a,0xd5,0x6a,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_floor_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x9a,0xd5,0x7a,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], exec +// GFX1250: v_floor_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x9a,0xd5,0x7e,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], null +// GFX1250: v_floor_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x9a,0xd5,0x7c,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], -1 +// GFX1250: v_floor_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x9a,0xd5,0xc1,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_floor_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x9a,0xd5,0xf0,0x00,0x00,0x08] + +v_floor_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_floor_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x9a,0xd5,0xfd,0x00,0x00,0x30] + +v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_fract_f16_e64 v5, v1 +// GFX1250: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f16_e64 v5, v255 +// GFX1250: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] + +v_fract_f16_e64 v5, s1 +// GFX1250: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] + +v_fract_f16_e64 v5, s105 +// GFX1250: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] + +v_fract_f16_e64 v5, vcc_lo +// GFX1250: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] + +v_fract_f16_e64 v5, vcc_hi +// GFX1250: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] + +v_fract_f16_e64 v5, ttmp15 +// GFX1250: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] + +v_fract_f16_e64 v5, m0 +// GFX1250: v_fract_f16_e64 v5, m0 ; 
encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] + +v_fract_f16_e64 v5, exec_lo +// GFX1250: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] + +v_fract_f16_e64 v5, exec_hi +// GFX1250: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] + +v_fract_f16_e64 v5, null +// GFX1250: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] + +v_fract_f16_e64 v5, -1 +// GFX1250: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] + +v_fract_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] + +v_fract_f16_e64 v5, src_scc mul:4 +// GFX1250: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] + +v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_fract_f32_e64 v5, v1 +// GFX1250: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f32_e64 v5, v255 +// GFX1250: v_fract_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa0,0xd5,0xff,0x01,0x00,0x00] + +v_fract_f32_e64 v5, s1 +// GFX1250: v_fract_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x00,0x00,0x00] + +v_fract_f32_e64 v5, s105 +// GFX1250: v_fract_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa0,0xd5,0x69,0x00,0x00,0x00] + +v_fract_f32_e64 v5, vcc_lo +// GFX1250: v_fract_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa0,0xd5,0x6a,0x00,0x00,0x00] + +v_fract_f32_e64 v5, vcc_hi +// GFX1250: v_fract_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa0,0xd5,0x6b,0x00,0x00,0x00] + +v_fract_f32_e64 v5, ttmp15 +// GFX1250: v_fract_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa0,0xd5,0x7b,0x00,0x00,0x00] + +v_fract_f32_e64 v5, m0 +// GFX1250: v_fract_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa0,0xd5,0x7d,0x00,0x00,0x00] + +v_fract_f32_e64 v5, exec_lo +// GFX1250: 
v_fract_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa0,0xd5,0x7e,0x00,0x00,0x00] + +v_fract_f32_e64 v5, exec_hi +// GFX1250: v_fract_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa0,0xd5,0x7f,0x00,0x00,0x00] + +v_fract_f32_e64 v5, null +// GFX1250: v_fract_f32_e64 v5, null ; encoding: [0x05,0x00,0xa0,0xd5,0x7c,0x00,0x00,0x00] + +v_fract_f32_e64 v5, -1 +// GFX1250: v_fract_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa0,0xd5,0xc1,0x00,0x00,0x00] + +v_fract_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_fract_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa0,0xd5,0xf0,0x00,0x00,0x08] + +v_fract_f32_e64 v5, src_scc mul:4 +// GFX1250: v_fract_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa0,0xd5,0xfd,0x00,0x00,0x10] + +v_fract_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_fract_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa0,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_fract_f64_e64 v[6:7], v[2:3] +// GFX1250: v_fract_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xbe,0xd5,0x02,0x01,0x00,0x00] + +v_fract_f64_e64 v[6:7], v[254:255] +// GFX1250: v_fract_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xbe,0xd5,0xfe,0x01,0x00,0x00] + +v_fract_f64_e64 v[6:7], s[2:3] +// GFX1250: v_fract_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xbe,0xd5,0x02,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], s[104:105] +// GFX1250: v_fract_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xbe,0xd5,0x68,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], vcc +// GFX1250: v_fract_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xbe,0xd5,0x6a,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_fract_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xbe,0xd5,0x7a,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], exec +// GFX1250: v_fract_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xbe,0xd5,0x7e,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], null +// GFX1250: v_fract_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xbe,0xd5,0x7c,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], -1 +// 
GFX1250: v_fract_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xbe,0xd5,0xc1,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_fract_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xbe,0xd5,0xf0,0x00,0x00,0x08] + +v_fract_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_fract_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xbe,0xd5,0xfd,0x00,0x00,0x30] + +v_fract_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_fract_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbe,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_frexp_exp_i16_f16_e64 v5, v1 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, v255 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xda,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, s1 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, s105 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xda,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, vcc_lo +// GFX1250: v_frexp_exp_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xda,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, vcc_hi +// GFX1250: v_frexp_exp_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xda,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, ttmp15 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xda,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, m0 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xda,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, exec_lo +// GFX1250: v_frexp_exp_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xda,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, exec_hi +// GFX1250: v_frexp_exp_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xda,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, null +// GFX1250: 
v_frexp_exp_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xda,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, -1 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xda,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, 0.5 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xda,0xd5,0xf0,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, src_scc +// GFX1250: v_frexp_exp_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xda,0xd5,0xfd,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v255, -|0xfe0b| +// GFX1250: v_frexp_exp_i16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xda,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, v1 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xbf,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, v255 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xbf,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, s1 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xbf,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, s105 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xbf,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, vcc_lo +// GFX1250: v_frexp_exp_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbf,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, vcc_hi +// GFX1250: v_frexp_exp_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xbf,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, ttmp15 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xbf,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, m0 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xbf,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, exec_lo +// GFX1250: v_frexp_exp_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbf,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, exec_hi +// GFX1250: v_frexp_exp_i32_f32_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xbf,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, null +// GFX1250: v_frexp_exp_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0xbf,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, -1 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xbf,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, 0.5 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbf,0xd5,0xf0,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, src_scc +// GFX1250: v_frexp_exp_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbf,0xd5,0xfd,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v255, -|0xaf123456| +// GFX1250: v_frexp_exp_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0xbf,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_frexp_exp_i32_f64_e64 v5, v[2:3] +// GFX1250: v_frexp_exp_i32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0xbc,0xd5,0x02,0x01,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, v[254:255] +// GFX1250: v_frexp_exp_i32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0xbc,0xd5,0xfe,0x01,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, s[2:3] +// GFX1250: v_frexp_exp_i32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0xbc,0xd5,0x02,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, s[104:105] +// GFX1250: v_frexp_exp_i32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0xbc,0xd5,0x68,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, vcc +// GFX1250: v_frexp_exp_i32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0xbc,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, ttmp[14:15] +// GFX1250: v_frexp_exp_i32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0xbc,0xd5,0x7a,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, exec +// GFX1250: v_frexp_exp_i32_f64_e64 v5, exec ; encoding: [0x05,0x00,0xbc,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, null +// GFX1250: v_frexp_exp_i32_f64_e64 v5, null ; encoding: [0x05,0x00,0xbc,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, -1 +// GFX1250: v_frexp_exp_i32_f64_e64 v5, -1 ; encoding: 
[0x05,0x00,0xbc,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, 0.5 +// GFX1250: v_frexp_exp_i32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbc,0xd5,0xf0,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, -|src_scc| +// GFX1250: v_frexp_exp_i32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0xbc,0xd5,0xfd,0x00,0x00,0x20] + +v_frexp_exp_i32_f64_e64 v255, 0xaf123456 +// GFX1250: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_frexp_mant_f16_e64 v5, v1 +// GFX1250: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v5, v255 +// GFX1250: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v5, s1 +// GFX1250: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, s105 +// GFX1250: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, vcc_lo +// GFX1250: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, vcc_hi +// GFX1250: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, ttmp15 +// GFX1250: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, m0 +// GFX1250: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, exec_lo +// GFX1250: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, exec_hi +// GFX1250: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, null +// GFX1250: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, -1 +// GFX1250: 
v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] + +v_frexp_mant_f16_e64 v5, src_scc mul:4 +// GFX1250: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] + +v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f32_e64 v5, v1 +// GFX1250: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f32_e64 v5, v255 +// GFX1250: v_frexp_mant_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xc0,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_mant_f32_e64 v5, s1 +// GFX1250: v_frexp_mant_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, s105 +// GFX1250: v_frexp_mant_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xc0,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, vcc_lo +// GFX1250: v_frexp_mant_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xc0,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, vcc_hi +// GFX1250: v_frexp_mant_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xc0,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, ttmp15 +// GFX1250: v_frexp_mant_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xc0,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, m0 +// GFX1250: v_frexp_mant_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xc0,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, exec_lo +// GFX1250: v_frexp_mant_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xc0,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, exec_hi +// GFX1250: v_frexp_mant_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xc0,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, null +// GFX1250: v_frexp_mant_f32_e64 v5, null ; encoding: [0x05,0x00,0xc0,0xd5,0x7c,0x00,0x00,0x00] 
+ +v_frexp_mant_f32_e64 v5, -1 +// GFX1250: v_frexp_mant_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xc0,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_frexp_mant_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xc0,0xd5,0xf0,0x00,0x00,0x08] + +v_frexp_mant_f32_e64 v5, src_scc mul:4 +// GFX1250: v_frexp_mant_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xc0,0xd5,0xfd,0x00,0x00,0x10] + +v_frexp_mant_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_frexp_mant_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xc0,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_frexp_mant_f64_e64 v[6:7], v[2:3] +// GFX1250: v_frexp_mant_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xbd,0xd5,0x02,0x01,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], v[254:255] +// GFX1250: v_frexp_mant_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xbd,0xd5,0xfe,0x01,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], s[2:3] +// GFX1250: v_frexp_mant_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xbd,0xd5,0x02,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], s[104:105] +// GFX1250: v_frexp_mant_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xbd,0xd5,0x68,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], vcc +// GFX1250: v_frexp_mant_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xbd,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_frexp_mant_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xbd,0xd5,0x7a,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], exec +// GFX1250: v_frexp_mant_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xbd,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], null +// GFX1250: v_frexp_mant_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xbd,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], -1 +// GFX1250: v_frexp_mant_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xbd,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_frexp_mant_f64_e64 v[6:7], 0.5 mul:2 ; encoding: 
[0x06,0x00,0xbd,0xd5,0xf0,0x00,0x00,0x08] + +v_frexp_mant_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_frexp_mant_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xbd,0xd5,0xfd,0x00,0x00,0x30] + +v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbd,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_log_f16_e64 v5, v1 +// GFX1250: v_log_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] + +v_log_f16_e64 v5, v255 +// GFX1250: v_log_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] + +v_log_f16_e64 v5, s1 +// GFX1250: v_log_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] + +v_log_f16_e64 v5, s105 +// GFX1250: v_log_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] + +v_log_f16_e64 v5, vcc_lo +// GFX1250: v_log_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] + +v_log_f16_e64 v5, vcc_hi +// GFX1250: v_log_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] + +v_log_f16_e64 v5, ttmp15 +// GFX1250: v_log_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] + +v_log_f16_e64 v5, m0 +// GFX1250: v_log_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] + +v_log_f16_e64 v5, exec_lo +// GFX1250: v_log_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] + +v_log_f16_e64 v5, exec_hi +// GFX1250: v_log_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] + +v_log_f16_e64 v5, null +// GFX1250: v_log_f16_e64 v5, null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] + +v_log_f16_e64 v5, -1 +// GFX1250: v_log_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] + +v_log_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_log_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] + +v_log_f16_e64 v5, src_scc mul:4 +// GFX1250: v_log_f16_e64 v5, 
src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] + +v_log_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_log_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_log_f32_e64 v5, v1 +// GFX1250: v_log_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa7,0xd5,0x01,0x01,0x00,0x00] + +v_log_f32_e64 v5, v255 +// GFX1250: v_log_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa7,0xd5,0xff,0x01,0x00,0x00] + +v_log_f32_e64 v5, s1 +// GFX1250: v_log_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa7,0xd5,0x01,0x00,0x00,0x00] + +v_log_f32_e64 v5, s105 +// GFX1250: v_log_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa7,0xd5,0x69,0x00,0x00,0x00] + +v_log_f32_e64 v5, vcc_lo +// GFX1250: v_log_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa7,0xd5,0x6a,0x00,0x00,0x00] + +v_log_f32_e64 v5, vcc_hi +// GFX1250: v_log_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa7,0xd5,0x6b,0x00,0x00,0x00] + +v_log_f32_e64 v5, ttmp15 +// GFX1250: v_log_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa7,0xd5,0x7b,0x00,0x00,0x00] + +v_log_f32_e64 v5, m0 +// GFX1250: v_log_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa7,0xd5,0x7d,0x00,0x00,0x00] + +v_log_f32_e64 v5, exec_lo +// GFX1250: v_log_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa7,0xd5,0x7e,0x00,0x00,0x00] + +v_log_f32_e64 v5, exec_hi +// GFX1250: v_log_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa7,0xd5,0x7f,0x00,0x00,0x00] + +v_log_f32_e64 v5, null +// GFX1250: v_log_f32_e64 v5, null ; encoding: [0x05,0x00,0xa7,0xd5,0x7c,0x00,0x00,0x00] + +v_log_f32_e64 v5, -1 +// GFX1250: v_log_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa7,0xd5,0xc1,0x00,0x00,0x00] + +v_log_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_log_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa7,0xd5,0xf0,0x00,0x00,0x08] + +v_log_f32_e64 v5, src_scc mul:4 +// GFX1250: v_log_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa7,0xd5,0xfd,0x00,0x00,0x10] + +v_log_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_log_f32_e64 v255, -|0xaf123456| 
clamp div:2 ; encoding: [0xff,0x81,0xa7,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_mov_b32_e64 v5, v1 +// GFX1250: v_mov_b32_e64 v5, v1 ; encoding: [0x05,0x00,0x81,0xd5,0x01,0x01,0x00,0x00] + +v_mov_b32_e64 v5, v255 +// GFX1250: v_mov_b32_e64 v5, v255 ; encoding: [0x05,0x00,0x81,0xd5,0xff,0x01,0x00,0x00] + +v_mov_b32_e64 v5, s1 +// GFX1250: v_mov_b32_e64 v5, s1 ; encoding: [0x05,0x00,0x81,0xd5,0x01,0x00,0x00,0x00] + +v_mov_b32_e64 v5, s105 +// GFX1250: v_mov_b32_e64 v5, s105 ; encoding: [0x05,0x00,0x81,0xd5,0x69,0x00,0x00,0x00] + +v_mov_b32_e64 v5, vcc_lo +// GFX1250: v_mov_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x81,0xd5,0x6a,0x00,0x00,0x00] + +v_mov_b32_e64 v5, vcc_hi +// GFX1250: v_mov_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x81,0xd5,0x6b,0x00,0x00,0x00] + +v_mov_b32_e64 v5, ttmp15 +// GFX1250: v_mov_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x81,0xd5,0x7b,0x00,0x00,0x00] + +v_mov_b32_e64 v5, m0 +// GFX1250: v_mov_b32_e64 v5, m0 ; encoding: [0x05,0x00,0x81,0xd5,0x7d,0x00,0x00,0x00] + +v_mov_b32_e64 v5, exec_lo +// GFX1250: v_mov_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x81,0xd5,0x7e,0x00,0x00,0x00] + +v_mov_b32_e64 v5, exec_hi +// GFX1250: v_mov_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x81,0xd5,0x7f,0x00,0x00,0x00] + +v_mov_b32_e64 v5, null +// GFX1250: v_mov_b32_e64 v5, null ; encoding: [0x05,0x00,0x81,0xd5,0x7c,0x00,0x00,0x00] + +v_mov_b32_e64 v5, -1 +// GFX1250: v_mov_b32_e64 v5, -1 ; encoding: [0x05,0x00,0x81,0xd5,0xc1,0x00,0x00,0x00] + +v_mov_b32_e64 v5, 0.5 +// GFX1250: v_mov_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x81,0xd5,0xf0,0x00,0x00,0x00] + +v_mov_b32_e64 v5, src_scc +// GFX1250: v_mov_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0x81,0xd5,0xfd,0x00,0x00,0x00] + +v_mov_b32_e64 v255, 0xaf123456 +// GFX1250: v_mov_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0x81,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_movreld_b32_e64 v5, v1 +// GFX1250: v_movreld_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc2,0xd5,0x01,0x01,0x00,0x00] + 
+v_movreld_b32_e64 v5, v255 +// GFX1250: v_movreld_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xc2,0xd5,0xff,0x01,0x00,0x00] + +v_movreld_b32_e64 v5, s1 +// GFX1250: v_movreld_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xc2,0xd5,0x01,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, s105 +// GFX1250: v_movreld_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xc2,0xd5,0x69,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, vcc_lo +// GFX1250: v_movreld_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xc2,0xd5,0x6a,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, vcc_hi +// GFX1250: v_movreld_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xc2,0xd5,0x6b,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, ttmp15 +// GFX1250: v_movreld_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xc2,0xd5,0x7b,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, m0 +// GFX1250: v_movreld_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xc2,0xd5,0x7d,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, exec_lo +// GFX1250: v_movreld_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xc2,0xd5,0x7e,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, exec_hi +// GFX1250: v_movreld_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xc2,0xd5,0x7f,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, null +// GFX1250: v_movreld_b32_e64 v5, null ; encoding: [0x05,0x00,0xc2,0xd5,0x7c,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, -1 +// GFX1250: v_movreld_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xc2,0xd5,0xc1,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, 0.5 +// GFX1250: v_movreld_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xc2,0xd5,0xf0,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, src_scc +// GFX1250: v_movreld_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xc2,0xd5,0xfd,0x00,0x00,0x00] + +v_movreld_b32_e64 v255, 0xaf123456 +// GFX1250: v_movreld_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xc2,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_movrels_b32_e64 v5, v1 +// GFX1250: v_movrels_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc3,0xd5,0x01,0x01,0x00,0x00] + +v_movrels_b32_e64 v255, v255 +// GFX1250: v_movrels_b32_e64 v255, v255 ; encoding: 
[0xff,0x00,0xc3,0xd5,0xff,0x01,0x00,0x00] + +v_movrelsd_2_b32_e64 v5, v1 +// GFX1250: v_movrelsd_2_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc8,0xd5,0x01,0x01,0x00,0x00] + +v_movrelsd_2_b32_e64 v255, v255 +// GFX1250: v_movrelsd_2_b32_e64 v255, v255 ; encoding: [0xff,0x00,0xc8,0xd5,0xff,0x01,0x00,0x00] + +v_movrelsd_b32_e64 v5, v1 +// GFX1250: v_movrelsd_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc4,0xd5,0x01,0x01,0x00,0x00] + +v_movrelsd_b32_e64 v255, v255 +// GFX1250: v_movrelsd_b32_e64 v255, v255 ; encoding: [0xff,0x00,0xc4,0xd5,0xff,0x01,0x00,0x00] + +v_nop_e64 +// GFX1250: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00] + +v_not_b16_e64 v5, v1 +// GFX1250: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +v_not_b16_e64 v5, v255 +// GFX1250: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] + +v_not_b16_e64 v5, s1 +// GFX1250: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] + +v_not_b16_e64 v5, s105 +// GFX1250: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] + +v_not_b16_e64 v5, vcc_lo +// GFX1250: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] + +v_not_b16_e64 v5, vcc_hi +// GFX1250: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] + +v_not_b16_e64 v5, ttmp15 +// GFX1250: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] + +v_not_b16_e64 v5, m0 +// GFX1250: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] + +v_not_b16_e64 v5, exec_lo +// GFX1250: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] + +v_not_b16_e64 v5, exec_hi +// GFX1250: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] + +v_not_b16_e64 v5, null +// GFX1250: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] + +v_not_b16_e64 v5, -1 +// GFX1250: v_not_b16_e64 v5, -1 ; encoding: 
[0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] + +v_not_b16_e64 v5, 0.5 +// GFX1250: v_not_b16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] + +v_not_b16_e64 v5, src_scc +// GFX1250: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] + +v_not_b16_e64 v255, 0xfe0b +// GFX1250: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_not_b32_e64 v5, v1 +// GFX1250: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] + +v_not_b32_e64 v5, v255 +// GFX1250: v_not_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xb7,0xd5,0xff,0x01,0x00,0x00] + +v_not_b32_e64 v5, s1 +// GFX1250: v_not_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x00,0x00,0x00] + +v_not_b32_e64 v5, s105 +// GFX1250: v_not_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xb7,0xd5,0x69,0x00,0x00,0x00] + +v_not_b32_e64 v5, vcc_lo +// GFX1250: v_not_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb7,0xd5,0x6a,0x00,0x00,0x00] + +v_not_b32_e64 v5, vcc_hi +// GFX1250: v_not_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb7,0xd5,0x6b,0x00,0x00,0x00] + +v_not_b32_e64 v5, ttmp15 +// GFX1250: v_not_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb7,0xd5,0x7b,0x00,0x00,0x00] + +v_not_b32_e64 v5, m0 +// GFX1250: v_not_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xb7,0xd5,0x7d,0x00,0x00,0x00] + +v_not_b32_e64 v5, exec_lo +// GFX1250: v_not_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb7,0xd5,0x7e,0x00,0x00,0x00] + +v_not_b32_e64 v5, exec_hi +// GFX1250: v_not_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb7,0xd5,0x7f,0x00,0x00,0x00] + +v_not_b32_e64 v5, null +// GFX1250: v_not_b32_e64 v5, null ; encoding: [0x05,0x00,0xb7,0xd5,0x7c,0x00,0x00,0x00] + +v_not_b32_e64 v5, -1 +// GFX1250: v_not_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xb7,0xd5,0xc1,0x00,0x00,0x00] + +v_not_b32_e64 v5, 0.5 +// GFX1250: v_not_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb7,0xd5,0xf0,0x00,0x00,0x00] + +v_not_b32_e64 v5, src_scc +// GFX1250: v_not_b32_e64 v5, 
src_scc ; encoding: [0x05,0x00,0xb7,0xd5,0xfd,0x00,0x00,0x00] + +v_not_b32_e64 v255, 0xaf123456 +// GFX1250: v_not_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb7,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_pipeflush_e64 +// GFX1250: v_pipeflush ; encoding: [0x00,0x00,0x9b,0xd5,0x00,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, v1 +// GFX1250: v_rcp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] + +v_rcp_f16_e64 v5, v255 +// GFX1250: v_rcp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] + +v_rcp_f16_e64 v5, s1 +// GFX1250: v_rcp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, s105 +// GFX1250: v_rcp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, vcc_lo +// GFX1250: v_rcp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, vcc_hi +// GFX1250: v_rcp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, ttmp15 +// GFX1250: v_rcp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, m0 +// GFX1250: v_rcp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, exec_lo +// GFX1250: v_rcp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, exec_hi +// GFX1250: v_rcp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, null +// GFX1250: v_rcp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, -1 +// GFX1250: v_rcp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_rcp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] + +v_rcp_f16_e64 v5, src_scc mul:4 +// GFX1250: v_rcp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] + +v_rcp_f16_e64 v255, 
-|0xfe0b| clamp div:2 +// GFX1250: v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_rcp_f32_e64 v5, v1 +// GFX1250: v_rcp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xaa,0xd5,0x01,0x01,0x00,0x00] + +v_rcp_f32_e64 v5, v255 +// GFX1250: v_rcp_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xaa,0xd5,0xff,0x01,0x00,0x00] + +v_rcp_f32_e64 v5, s1 +// GFX1250: v_rcp_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xaa,0xd5,0x01,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, s105 +// GFX1250: v_rcp_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xaa,0xd5,0x69,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, vcc_lo +// GFX1250: v_rcp_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xaa,0xd5,0x6a,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, vcc_hi +// GFX1250: v_rcp_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xaa,0xd5,0x6b,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, ttmp15 +// GFX1250: v_rcp_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xaa,0xd5,0x7b,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, m0 +// GFX1250: v_rcp_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xaa,0xd5,0x7d,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, exec_lo +// GFX1250: v_rcp_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xaa,0xd5,0x7e,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, exec_hi +// GFX1250: v_rcp_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xaa,0xd5,0x7f,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, null +// GFX1250: v_rcp_f32_e64 v5, null ; encoding: [0x05,0x00,0xaa,0xd5,0x7c,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, -1 +// GFX1250: v_rcp_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xaa,0xd5,0xc1,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_rcp_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xaa,0xd5,0xf0,0x00,0x00,0x08] + +v_rcp_f32_e64 v5, src_scc mul:4 +// GFX1250: v_rcp_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xaa,0xd5,0xfd,0x00,0x00,0x10] + +v_rcp_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_rcp_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xaa,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + 
+v_rcp_f64_e64 v[6:7], v[2:3] +// GFX1250: v_rcp_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xaf,0xd5,0x02,0x01,0x00,0x00] + +v_rcp_f64_e64 v[6:7], v[254:255] +// GFX1250: v_rcp_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xaf,0xd5,0xfe,0x01,0x00,0x00] + +v_rcp_f64_e64 v[6:7], s[2:3] +// GFX1250: v_rcp_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xaf,0xd5,0x02,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], s[104:105] +// GFX1250: v_rcp_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xaf,0xd5,0x68,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], vcc +// GFX1250: v_rcp_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xaf,0xd5,0x6a,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_rcp_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xaf,0xd5,0x7a,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], exec +// GFX1250: v_rcp_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xaf,0xd5,0x7e,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], null +// GFX1250: v_rcp_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xaf,0xd5,0x7c,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], -1 +// GFX1250: v_rcp_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xaf,0xd5,0xc1,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_rcp_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xaf,0xd5,0xf0,0x00,0x00,0x08] + +v_rcp_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_rcp_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xaf,0xd5,0xfd,0x00,0x00,0x30] + +v_rcp_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_rcp_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xaf,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_rcp_iflag_f32_e64 v5, v1 +// GFX1250: v_rcp_iflag_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xab,0xd5,0x01,0x01,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, v255 +// GFX1250: v_rcp_iflag_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xab,0xd5,0xff,0x01,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, s1 +// GFX1250: v_rcp_iflag_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xab,0xd5,0x01,0x00,0x00,0x00] 
+ +v_rcp_iflag_f32_e64 v5, s105 +// GFX1250: v_rcp_iflag_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xab,0xd5,0x69,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, vcc_lo +// GFX1250: v_rcp_iflag_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xab,0xd5,0x6a,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, vcc_hi +// GFX1250: v_rcp_iflag_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xab,0xd5,0x6b,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, ttmp15 +// GFX1250: v_rcp_iflag_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xab,0xd5,0x7b,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, m0 +// GFX1250: v_rcp_iflag_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xab,0xd5,0x7d,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, exec_lo +// GFX1250: v_rcp_iflag_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xab,0xd5,0x7e,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, exec_hi +// GFX1250: v_rcp_iflag_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xab,0xd5,0x7f,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, null +// GFX1250: v_rcp_iflag_f32_e64 v5, null ; encoding: [0x05,0x00,0xab,0xd5,0x7c,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, -1 +// GFX1250: v_rcp_iflag_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xab,0xd5,0xc1,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_rcp_iflag_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xab,0xd5,0xf0,0x00,0x00,0x08] + +v_rcp_iflag_f32_e64 v5, src_scc mul:4 +// GFX1250: v_rcp_iflag_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xab,0xd5,0xfd,0x00,0x00,0x10] + +v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_rndne_f16_e64 v5, v1 +// GFX1250: v_rndne_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00] + +v_rndne_f16_e64 v5, v255 +// GFX1250: v_rndne_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00] + +v_rndne_f16_e64 v5, s1 +// GFX1250: v_rndne_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00] + 
+v_rndne_f16_e64 v5, s105 +// GFX1250: v_rndne_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, vcc_lo +// GFX1250: v_rndne_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, vcc_hi +// GFX1250: v_rndne_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, ttmp15 +// GFX1250: v_rndne_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, m0 +// GFX1250: v_rndne_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, exec_lo +// GFX1250: v_rndne_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, exec_hi +// GFX1250: v_rndne_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, null +// GFX1250: v_rndne_f16_e64 v5, null ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, -1 +// GFX1250: v_rndne_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_rndne_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08] + +v_rndne_f16_e64 v5, src_scc mul:4 +// GFX1250: v_rndne_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10] + +v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_rndne_f32_e64 v5, v1 +// GFX1250: v_rndne_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00] + +v_rndne_f32_e64 v5, v255 +// GFX1250: v_rndne_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa3,0xd5,0xff,0x01,0x00,0x00] + +v_rndne_f32_e64 v5, s1 +// GFX1250: v_rndne_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, s105 +// GFX1250: v_rndne_f32_e64 v5, s105 ; encoding: 
[0x05,0x00,0xa3,0xd5,0x69,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, vcc_lo +// GFX1250: v_rndne_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa3,0xd5,0x6a,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, vcc_hi +// GFX1250: v_rndne_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa3,0xd5,0x6b,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, ttmp15 +// GFX1250: v_rndne_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa3,0xd5,0x7b,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, m0 +// GFX1250: v_rndne_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa3,0xd5,0x7d,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, exec_lo +// GFX1250: v_rndne_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa3,0xd5,0x7e,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, exec_hi +// GFX1250: v_rndne_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa3,0xd5,0x7f,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, null +// GFX1250: v_rndne_f32_e64 v5, null ; encoding: [0x05,0x00,0xa3,0xd5,0x7c,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, -1 +// GFX1250: v_rndne_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa3,0xd5,0xc1,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_rndne_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa3,0xd5,0xf0,0x00,0x00,0x08] + +v_rndne_f32_e64 v5, src_scc mul:4 +// GFX1250: v_rndne_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa3,0xd5,0xfd,0x00,0x00,0x10] + +v_rndne_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_rndne_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa3,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_rndne_f64_e64 v[6:7], v[2:3] +// GFX1250: v_rndne_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x99,0xd5,0x02,0x01,0x00,0x00] + +v_rndne_f64_e64 v[6:7], v[254:255] +// GFX1250: v_rndne_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x99,0xd5,0xfe,0x01,0x00,0x00] + +v_rndne_f64_e64 v[6:7], s[2:3] +// GFX1250: v_rndne_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x99,0xd5,0x02,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], s[104:105] +// GFX1250: v_rndne_f64_e64 v[6:7], s[104:105] ; encoding: 
[0x06,0x00,0x99,0xd5,0x68,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], vcc +// GFX1250: v_rndne_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x99,0xd5,0x6a,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_rndne_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x99,0xd5,0x7a,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], exec +// GFX1250: v_rndne_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x99,0xd5,0x7e,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], null +// GFX1250: v_rndne_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x99,0xd5,0x7c,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], -1 +// GFX1250: v_rndne_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x99,0xd5,0xc1,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_rndne_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x99,0xd5,0xf0,0x00,0x00,0x08] + +v_rndne_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_rndne_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x99,0xd5,0xfd,0x00,0x00,0x30] + +v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x99,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_rsq_f16_e64 v5, v1 +// GFX1250: v_rsq_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] + +v_rsq_f16_e64 v5, v255 +// GFX1250: v_rsq_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] + +v_rsq_f16_e64 v5, s1 +// GFX1250: v_rsq_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, s105 +// GFX1250: v_rsq_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, vcc_lo +// GFX1250: v_rsq_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, vcc_hi +// GFX1250: v_rsq_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, ttmp15 +// GFX1250: v_rsq_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] + 
+v_rsq_f16_e64 v5, m0 +// GFX1250: v_rsq_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, exec_lo +// GFX1250: v_rsq_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, exec_hi +// GFX1250: v_rsq_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, null +// GFX1250: v_rsq_f16_e64 v5, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, -1 +// GFX1250: v_rsq_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_rsq_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] + +v_rsq_f16_e64 v5, src_scc mul:4 +// GFX1250: v_rsq_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] + +v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_rsq_f32_e64 v5, v1 +// GFX1250: v_rsq_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xae,0xd5,0x01,0x01,0x00,0x00] + +v_rsq_f32_e64 v5, v255 +// GFX1250: v_rsq_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xae,0xd5,0xff,0x01,0x00,0x00] + +v_rsq_f32_e64 v5, s1 +// GFX1250: v_rsq_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xae,0xd5,0x01,0x00,0x00,0x00] + +v_rsq_f32_e64 v5, s105 +// GFX1250: v_rsq_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xae,0xd5,0x69,0x00,0x00,0x00] + +v_rsq_f32_e64 v5, vcc_lo +// GFX1250: v_rsq_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xae,0xd5,0x6a,0x00,0x00,0x00] + +v_rsq_f32_e64 v5, vcc_hi +// GFX1250: v_rsq_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xae,0xd5,0x6b,0x00,0x00,0x00] + +v_rsq_f32_e64 v5, ttmp15 +// GFX1250: v_rsq_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xae,0xd5,0x7b,0x00,0x00,0x00] + +v_rsq_f32_e64 v5, m0 +// GFX1250: v_rsq_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xae,0xd5,0x7d,0x00,0x00,0x00] + +v_rsq_f32_e64 v5, exec_lo +// GFX1250: v_rsq_f32_e64 
v5, exec_lo ; encoding: [0x05,0x00,0xae,0xd5,0x7e,0x00,0x00,0x00] + +v_rsq_f32_e64 v5, exec_hi +// GFX1250: v_rsq_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xae,0xd5,0x7f,0x00,0x00,0x00] + +v_rsq_f32_e64 v5, null +// GFX1250: v_rsq_f32_e64 v5, null ; encoding: [0x05,0x00,0xae,0xd5,0x7c,0x00,0x00,0x00] + +v_rsq_f32_e64 v5, -1 +// GFX1250: v_rsq_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xae,0xd5,0xc1,0x00,0x00,0x00] + +v_rsq_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_rsq_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xae,0xd5,0xf0,0x00,0x00,0x08] + +v_rsq_f32_e64 v5, src_scc mul:4 +// GFX1250: v_rsq_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xae,0xd5,0xfd,0x00,0x00,0x10] + +v_rsq_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_rsq_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xae,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_rsq_f64_e64 v[6:7], v[2:3] +// GFX1250: v_rsq_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xb1,0xd5,0x02,0x01,0x00,0x00] + +v_rsq_f64_e64 v[6:7], v[254:255] +// GFX1250: v_rsq_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xb1,0xd5,0xfe,0x01,0x00,0x00] + +v_rsq_f64_e64 v[6:7], s[2:3] +// GFX1250: v_rsq_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xb1,0xd5,0x02,0x00,0x00,0x00] + +v_rsq_f64_e64 v[6:7], s[104:105] +// GFX1250: v_rsq_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xb1,0xd5,0x68,0x00,0x00,0x00] + +v_rsq_f64_e64 v[6:7], vcc +// GFX1250: v_rsq_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xb1,0xd5,0x6a,0x00,0x00,0x00] + +v_rsq_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_rsq_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xb1,0xd5,0x7a,0x00,0x00,0x00] + +v_rsq_f64_e64 v[6:7], exec +// GFX1250: v_rsq_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xb1,0xd5,0x7e,0x00,0x00,0x00] + +v_rsq_f64_e64 v[6:7], null +// GFX1250: v_rsq_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xb1,0xd5,0x7c,0x00,0x00,0x00] + +v_rsq_f64_e64 v[6:7], -1 +// GFX1250: v_rsq_f64_e64 v[6:7], -1 ; encoding: 
[0x06,0x00,0xb1,0xd5,0xc1,0x00,0x00,0x00] + +v_rsq_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_rsq_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xb1,0xd5,0xf0,0x00,0x00,0x08] + +v_rsq_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_rsq_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xb1,0xd5,0xfd,0x00,0x00,0x30] + +v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_sat_pk_u8_i16_e64 v5, v1 +// GFX1250: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] + +v_sat_pk_u8_i16_e64 v5, v255 +// GFX1250: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] + +v_sat_pk_u8_i16_e64 v5, s1 +// GFX1250: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] + +v_sat_pk_u8_i16_e64 v5, s105 +// GFX1250: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] + +v_sat_pk_u8_i16_e64 v5, vcc_lo +// GFX1250: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] + +v_sat_pk_u8_i16_e64 v5, vcc_hi +// GFX1250: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] + +v_sat_pk_u8_i16_e64 v5, ttmp15 +// GFX1250: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, v255 -// GFX1250: v_tanh_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xca,0xd5,0xff,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, m0 +// GFX1250: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, s1 -// GFX1250: v_tanh_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, exec_lo +// GFX1250: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, s105 -// GFX1250: v_tanh_bf16_e64 v5, s105 ; encoding: 
[0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, exec_hi +// GFX1250: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, vcc_lo -// GFX1250: v_tanh_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xca,0xd5,0x6a,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, null +// GFX1250: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, vcc_hi -// GFX1250: v_tanh_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xca,0xd5,0x6b,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, -1 +// GFX1250: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, ttmp15 -// GFX1250: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, 0.5 +// GFX1250: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, m0 -// GFX1250: v_tanh_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xca,0xd5,0x7d,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, src_scc +// GFX1250: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, exec_lo -// GFX1250: v_tanh_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xca,0xd5,0x7e,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v255, 0xfe0b +// GFX1250: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] -v_tanh_bf16_e64 v5, exec_hi -// GFX1250: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00] +v_sin_f16_e64 v5, v1 +// GFX1250: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] -v_tanh_bf16_e64 v5, null -// GFX1250: v_tanh_bf16_e64 v5, null ; encoding: [0x05,0x00,0xca,0xd5,0x7c,0x00,0x00,0x00] +v_sin_f16_e64 v5, v255 +// GFX1250: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] -v_tanh_bf16_e64 v5, -1 -// GFX1250: v_tanh_bf16_e64 v5, -1 ; encoding: 
[0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] +v_sin_f16_e64 v5, s1 +// GFX1250: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] +v_sin_f16_e64 v5, s105 +// GFX1250: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] +v_sin_f16_e64 v5, vcc_lo +// GFX1250: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] -v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_sin_f16_e64 v5, vcc_hi +// GFX1250: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, s3 -// GFX1250: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] +v_sin_f16_e64 v5, ttmp15 +// GFX1250: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 -// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] +v_sin_f16_e64 v5, m0 +// GFX1250: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 -// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] +v_sin_f16_e64 v5, exec_lo +// GFX1250: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 -// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] +v_sin_f16_e64 v5, exec_hi +// GFX1250: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, 3 -// GFX1250: 
v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] +v_sin_f16_e64 v5, null +// GFX1250: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 -// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] +v_sin_f16_e64 v5, -1 +// GFX1250: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 -// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] +v_sin_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] -v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 -// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] +v_sin_f16_e64 v5, src_scc mul:4 +// GFX1250: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] -v_cvt_f32_bf8_e64 v1, v3 -// GFX1250: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] +v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] -v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 -// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] +v_sin_f32_e64 v5, v1 +// GFX1250: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] -v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 -// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] +v_sin_f32_e64 v5, v255 +// GFX1250: v_sin_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb5,0xd5,0xff,0x01,0x00,0x00] -v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 -// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] +v_sin_f32_e64 v5, s1 +// GFX1250: v_sin_f32_e64 v5, s1 ; encoding: 
[0x05,0x00,0xb5,0xd5,0x01,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, s3 -// GFX1250: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] +v_sin_f32_e64 v5, s105 +// GFX1250: v_sin_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb5,0xd5,0x69,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 -// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] +v_sin_f32_e64 v5, vcc_lo +// GFX1250: v_sin_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb5,0xd5,0x6a,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 -// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] +v_sin_f32_e64 v5, vcc_hi +// GFX1250: v_sin_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb5,0xd5,0x6b,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 -// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] +v_sin_f32_e64 v5, ttmp15 +// GFX1250: v_sin_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb5,0xd5,0x7b,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, 3 -// GFX1250: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] +v_sin_f32_e64 v5, m0 +// GFX1250: v_sin_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb5,0xd5,0x7d,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 -// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] +v_sin_f32_e64 v5, exec_lo +// GFX1250: v_sin_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb5,0xd5,0x7e,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 -// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] +v_sin_f32_e64 v5, exec_hi +// GFX1250: v_sin_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb5,0xd5,0x7f,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 -// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] +v_sin_f32_e64 v5, null +// GFX1250: v_sin_f32_e64 v5, null ; encoding: 
[0x05,0x00,0xb5,0xd5,0x7c,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, v3 -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sin_f32_e64 v5, -1 +// GFX1250: v_sin_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb5,0xd5,0xc1,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sin_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_sin_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb5,0xd5,0xf0,0x00,0x00,0x08] -v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sin_f32_e64 v5, src_scc mul:4 +// GFX1250: v_sin_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb5,0xd5,0xfd,0x00,0x00,0x10] -v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] -v_cvt_f32_fp8_e64 v1, v3 clamp ; encoding: [0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00] -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 clamp ; encoding: [0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sqrt_f16_e64 v5, v1 +// GFX1250: v_sqrt_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] -v_cvt_f32_fp8 v1, v3 byte_sel:1 clamp -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 clamp ; encoding: [0x01,0x90,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sqrt_f16_e64 v5, v255 +// GFX1250: v_sqrt_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] -v_cvt_f32_fp8 v1, v3 byte_sel:2 clamp -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sqrt_f16_e64 v5, s1 +// GFX1250: v_sqrt_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] -v_prng_b32_e64 v5, v1 -// GFX1250: v_prng_b32_e64 
v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] +v_sqrt_f16_e64 v5, s105 +// GFX1250: v_sqrt_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] -v_prng_b32_e64 v5, v255 -// GFX1250: v_prng_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xcb,0xd5,0xff,0x01,0x00,0x00] +v_sqrt_f16_e64 v5, vcc_lo +// GFX1250: v_sqrt_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] -v_prng_b32_e64 v5, s1 -// GFX1250: v_prng_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x00,0x00,0x00] +v_sqrt_f16_e64 v5, vcc_hi +// GFX1250: v_sqrt_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] -v_prng_b32_e64 v5, s105 -// GFX1250: v_prng_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xcb,0xd5,0x69,0x00,0x00,0x00] +v_sqrt_f16_e64 v5, ttmp15 +// GFX1250: v_sqrt_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] -v_prng_b32_e64 v5, vcc_lo -// GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00] +v_sqrt_f16_e64 v5, m0 +// GFX1250: v_sqrt_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] -v_prng_b32_e64 v5, vcc_hi -// GFX1250: v_prng_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x6b,0x00,0x00,0x00] +v_sqrt_f16_e64 v5, exec_lo +// GFX1250: v_sqrt_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] -v_prng_b32_e64 v5, ttmp15 -// GFX1250: v_prng_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xcb,0xd5,0x7b,0x00,0x00,0x00] +v_sqrt_f16_e64 v5, exec_hi +// GFX1250: v_sqrt_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] -v_prng_b32_e64 v5, m0 -// GFX1250: v_prng_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xcb,0xd5,0x7d,0x00,0x00,0x00] +v_sqrt_f16_e64 v5, null +// GFX1250: v_sqrt_f16_e64 v5, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] -v_prng_b32_e64 v5, exec_lo -// GFX1250: v_prng_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x7e,0x00,0x00,0x00] +v_sqrt_f16_e64 v5, -1 +// GFX1250: v_sqrt_f16_e64 
v5, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] -v_prng_b32_e64 v5, exec_hi -// GFX1250: v_prng_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x7f,0x00,0x00,0x00] +v_sqrt_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_sqrt_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] -v_prng_b32_e64 v5, null -// GFX1250: v_prng_b32_e64 v5, null ; encoding: [0x05,0x00,0xcb,0xd5,0x7c,0x00,0x00,0x00] +v_sqrt_f16_e64 v5, src_scc mul:4 +// GFX1250: v_sqrt_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] -v_prng_b32_e64 v5, -1 -// GFX1250: v_prng_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00] +v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_sqrt_f32_e64 v5, v1 +// GFX1250: v_sqrt_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb3,0xd5,0x01,0x01,0x00,0x00] + +v_sqrt_f32_e64 v5, v255 +// GFX1250: v_sqrt_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb3,0xd5,0xff,0x01,0x00,0x00] + +v_sqrt_f32_e64 v5, s1 +// GFX1250: v_sqrt_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xb3,0xd5,0x01,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, s105 +// GFX1250: v_sqrt_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb3,0xd5,0x69,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, vcc_lo +// GFX1250: v_sqrt_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb3,0xd5,0x6a,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, vcc_hi +// GFX1250: v_sqrt_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb3,0xd5,0x6b,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, ttmp15 +// GFX1250: v_sqrt_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb3,0xd5,0x7b,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, m0 +// GFX1250: v_sqrt_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb3,0xd5,0x7d,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, exec_lo +// GFX1250: v_sqrt_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb3,0xd5,0x7e,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, exec_hi +// GFX1250: v_sqrt_f32_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xb3,0xd5,0x7f,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, null +// GFX1250: v_sqrt_f32_e64 v5, null ; encoding: [0x05,0x00,0xb3,0xd5,0x7c,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, -1 +// GFX1250: v_sqrt_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb3,0xd5,0xc1,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_sqrt_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb3,0xd5,0xf0,0x00,0x00,0x08] + +v_sqrt_f32_e64 v5, src_scc mul:4 +// GFX1250: v_sqrt_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb3,0xd5,0xfd,0x00,0x00,0x10] + +v_sqrt_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_sqrt_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb3,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_sqrt_f64_e64 v[6:7], v[2:3] +// GFX1250: v_sqrt_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xb4,0xd5,0x02,0x01,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], v[254:255] +// GFX1250: v_sqrt_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xb4,0xd5,0xfe,0x01,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], s[2:3] +// GFX1250: v_sqrt_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xb4,0xd5,0x02,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], s[104:105] +// GFX1250: v_sqrt_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xb4,0xd5,0x68,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], vcc +// GFX1250: v_sqrt_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xb4,0xd5,0x6a,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_sqrt_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xb4,0xd5,0x7a,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], exec +// GFX1250: v_sqrt_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xb4,0xd5,0x7e,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], null +// GFX1250: v_sqrt_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xb4,0xd5,0x7c,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], -1 +// GFX1250: v_sqrt_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xb4,0xd5,0xc1,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_sqrt_f64_e64 v[6:7], 0.5 mul:2 ; encoding: 
[0x06,0x00,0xb4,0xd5,0xf0,0x00,0x00,0x08] + +v_sqrt_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_sqrt_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xb4,0xd5,0xfd,0x00,0x00,0x30] + +v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_trunc_f16_e64 v5, v1 +// GFX1250: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f16_e64 v5, v255 +// GFX1250: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] + +v_trunc_f16_e64 v5, s1 +// GFX1250: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, s105 +// GFX1250: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, vcc_lo +// GFX1250: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, vcc_hi +// GFX1250: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, ttmp15 +// GFX1250: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, m0 +// GFX1250: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, exec_lo +// GFX1250: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, exec_hi +// GFX1250: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, null +// GFX1250: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, -1 +// GFX1250: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] + +v_trunc_f16_e64 v5, src_scc mul:4 
+// GFX1250: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] + +v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_trunc_f32_e64 v5, v1 +// GFX1250: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f32_e64 v5, v255 +// GFX1250: v_trunc_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa1,0xd5,0xff,0x01,0x00,0x00] + +v_trunc_f32_e64 v5, s1 +// GFX1250: v_trunc_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, s105 +// GFX1250: v_trunc_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa1,0xd5,0x69,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, vcc_lo +// GFX1250: v_trunc_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd5,0x6a,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, vcc_hi +// GFX1250: v_trunc_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd5,0x6b,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, ttmp15 +// GFX1250: v_trunc_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa1,0xd5,0x7b,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, m0 +// GFX1250: v_trunc_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa1,0xd5,0x7d,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, exec_lo +// GFX1250: v_trunc_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa1,0xd5,0x7e,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, exec_hi +// GFX1250: v_trunc_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa1,0xd5,0x7f,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, null +// GFX1250: v_trunc_f32_e64 v5, null ; encoding: [0x05,0x00,0xa1,0xd5,0x7c,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, -1 +// GFX1250: v_trunc_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa1,0xd5,0xc1,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_trunc_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa1,0xd5,0xf0,0x00,0x00,0x08] + +v_trunc_f32_e64 v5, src_scc mul:4 +// GFX1250: v_trunc_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa1,0xd5,0xfd,0x00,0x00,0x10] + 
+v_trunc_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_trunc_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa1,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_trunc_f64_e64 v[6:7], v[2:3] +// GFX1250: v_trunc_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x97,0xd5,0x02,0x01,0x00,0x00] + +v_trunc_f64_e64 v[6:7], v[254:255] +// GFX1250: v_trunc_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x97,0xd5,0xfe,0x01,0x00,0x00] + +v_trunc_f64_e64 v[6:7], s[2:3] +// GFX1250: v_trunc_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x97,0xd5,0x02,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], s[104:105] +// GFX1250: v_trunc_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x97,0xd5,0x68,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], vcc +// GFX1250: v_trunc_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x97,0xd5,0x6a,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_trunc_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x97,0xd5,0x7a,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], exec +// GFX1250: v_trunc_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x97,0xd5,0x7e,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], null +// GFX1250: v_trunc_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x97,0xd5,0x7c,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], -1 +// GFX1250: v_trunc_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x97,0xd5,0xc1,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_trunc_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x97,0xd5,0xf0,0x00,0x00,0x08] + +v_trunc_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_trunc_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x97,0xd5,0xfd,0x00,0x00,0x30] + +v_trunc_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_trunc_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x97,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] v_tanh_f32_e64 v5, v1 // GFX1250: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00] @@ -253,6 +3745,87 @@ 
v_tanh_f16_e64 v5, src_scc mul:4 v_tanh_f16_e64 v255, -|0x8000| clamp div:2 // GFX1250: v_tanh_f16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0x9f,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_tanh_bf16_e64 v5, v1 +// GFX1250: v_tanh_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x01,0x00,0x00] + +v_tanh_bf16_e64 v5, v255 +// GFX1250: v_tanh_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xca,0xd5,0xff,0x01,0x00,0x00] + +v_tanh_bf16_e64 v5, s1 +// GFX1250: v_tanh_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, s105 +// GFX1250: v_tanh_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, vcc_lo +// GFX1250: v_tanh_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xca,0xd5,0x6a,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, vcc_hi +// GFX1250: v_tanh_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xca,0xd5,0x6b,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, ttmp15 +// GFX1250: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, m0 +// GFX1250: v_tanh_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xca,0xd5,0x7d,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, exec_lo +// GFX1250: v_tanh_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xca,0xd5,0x7e,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, exec_hi +// GFX1250: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, null +// GFX1250: v_tanh_bf16_e64 v5, null ; encoding: [0x05,0x00,0xca,0xd5,0x7c,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, -1 +// GFX1250: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] + +v_tanh_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] + +v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_tanh_bf16_e64 
v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +v_prng_b32_e64 v5, v1 +// GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] + +v_prng_b32_e64 v5, v255 +// GFX1250: v_prng_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xcb,0xd5,0xff,0x01,0x00,0x00] + +v_prng_b32_e64 v5, s1 +// GFX1250: v_prng_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x00,0x00,0x00] + +v_prng_b32_e64 v5, s105 +// GFX1250: v_prng_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xcb,0xd5,0x69,0x00,0x00,0x00] + +v_prng_b32_e64 v5, vcc_lo +// GFX1250: v_prng_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x6a,0x00,0x00,0x00] + +v_prng_b32_e64 v5, vcc_hi +// GFX1250: v_prng_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x6b,0x00,0x00,0x00] + +v_prng_b32_e64 v5, ttmp15 +// GFX1250: v_prng_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xcb,0xd5,0x7b,0x00,0x00,0x00] + +v_prng_b32_e64 v5, m0 +// GFX1250: v_prng_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xcb,0xd5,0x7d,0x00,0x00,0x00] + +v_prng_b32_e64 v5, exec_lo +// GFX1250: v_prng_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xcb,0xd5,0x7e,0x00,0x00,0x00] + +v_prng_b32_e64 v5, exec_hi +// GFX1250: v_prng_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xcb,0xd5,0x7f,0x00,0x00,0x00] + +v_prng_b32_e64 v5, null +// GFX1250: v_prng_b32_e64 v5, null ; encoding: [0x05,0x00,0xcb,0xd5,0x7c,0x00,0x00,0x00] + +v_prng_b32_e64 v5, -1 +// GFX1250: v_prng_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xcb,0xd5,0xc1,0x00,0x00,0x00] + v_rcp_bf16_e64 v5, v1 // GFX1250: v_rcp_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xf9,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s index 8f0c43de07077..8e73ecb4232e0 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1.s @@ -1,134 +1,3827 @@ // NOTE: Assertions have been autogenerated by 
utils/update_mc_test_checks.py UTC_ARGS: --version 5 // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s -v_tanh_bf16_e64 v5, v1 -// GFX1250: v_tanh_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x01,0x00,0x00] +v_bfrev_b32_e64 v5, v1 +// GFX1250: v_bfrev_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00] + +v_bfrev_b32_e64 v5, v255 +// GFX1250: v_bfrev_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xb8,0xd5,0xff,0x01,0x00,0x00] + +v_bfrev_b32_e64 v5, s1 +// GFX1250: v_bfrev_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, s105 +// GFX1250: v_bfrev_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xb8,0xd5,0x69,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, vcc_lo +// GFX1250: v_bfrev_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb8,0xd5,0x6a,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, vcc_hi +// GFX1250: v_bfrev_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb8,0xd5,0x6b,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, ttmp15 +// GFX1250: v_bfrev_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb8,0xd5,0x7b,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, m0 +// GFX1250: v_bfrev_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xb8,0xd5,0x7d,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, exec_lo +// GFX1250: v_bfrev_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb8,0xd5,0x7e,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, exec_hi +// GFX1250: v_bfrev_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb8,0xd5,0x7f,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, null +// GFX1250: v_bfrev_b32_e64 v5, null ; encoding: [0x05,0x00,0xb8,0xd5,0x7c,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, -1 +// GFX1250: v_bfrev_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xb8,0xd5,0xc1,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, 0.5 +// GFX1250: v_bfrev_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb8,0xd5,0xf0,0x00,0x00,0x00] + +v_bfrev_b32_e64 v5, src_scc +// GFX1250: v_bfrev_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb8,0xd5,0xfd,0x00,0x00,0x00] + +v_bfrev_b32_e64 
v255, 0xaf123456 +// GFX1250: v_bfrev_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb8,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_ceil_f16_e64 v5, v1 +// GFX1250: v_ceil_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00] + +v_ceil_f16_e64 v5, v255 +// GFX1250: v_ceil_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00] + +v_ceil_f16_e64 v5, s1 +// GFX1250: v_ceil_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, s105 +// GFX1250: v_ceil_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, vcc_lo +// GFX1250: v_ceil_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, vcc_hi +// GFX1250: v_ceil_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, ttmp15 +// GFX1250: v_ceil_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, m0 +// GFX1250: v_ceil_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, exec_lo +// GFX1250: v_ceil_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, exec_hi +// GFX1250: v_ceil_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, null +// GFX1250: v_ceil_f16_e64 v5, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, -1 +// GFX1250: v_ceil_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00] + +v_ceil_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_ceil_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08] + +v_ceil_f16_e64 v5, src_scc mul:4 +// GFX1250: v_ceil_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10] + +v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: 
[0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_ceil_f16 v5.l, v128.l +// GFX1250: v_ceil_f16_e64 v5.l, v128.l ; encoding: [0x05,0x00,0xdc,0xd5,0x80,0x01,0x00,0x00] + +v_ceil_f16 v5.h, v128.h +// GFX1250: v_ceil_f16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdc,0xd5,0x80,0x01,0x00,0x00] + +v_ceil_f32_e64 v5, v1 +// GFX1250: v_ceil_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa2,0xd5,0x01,0x01,0x00,0x00] + +v_ceil_f32_e64 v5, v255 +// GFX1250: v_ceil_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa2,0xd5,0xff,0x01,0x00,0x00] + +v_ceil_f32_e64 v5, s1 +// GFX1250: v_ceil_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa2,0xd5,0x01,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, s105 +// GFX1250: v_ceil_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa2,0xd5,0x69,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, vcc_lo +// GFX1250: v_ceil_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa2,0xd5,0x6a,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, vcc_hi +// GFX1250: v_ceil_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa2,0xd5,0x6b,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, ttmp15 +// GFX1250: v_ceil_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa2,0xd5,0x7b,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, m0 +// GFX1250: v_ceil_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa2,0xd5,0x7d,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, exec_lo +// GFX1250: v_ceil_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa2,0xd5,0x7e,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, exec_hi +// GFX1250: v_ceil_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa2,0xd5,0x7f,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, null +// GFX1250: v_ceil_f32_e64 v5, null ; encoding: [0x05,0x00,0xa2,0xd5,0x7c,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, -1 +// GFX1250: v_ceil_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa2,0xd5,0xc1,0x00,0x00,0x00] + +v_ceil_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_ceil_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa2,0xd5,0xf0,0x00,0x00,0x08] + +v_ceil_f32_e64 v5, src_scc mul:4 +// GFX1250: v_ceil_f32_e64 v5, src_scc mul:4 ; encoding: 
[0x05,0x00,0xa2,0xd5,0xfd,0x00,0x00,0x10] + +v_ceil_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_ceil_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa2,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_ceil_f64_e64 v[6:7], v[2:3] +// GFX1250: v_ceil_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x98,0xd5,0x02,0x01,0x00,0x00] + +v_ceil_f64_e64 v[6:7], v[254:255] +// GFX1250: v_ceil_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x98,0xd5,0xfe,0x01,0x00,0x00] + +v_ceil_f64_e64 v[6:7], s[2:3] +// GFX1250: v_ceil_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x98,0xd5,0x02,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], s[104:105] +// GFX1250: v_ceil_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x98,0xd5,0x68,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], vcc +// GFX1250: v_ceil_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x98,0xd5,0x6a,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_ceil_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x98,0xd5,0x7a,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], exec +// GFX1250: v_ceil_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x98,0xd5,0x7e,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], null +// GFX1250: v_ceil_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x98,0xd5,0x7c,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], -1 +// GFX1250: v_ceil_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x98,0xd5,0xc1,0x00,0x00,0x00] + +v_ceil_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_ceil_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x98,0xd5,0xf0,0x00,0x00,0x08] + +v_ceil_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_ceil_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x98,0xd5,0xfd,0x00,0x00,0x30] + +v_ceil_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_ceil_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x98,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cls_i32_e64 v5, v1 +// GFX1250: v_cls_i32_e64 v5, v1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x01,0x00,0x00] + 
+v_cls_i32_e64 v5, v255 +// GFX1250: v_cls_i32_e64 v5, v255 ; encoding: [0x05,0x00,0xbb,0xd5,0xff,0x01,0x00,0x00] + +v_cls_i32_e64 v5, s1 +// GFX1250: v_cls_i32_e64 v5, s1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x00,0x00,0x00] + +v_cls_i32_e64 v5, s105 +// GFX1250: v_cls_i32_e64 v5, s105 ; encoding: [0x05,0x00,0xbb,0xd5,0x69,0x00,0x00,0x00] + +v_cls_i32_e64 v5, vcc_lo +// GFX1250: v_cls_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x6a,0x00,0x00,0x00] + +v_cls_i32_e64 v5, vcc_hi +// GFX1250: v_cls_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x6b,0x00,0x00,0x00] + +v_cls_i32_e64 v5, ttmp15 +// GFX1250: v_cls_i32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xbb,0xd5,0x7b,0x00,0x00,0x00] + +v_cls_i32_e64 v5, m0 +// GFX1250: v_cls_i32_e64 v5, m0 ; encoding: [0x05,0x00,0xbb,0xd5,0x7d,0x00,0x00,0x00] + +v_cls_i32_e64 v5, exec_lo +// GFX1250: v_cls_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x7e,0x00,0x00,0x00] + +v_cls_i32_e64 v5, exec_hi +// GFX1250: v_cls_i32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x7f,0x00,0x00,0x00] + +v_cls_i32_e64 v5, null +// GFX1250: v_cls_i32_e64 v5, null ; encoding: [0x05,0x00,0xbb,0xd5,0x7c,0x00,0x00,0x00] + +v_cls_i32_e64 v5, -1 +// GFX1250: v_cls_i32_e64 v5, -1 ; encoding: [0x05,0x00,0xbb,0xd5,0xc1,0x00,0x00,0x00] + +v_cls_i32_e64 v5, 0.5 +// GFX1250: v_cls_i32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbb,0xd5,0xf0,0x00,0x00,0x00] + +v_cls_i32_e64 v5, src_scc +// GFX1250: v_cls_i32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbb,0xd5,0xfd,0x00,0x00,0x00] + +v_cls_i32_e64 v255, 0xaf123456 +// GFX1250: v_cls_i32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbb,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_clz_i32_u32_e64 v5, v1 +// GFX1250: v_clz_i32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x01,0x00,0x00] + +v_clz_i32_u32_e64 v5, v255 +// GFX1250: v_clz_i32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0xb9,0xd5,0xff,0x01,0x00,0x00] + +v_clz_i32_u32_e64 v5, s1 +// GFX1250: v_clz_i32_u32_e64 v5, s1 ; encoding: 
[0x05,0x00,0xb9,0xd5,0x01,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, s105 +// GFX1250: v_clz_i32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0xb9,0xd5,0x69,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, vcc_lo +// GFX1250: v_clz_i32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x6a,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, vcc_hi +// GFX1250: v_clz_i32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x6b,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, ttmp15 +// GFX1250: v_clz_i32_u32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb9,0xd5,0x7b,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, m0 +// GFX1250: v_clz_i32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0xb9,0xd5,0x7d,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, exec_lo +// GFX1250: v_clz_i32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x7e,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, exec_hi +// GFX1250: v_clz_i32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x7f,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, null +// GFX1250: v_clz_i32_u32_e64 v5, null ; encoding: [0x05,0x00,0xb9,0xd5,0x7c,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, -1 +// GFX1250: v_clz_i32_u32_e64 v5, -1 ; encoding: [0x05,0x00,0xb9,0xd5,0xc1,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, 0.5 +// GFX1250: v_clz_i32_u32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb9,0xd5,0xf0,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v5, src_scc +// GFX1250: v_clz_i32_u32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb9,0xd5,0xfd,0x00,0x00,0x00] + +v_clz_i32_u32_e64 v255, 0xaf123456 +// GFX1250: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_cos_f16_e64 v5, v1 +// GFX1250: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f16_e64 v5, v255 +// GFX1250: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] + +v_cos_f16_e64 v5, s1 +// GFX1250: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] + +v_cos_f16_e64 v5, s105 +// GFX1250: v_cos_f16_e64 v5, s105 ; encoding: 
[0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] + +v_cos_f16_e64 v5, vcc_lo +// GFX1250: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] + +v_cos_f16_e64 v5, vcc_hi +// GFX1250: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] + +v_cos_f16_e64 v5, ttmp15 +// GFX1250: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] + +v_cos_f16_e64 v5, m0 +// GFX1250: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] + +v_cos_f16_e64 v5, exec_lo +// GFX1250: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] + +v_cos_f16_e64 v5, exec_hi +// GFX1250: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] + +v_cos_f16_e64 v5, null +// GFX1250: v_cos_f16_e64 v5, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] + +v_cos_f16_e64 v5, -1 +// GFX1250: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] + +v_cos_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] + +v_cos_f16_e64 v5, src_scc mul:4 +// GFX1250: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] + +v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_cos_f16 v5.l, v128.l +// GFX1250: v_cos_f16_e64 v5.l, v128.l ; encoding: [0x05,0x00,0xe1,0xd5,0x80,0x01,0x00,0x00] + +v_cos_f16 v5.h, v128.h +// GFX1250: v_cos_f16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0x80,0x01,0x00,0x00] + +v_cos_f32_e64 v5, v1 +// GFX1250: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] + +v_cos_f32_e64 v5, v255 +// GFX1250: v_cos_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb6,0xd5,0xff,0x01,0x00,0x00] + +v_cos_f32_e64 v5, s1 +// GFX1250: v_cos_f32_e64 v5, s1 ; encoding: 
[0x05,0x00,0xb6,0xd5,0x01,0x00,0x00,0x00] + +v_cos_f32_e64 v5, s105 +// GFX1250: v_cos_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb6,0xd5,0x69,0x00,0x00,0x00] + +v_cos_f32_e64 v5, vcc_lo +// GFX1250: v_cos_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb6,0xd5,0x6a,0x00,0x00,0x00] + +v_cos_f32_e64 v5, vcc_hi +// GFX1250: v_cos_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb6,0xd5,0x6b,0x00,0x00,0x00] + +v_cos_f32_e64 v5, ttmp15 +// GFX1250: v_cos_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb6,0xd5,0x7b,0x00,0x00,0x00] + +v_cos_f32_e64 v5, m0 +// GFX1250: v_cos_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb6,0xd5,0x7d,0x00,0x00,0x00] + +v_cos_f32_e64 v5, exec_lo +// GFX1250: v_cos_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb6,0xd5,0x7e,0x00,0x00,0x00] + +v_cos_f32_e64 v5, exec_hi +// GFX1250: v_cos_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb6,0xd5,0x7f,0x00,0x00,0x00] + +v_cos_f32_e64 v5, null +// GFX1250: v_cos_f32_e64 v5, null ; encoding: [0x05,0x00,0xb6,0xd5,0x7c,0x00,0x00,0x00] + +v_cos_f32_e64 v5, -1 +// GFX1250: v_cos_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb6,0xd5,0xc1,0x00,0x00,0x00] + +v_cos_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_cos_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb6,0xd5,0xf0,0x00,0x00,0x08] + +v_cos_f32_e64 v5, src_scc mul:4 +// GFX1250: v_cos_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb6,0xd5,0xfd,0x00,0x00,0x10] + +v_cos_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_cos_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb6,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_ctz_i32_b32_e64 v5, v1 +// GFX1250: v_ctz_i32_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x01,0x00,0x00] + +v_ctz_i32_b32_e64 v5, v255 +// GFX1250: v_ctz_i32_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xba,0xd5,0xff,0x01,0x00,0x00] + +v_ctz_i32_b32_e64 v5, s1 +// GFX1250: v_ctz_i32_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, s105 +// GFX1250: v_ctz_i32_b32_e64 v5, s105 ; encoding: 
[0x05,0x00,0xba,0xd5,0x69,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, vcc_lo +// GFX1250: v_ctz_i32_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xba,0xd5,0x6a,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, vcc_hi +// GFX1250: v_ctz_i32_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xba,0xd5,0x6b,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, ttmp15 +// GFX1250: v_ctz_i32_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xba,0xd5,0x7b,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, m0 +// GFX1250: v_ctz_i32_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xba,0xd5,0x7d,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, exec_lo +// GFX1250: v_ctz_i32_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xba,0xd5,0x7e,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, exec_hi +// GFX1250: v_ctz_i32_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xba,0xd5,0x7f,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, null +// GFX1250: v_ctz_i32_b32_e64 v5, null ; encoding: [0x05,0x00,0xba,0xd5,0x7c,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, -1 +// GFX1250: v_ctz_i32_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xba,0xd5,0xc1,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, 0.5 +// GFX1250: v_ctz_i32_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xba,0xd5,0xf0,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v5, src_scc +// GFX1250: v_ctz_i32_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xba,0xd5,0xfd,0x00,0x00,0x00] + +v_ctz_i32_b32_e64 v255, 0xaf123456 +// GFX1250: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_cvt_f32_bf8_e64 v1, s3 +// GFX1250: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 +// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 +// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 +// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 ; encoding: 
[0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 +// GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 +// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 +// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 +// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 +// GFX1250: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 +// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 +// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 +// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 +// GFX1250: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 +// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 +// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 +// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 +// GFX1250: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 +// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 +// GFX1250: v_cvt_f32_fp8_e64 
v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 +// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 clamp ; encoding: [0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00] +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 clamp ; encoding: [0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8 v1, v3 byte_sel:1 clamp +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 clamp ; encoding: [0x01,0x90,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8 v1, v3 byte_sel:2 clamp +// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 
op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8 v[2:3], v128.h +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v128.h op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8 v[2:3], v128.l +// GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], v128.l ; encoding: [0x02,0x00,0xef,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8 v[2:3], v128.h +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v128.h op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8 v[2:3], v128.l +// GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], v128.l ; encoding: [0x02,0x00,0xee,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[4:5], s3 +// GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], s3 ; encoding: [0x04,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[4:5], s3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], s3 op_sel:[1,0] ; encoding: [0x04,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[4:5], 3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], 3 op_sel:[1,0] ; encoding: 
[0x04,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[4:5], v3 +// GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], v3 ; encoding: [0x04,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[4:5], v3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], v3 op_sel:[1,0] ; encoding: [0x04,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[4:5], s3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], s3 ; encoding: [0x04,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[4:5], 3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], 3 ; encoding: [0x04,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[4:5], 3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], 3 op_sel:[1,0] ; encoding: [0x04,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[4:5], v3 +// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], v3 ; encoding: [0x04,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[4:5], v3 op_sel:[1,0] +// GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], v3 op_sel:[1,0] ; encoding: [0x04,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f16_f32_e64 v5, v1 +// GFX1250: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f16_f32_e64 v5, v255 +// GFX1250: v_cvt_f16_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8a,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f16_f32_e64 v5, s1 +// GFX1250: v_cvt_f16_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, s105 +// GFX1250: v_cvt_f16_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8a,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_f16_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_f16_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_f16_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8a,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, m0 +// GFX1250: v_cvt_f16_f32_e64 v5, m0 ; 
encoding: [0x05,0x00,0x8a,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, exec_lo +// GFX1250: v_cvt_f16_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, exec_hi +// GFX1250: v_cvt_f16_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, null +// GFX1250: v_cvt_f16_f32_e64 v5, null ; encoding: [0x05,0x00,0x8a,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, -1 +// GFX1250: v_cvt_f16_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8a,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f16_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f16_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8a,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f16_f32_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f16_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8a,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f16_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_cvt_f16_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x8a,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_cvt_f16_f32 v128.l, v15 +// GFX1250: v_cvt_f16_f32_e64 v128.l, v15 ; encoding: [0x80,0x00,0x8a,0xd5,0x0f,0x01,0x00,0x00] + +v_cvt_f16_f32 v128.h, v15 +// GFX1250: v_cvt_f16_f32_e64 v128.h, v15 op_sel:[0,1] ; encoding: [0x80,0x40,0x8a,0xd5,0x0f,0x01,0x00,0x00] + +v_cvt_f16_i16_e64 v5, v1 +// GFX1250: v_cvt_f16_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f16_i16_e64 v5, v255 +// GFX1250: v_cvt_f16_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f16_i16_e64 v5, s1 +// GFX1250: v_cvt_f16_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, s105 +// GFX1250: v_cvt_f16_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xd1,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, vcc_lo +// GFX1250: v_cvt_f16_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, vcc_hi +// GFX1250: v_cvt_f16_i16_e64 v5, vcc_hi ; encoding: 
[0x05,0x00,0xd1,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, ttmp15 +// GFX1250: v_cvt_f16_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd1,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, m0 +// GFX1250: v_cvt_f16_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xd1,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, exec_lo +// GFX1250: v_cvt_f16_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, exec_hi +// GFX1250: v_cvt_f16_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, null +// GFX1250: v_cvt_f16_i16_e64 v5, null ; encoding: [0x05,0x00,0xd1,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, -1 +// GFX1250: v_cvt_f16_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f16_i16_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f16_i16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd1,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f16_i16_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f16_i16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd1,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f16_i16_e64 v255, 0xfe0b clamp div:2 +// GFX1250: v_cvt_f16_i16_e64 v255, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd1,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00] + +v_cvt_f16_i16 v128.l, v15.l +// GFX1250: v_cvt_f16_i16_e64 v128.l, v15.l ; encoding: [0x80,0x00,0xd1,0xd5,0x0f,0x01,0x00,0x00] + +v_cvt_f16_i16 v128.h, v15.h +// GFX1250: v_cvt_f16_i16_e64 v128.h, v15.h op_sel:[1,1] ; encoding: [0x80,0x48,0xd1,0xd5,0x0f,0x01,0x00,0x00] + +v_cvt_f16_u16_e64 v5, v1 +// GFX1250: v_cvt_f16_u16_e64 v5, v1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f16_u16_e64 v5, v255 +// GFX1250: v_cvt_f16_u16_e64 v5, v255 ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f16_u16_e64 v5, s1 +// GFX1250: v_cvt_f16_u16_e64 v5, s1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, s105 +// GFX1250: v_cvt_f16_u16_e64 v5, s105 ; encoding: 
[0x05,0x00,0xd0,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, vcc_lo +// GFX1250: v_cvt_f16_u16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, vcc_hi +// GFX1250: v_cvt_f16_u16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, ttmp15 +// GFX1250: v_cvt_f16_u16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd0,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, m0 +// GFX1250: v_cvt_f16_u16_e64 v5, m0 ; encoding: [0x05,0x00,0xd0,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, exec_lo +// GFX1250: v_cvt_f16_u16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, exec_hi +// GFX1250: v_cvt_f16_u16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, null +// GFX1250: v_cvt_f16_u16_e64 v5, null ; encoding: [0x05,0x00,0xd0,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, -1 +// GFX1250: v_cvt_f16_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f16_u16_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f16_u16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd0,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f16_u16_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f16_u16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd0,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f16_u16_e64 v255, 0xfe0b clamp div:2 +// GFX1250: v_cvt_f16_u16_e64 v255, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd0,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00] + +v_cvt_f16_u16 v128.l, v15.l +// GFX1250: v_cvt_f16_u16_e64 v128.l, v15.l ; encoding: [0x80,0x00,0xd0,0xd5,0x0f,0x01,0x00,0x00] + +v_cvt_f16_u16 v128.h, v15.h +// GFX1250: v_cvt_f16_u16_e64 v128.h, v15.h op_sel:[1,1] ; encoding: [0x80,0x48,0xd0,0xd5,0x0f,0x01,0x00,0x00] + +v_cvt_f32_f16_e64 v5, v1 +// GFX1250: v_cvt_f32_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_f16_e64 v5, v255 +// GFX1250: v_cvt_f32_f16_e64 v5, v255 ; encoding: 
[0x05,0x00,0x8b,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_f16_e64 v5, s1 +// GFX1250: v_cvt_f32_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, s105 +// GFX1250: v_cvt_f32_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x8b,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8b,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8b,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8b,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, m0 +// GFX1250: v_cvt_f32_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x8b,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, exec_lo +// GFX1250: v_cvt_f32_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8b,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, exec_hi +// GFX1250: v_cvt_f32_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8b,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, null +// GFX1250: v_cvt_f32_f16_e64 v5, null ; encoding: [0x05,0x00,0x8b,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, -1 +// GFX1250: v_cvt_f32_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x8b,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8b,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_f16_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8b,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_cvt_f32_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0x8b,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_cvt_f32_f16 v1, v128.l +// GFX1250: v_cvt_f32_f16_e64 v1, v128.l ; encoding: [0x01,0x00,0x8b,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_f32_f16 v1, v128.h +// GFX1250: v_cvt_f32_f16_e64 v1, v128.h op_sel:[1,0] ; encoding: 
[0x01,0x08,0x8b,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_f32_f64_e64 v5, v[2:3] +// GFX1250: v_cvt_f32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0x8f,0xd5,0x02,0x01,0x00,0x00] + +v_cvt_f32_f64_e64 v5, v[254:255] +// GFX1250: v_cvt_f32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x8f,0xd5,0xfe,0x01,0x00,0x00] + +v_cvt_f32_f64_e64 v5, s[2:3] +// GFX1250: v_cvt_f32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x8f,0xd5,0x02,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, s[104:105] +// GFX1250: v_cvt_f32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x8f,0xd5,0x68,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, vcc +// GFX1250: v_cvt_f32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x8f,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, ttmp[14:15] +// GFX1250: v_cvt_f32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x8f,0xd5,0x7a,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, exec +// GFX1250: v_cvt_f32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x8f,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, null +// GFX1250: v_cvt_f32_f64_e64 v5, null ; encoding: [0x05,0x00,0x8f,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, -1 +// GFX1250: v_cvt_f32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x8f,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_f64_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_f64_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8f,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_f64_e64 v5, -|src_scc| mul:4 +// GFX1250: v_cvt_f32_f64_e64 v5, -|src_scc| mul:4 ; encoding: [0x05,0x01,0x8f,0xd5,0xfd,0x00,0x00,0x30] + +v_cvt_f32_f64_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_f64_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x8f,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_i32_e64 v5, v1 +// GFX1250: v_cvt_f32_i32_e64 v5, v1 ; encoding: [0x05,0x00,0x85,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_i32_e64 v5, v255 +// GFX1250: v_cvt_f32_i32_e64 v5, v255 ; encoding: [0x05,0x00,0x85,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_i32_e64 v5, s1 +// GFX1250: v_cvt_f32_i32_e64 v5, s1 ; encoding: 
[0x05,0x00,0x85,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, s105 +// GFX1250: v_cvt_f32_i32_e64 v5, s105 ; encoding: [0x05,0x00,0x85,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x85,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x85,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_i32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x85,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, m0 +// GFX1250: v_cvt_f32_i32_e64 v5, m0 ; encoding: [0x05,0x00,0x85,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, exec_lo +// GFX1250: v_cvt_f32_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x85,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, exec_hi +// GFX1250: v_cvt_f32_i32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x85,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, null +// GFX1250: v_cvt_f32_i32_e64 v5, null ; encoding: [0x05,0x00,0x85,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, -1 +// GFX1250: v_cvt_f32_i32_e64 v5, -1 ; encoding: [0x05,0x00,0x85,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_i32_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_i32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x85,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_i32_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_i32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x85,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_i32_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_i32_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x85,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_u32_e64 v5, v1 +// GFX1250: v_cvt_f32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0x86,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_u32_e64 v5, v255 +// GFX1250: v_cvt_f32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0x86,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_u32_e64 v5, s1 +// GFX1250: v_cvt_f32_u32_e64 v5, s1 ; encoding: [0x05,0x00,0x86,0xd5,0x01,0x00,0x00,0x00] + 
+v_cvt_f32_u32_e64 v5, s105 +// GFX1250: v_cvt_f32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0x86,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x86,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x86,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_u32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x86,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, m0 +// GFX1250: v_cvt_f32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0x86,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, exec_lo +// GFX1250: v_cvt_f32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x86,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, exec_hi +// GFX1250: v_cvt_f32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x86,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, null +// GFX1250: v_cvt_f32_u32_e64 v5, null ; encoding: [0x05,0x00,0x86,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, -1 +// GFX1250: v_cvt_f32_u32_e64 v5, -1 ; encoding: [0x05,0x00,0x86,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_u32_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_u32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x86,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_u32_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_u32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x86,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_u32_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_u32_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x86,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte0_e64 v5, v1 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, v1 ; encoding: [0x05,0x00,0x91,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, v255 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, v255 ; encoding: [0x05,0x00,0x91,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, s1 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, s1 ; encoding: [0x05,0x00,0x91,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, 
s105 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, s105 ; encoding: [0x05,0x00,0x91,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_ubyte0_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x91,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_ubyte0_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x91,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x91,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, m0 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, m0 ; encoding: [0x05,0x00,0x91,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, exec_lo +// GFX1250: v_cvt_f32_ubyte0_e64 v5, exec_lo ; encoding: [0x05,0x00,0x91,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, exec_hi +// GFX1250: v_cvt_f32_ubyte0_e64 v5, exec_hi ; encoding: [0x05,0x00,0x91,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, null +// GFX1250: v_cvt_f32_ubyte0_e64 v5, null ; encoding: [0x05,0x00,0x91,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, -1 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, -1 ; encoding: [0x05,0x00,0x91,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_ubyte0_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x91,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_ubyte0_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_ubyte0_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x91,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_ubyte0_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_ubyte0_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x91,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte1_e64 v5, v1 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, v1 ; encoding: [0x05,0x00,0x92,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, v255 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, v255 ; encoding: [0x05,0x00,0x92,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, s1 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, s1 ; encoding: 
[0x05,0x00,0x92,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, s105 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, s105 ; encoding: [0x05,0x00,0x92,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_ubyte1_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x92,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_ubyte1_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x92,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x92,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, m0 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, m0 ; encoding: [0x05,0x00,0x92,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, exec_lo +// GFX1250: v_cvt_f32_ubyte1_e64 v5, exec_lo ; encoding: [0x05,0x00,0x92,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, exec_hi +// GFX1250: v_cvt_f32_ubyte1_e64 v5, exec_hi ; encoding: [0x05,0x00,0x92,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, null +// GFX1250: v_cvt_f32_ubyte1_e64 v5, null ; encoding: [0x05,0x00,0x92,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, -1 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, -1 ; encoding: [0x05,0x00,0x92,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_ubyte1_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x92,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_ubyte1_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_ubyte1_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x92,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_ubyte1_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_ubyte1_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x92,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte2_e64 v5, v1 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, v1 ; encoding: [0x05,0x00,0x93,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, v255 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, v255 ; encoding: [0x05,0x00,0x93,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, s1 +// 
GFX1250: v_cvt_f32_ubyte2_e64 v5, s1 ; encoding: [0x05,0x00,0x93,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, s105 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, s105 ; encoding: [0x05,0x00,0x93,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_ubyte2_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x93,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_ubyte2_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x93,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x93,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, m0 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, m0 ; encoding: [0x05,0x00,0x93,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, exec_lo +// GFX1250: v_cvt_f32_ubyte2_e64 v5, exec_lo ; encoding: [0x05,0x00,0x93,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, exec_hi +// GFX1250: v_cvt_f32_ubyte2_e64 v5, exec_hi ; encoding: [0x05,0x00,0x93,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, null +// GFX1250: v_cvt_f32_ubyte2_e64 v5, null ; encoding: [0x05,0x00,0x93,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, -1 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, -1 ; encoding: [0x05,0x00,0x93,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_ubyte2_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x93,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_ubyte2_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_ubyte2_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x93,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_ubyte2_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_ubyte2_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x93,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f32_ubyte3_e64 v5, v1 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, v1 ; encoding: [0x05,0x00,0x94,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, v255 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, v255 ; encoding: 
[0x05,0x00,0x94,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, s1 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, s1 ; encoding: [0x05,0x00,0x94,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, s105 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, s105 ; encoding: [0x05,0x00,0x94,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, vcc_lo +// GFX1250: v_cvt_f32_ubyte3_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x94,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, vcc_hi +// GFX1250: v_cvt_f32_ubyte3_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x94,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, ttmp15 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x94,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, m0 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, m0 ; encoding: [0x05,0x00,0x94,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, exec_lo +// GFX1250: v_cvt_f32_ubyte3_e64 v5, exec_lo ; encoding: [0x05,0x00,0x94,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, exec_hi +// GFX1250: v_cvt_f32_ubyte3_e64 v5, exec_hi ; encoding: [0x05,0x00,0x94,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, null +// GFX1250: v_cvt_f32_ubyte3_e64 v5, null ; encoding: [0x05,0x00,0x94,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, -1 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, -1 ; encoding: [0x05,0x00,0x94,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f32_ubyte3_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x94,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f32_ubyte3_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_f32_ubyte3_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x94,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f32_ubyte3_e64 v255, 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f32_ubyte3_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x94,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f64_f32_e64 v[6:7], v1 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], v1 ; encoding: [0x06,0x00,0x90,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], v255 +// 
GFX1250: v_cvt_f64_f32_e64 v[6:7], v255 ; encoding: [0x06,0x00,0x90,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], s1 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], s1 ; encoding: [0x06,0x00,0x90,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], s105 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], s105 ; encoding: [0x06,0x00,0x90,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], vcc_lo +// GFX1250: v_cvt_f64_f32_e64 v[6:7], vcc_lo ; encoding: [0x06,0x00,0x90,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], vcc_hi +// GFX1250: v_cvt_f64_f32_e64 v[6:7], vcc_hi ; encoding: [0x06,0x00,0x90,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], ttmp15 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], ttmp15 ; encoding: [0x06,0x00,0x90,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], m0 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], m0 ; encoding: [0x06,0x00,0x90,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], exec_lo +// GFX1250: v_cvt_f64_f32_e64 v[6:7], exec_lo ; encoding: [0x06,0x00,0x90,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], exec_hi +// GFX1250: v_cvt_f64_f32_e64 v[6:7], exec_hi ; encoding: [0x06,0x00,0x90,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], null +// GFX1250: v_cvt_f64_f32_e64 v[6:7], null ; encoding: [0x06,0x00,0x90,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], -1 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x90,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f64_f32_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x90,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f64_f32_e64 v[6:7], src_scc mul:4 +// GFX1250: v_cvt_f64_f32_e64 v[6:7], src_scc mul:4 ; encoding: [0x06,0x00,0x90,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f64_f32_e64 v[254:255], -|0xaf123456| clamp div:2 +// GFX1250: v_cvt_f64_f32_e64 v[254:255], -|0xaf123456| clamp div:2 ; encoding: [0xfe,0x81,0x90,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_cvt_f64_i32_e64 v[6:7], v1 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], v1 ; 
encoding: [0x06,0x00,0x84,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], v255 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], v255 ; encoding: [0x06,0x00,0x84,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], s1 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], s1 ; encoding: [0x06,0x00,0x84,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], s105 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], s105 ; encoding: [0x06,0x00,0x84,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], vcc_lo +// GFX1250: v_cvt_f64_i32_e64 v[6:7], vcc_lo ; encoding: [0x06,0x00,0x84,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], vcc_hi +// GFX1250: v_cvt_f64_i32_e64 v[6:7], vcc_hi ; encoding: [0x06,0x00,0x84,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], ttmp15 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], ttmp15 ; encoding: [0x06,0x00,0x84,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], m0 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], m0 ; encoding: [0x06,0x00,0x84,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], exec_lo +// GFX1250: v_cvt_f64_i32_e64 v[6:7], exec_lo ; encoding: [0x06,0x00,0x84,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], exec_hi +// GFX1250: v_cvt_f64_i32_e64 v[6:7], exec_hi ; encoding: [0x06,0x00,0x84,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], null +// GFX1250: v_cvt_f64_i32_e64 v[6:7], null ; encoding: [0x06,0x00,0x84,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], -1 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x84,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f64_i32_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x84,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f64_i32_e64 v[6:7], src_scc mul:4 +// GFX1250: v_cvt_f64_i32_e64 v[6:7], src_scc mul:4 ; encoding: [0x06,0x00,0x84,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f64_i32_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_cvt_f64_i32_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: 
[0xfe,0x80,0x84,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_f64_u32_e64 v[6:7], v1 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], v1 ; encoding: [0x06,0x00,0x96,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], v255 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], v255 ; encoding: [0x06,0x00,0x96,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], s1 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], s1 ; encoding: [0x06,0x00,0x96,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], s105 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], s105 ; encoding: [0x06,0x00,0x96,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], vcc_lo +// GFX1250: v_cvt_f64_u32_e64 v[6:7], vcc_lo ; encoding: [0x06,0x00,0x96,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], vcc_hi +// GFX1250: v_cvt_f64_u32_e64 v[6:7], vcc_hi ; encoding: [0x06,0x00,0x96,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], ttmp15 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], ttmp15 ; encoding: [0x06,0x00,0x96,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], m0 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], m0 ; encoding: [0x06,0x00,0x96,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], exec_lo +// GFX1250: v_cvt_f64_u32_e64 v[6:7], exec_lo ; encoding: [0x06,0x00,0x96,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], exec_hi +// GFX1250: v_cvt_f64_u32_e64 v[6:7], exec_hi ; encoding: [0x06,0x00,0x96,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], null +// GFX1250: v_cvt_f64_u32_e64 v[6:7], null ; encoding: [0x06,0x00,0x96,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], -1 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x96,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_f64_u32_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x96,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_f64_u32_e64 v[6:7], src_scc mul:4 +// GFX1250: v_cvt_f64_u32_e64 v[6:7], src_scc mul:4 ; encoding: [0x06,0x00,0x96,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_f64_u32_e64 v[254:255], 0xaf123456 clamp 
div:2 +// GFX1250: v_cvt_f64_u32_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x96,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_cvt_floor_i32_f32_e64 v5, v1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, v255 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8d,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, s1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, s105 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8d,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_floor_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_floor_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8d,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, m0 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8d,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_floor_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_floor_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, null +// GFX1250: v_cvt_floor_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8d,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, -1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8d,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8d,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v5, src_scc +// GFX1250: v_cvt_floor_i32_f32_e64 v5, 
src_scc ; encoding: [0x05,0x00,0x8d,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_floor_i32_f32_e64 v255, -|0xaf123456| +// GFX1250: v_cvt_floor_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8d,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_flr_i32_f32_e64 v5, v1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, v255 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8d,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, s1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, s105 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8d,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_floor_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_floor_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8d,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, m0 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8d,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_floor_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_floor_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, null +// GFX1250: v_cvt_floor_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8d,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, -1 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8d,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_floor_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8d,0xd5,0xf0,0x00,0x00,0x00] + 
+v_cvt_flr_i32_f32_e64 v5, src_scc +// GFX1250: v_cvt_floor_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8d,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_flr_i32_f32_e64 v255, -|0xaf123456| +// GFX1250: v_cvt_floor_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8d,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_i16_f16_e64 v5, v1 +// GFX1250: v_cvt_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_i16_f16_e64 v5, v255 +// GFX1250: v_cvt_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd3,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_i16_f16_e64 v5, s1 +// GFX1250: v_cvt_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, s105 +// GFX1250: v_cvt_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd3,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, vcc_lo +// GFX1250: v_cvt_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, vcc_hi +// GFX1250: v_cvt_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, ttmp15 +// GFX1250: v_cvt_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd3,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, m0 +// GFX1250: v_cvt_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd3,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, exec_lo +// GFX1250: v_cvt_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, exec_hi +// GFX1250: v_cvt_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, null +// GFX1250: v_cvt_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xd3,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, -1 +// GFX1250: v_cvt_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd3,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, 0.5 +// GFX1250: v_cvt_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xd3,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v5, src_scc +// GFX1250: v_cvt_i16_f16_e64 v5, 
src_scc ; encoding: [0x05,0x00,0xd3,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_i16_f16_e64 v255, -|0xfe0b| clamp +// GFX1250: v_cvt_i16_f16_e64 v255, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +v_cvt_i16_f16 v1.l, v128.l +// GFX1250: v_cvt_i16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xd3,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_i16_f16 v1.h, v128.h +// GFX1250: v_cvt_i16_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd3,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_i32_f32_e64 v5, v1 +// GFX1250: v_cvt_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x88,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_i32_f32_e64 v5, v255 +// GFX1250: v_cvt_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x88,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_i32_f32_e64 v5, s1 +// GFX1250: v_cvt_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x88,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, s105 +// GFX1250: v_cvt_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x88,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x88,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x88,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x88,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, m0 +// GFX1250: v_cvt_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x88,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x88,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x88,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, null +// GFX1250: v_cvt_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x88,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, -1 +// GFX1250: v_cvt_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x88,0xd5,0xc1,0x00,0x00,0x00] + 
+v_cvt_i32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x88,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v5, src_scc +// GFX1250: v_cvt_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x88,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_i32_f32_e64 v255, -|0xaf123456| clamp +// GFX1250: v_cvt_i32_f32_e64 v255, -|0xaf123456| clamp ; encoding: [0xff,0x81,0x88,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_i32_f64_e64 v5, v[2:3] +// GFX1250: v_cvt_i32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0x83,0xd5,0x02,0x01,0x00,0x00] + +v_cvt_i32_f64_e64 v5, v[254:255] +// GFX1250: v_cvt_i32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x83,0xd5,0xfe,0x01,0x00,0x00] + +v_cvt_i32_f64_e64 v5, s[2:3] +// GFX1250: v_cvt_i32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x83,0xd5,0x02,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, s[104:105] +// GFX1250: v_cvt_i32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x83,0xd5,0x68,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, vcc +// GFX1250: v_cvt_i32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x83,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, ttmp[14:15] +// GFX1250: v_cvt_i32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x83,0xd5,0x7a,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, exec +// GFX1250: v_cvt_i32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x83,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, null +// GFX1250: v_cvt_i32_f64_e64 v5, null ; encoding: [0x05,0x00,0x83,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, -1 +// GFX1250: v_cvt_i32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x83,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, 0.5 +// GFX1250: v_cvt_i32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0x83,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_i32_f64_e64 v5, -|src_scc| +// GFX1250: v_cvt_i32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0x83,0xd5,0xfd,0x00,0x00,0x20] + +v_cvt_i32_f64_e64 v255, 0xaf123456 clamp +// GFX1250: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: 
[0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_cvt_i32_i16_e64 v5, v1 +// GFX1250: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_i32_i16_e64 v5, v255 +// GFX1250: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_i32_i16_e64 v5, s1 +// GFX1250: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, s105 +// GFX1250: v_cvt_i32_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xea,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, vcc_lo +// GFX1250: v_cvt_i32_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xea,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, vcc_hi +// GFX1250: v_cvt_i32_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xea,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, ttmp15 +// GFX1250: v_cvt_i32_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xea,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, m0 +// GFX1250: v_cvt_i32_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xea,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, exec_lo +// GFX1250: v_cvt_i32_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xea,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, exec_hi +// GFX1250: v_cvt_i32_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xea,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, null +// GFX1250: v_cvt_i32_i16_e64 v5, null ; encoding: [0x05,0x00,0xea,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, -1 +// GFX1250: v_cvt_i32_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xea,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, 0.5 +// GFX1250: v_cvt_i32_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xea,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v5, src_scc +// GFX1250: v_cvt_i32_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xea,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_i32_i16_e64 v255, 0xfe0b +// GFX1250: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_cvt_i32_i16 v1, v128.l +// 
GFX1250: v_cvt_i32_i16_e64 v1, v128.l ; encoding: [0x01,0x00,0xea,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_i32_i16 v1, v128.h +// GFX1250: v_cvt_i32_i16_e64 v1, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xea,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, v1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, v255 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8c,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, s1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, s105 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8c,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8c,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, m0 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8c,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, null +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8c,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, -1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8c,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, 
0.5 ; encoding: [0x05,0x00,0x8c,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v5, src_scc +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8c,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| +// GFX1250: v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8c,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_norm_i16_f16_e64 v5, v1 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, v255 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe3,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, s1 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, s105 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe3,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, vcc_lo +// GFX1250: v_cvt_norm_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, vcc_hi +// GFX1250: v_cvt_norm_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, ttmp15 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe3,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, m0 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe3,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, exec_lo +// GFX1250: v_cvt_norm_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, exec_hi +// GFX1250: v_cvt_norm_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, null +// GFX1250: v_cvt_norm_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xe3,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, -1 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, -1 ; encoding: 
[0x05,0x00,0xe3,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, 0.5 +// GFX1250: v_cvt_norm_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe3,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v5, src_scc +// GFX1250: v_cvt_norm_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe3,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_norm_i16_f16_e64 v255, -|0xfe0b| +// GFX1250: v_cvt_norm_i16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xe3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +v_cvt_norm_i16_f16 v1.l, v128.l +// GFX1250: v_cvt_norm_i16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xe3,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_norm_i16_f16 v1.l, v128.h +// GFX1250: v_cvt_norm_i16_f16_e64 v1.l, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xe3,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, v1 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, v255 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe4,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, s1 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, s105 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe4,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, vcc_lo +// GFX1250: v_cvt_norm_u16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, vcc_hi +// GFX1250: v_cvt_norm_u16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, ttmp15 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe4,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, m0 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe4,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, exec_lo +// GFX1250: v_cvt_norm_u16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x7e,0x00,0x00,0x00] + 
+v_cvt_norm_u16_f16_e64 v5, exec_hi +// GFX1250: v_cvt_norm_u16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, null +// GFX1250: v_cvt_norm_u16_f16_e64 v5, null ; encoding: [0x05,0x00,0xe4,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, -1 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe4,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, 0.5 +// GFX1250: v_cvt_norm_u16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe4,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v5, src_scc +// GFX1250: v_cvt_norm_u16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe4,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_norm_u16_f16_e64 v255, -|0xfe0b| +// GFX1250: v_cvt_norm_u16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xe4,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +v_cvt_norm_u16_f16 v1.l, v128.l +// GFX1250: v_cvt_norm_u16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xe4,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_norm_u16_f16 v1.l, v128.h +// GFX1250: v_cvt_norm_u16_f16_e64 v1.l, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xe4,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, v1 +// GFX1250: v_cvt_off_f32_i4_e64 v5, v1 ; encoding: [0x05,0x00,0x8e,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, v255 +// GFX1250: v_cvt_off_f32_i4_e64 v5, v255 ; encoding: [0x05,0x00,0x8e,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, s1 +// GFX1250: v_cvt_off_f32_i4_e64 v5, s1 ; encoding: [0x05,0x00,0x8e,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, s105 +// GFX1250: v_cvt_off_f32_i4_e64 v5, s105 ; encoding: [0x05,0x00,0x8e,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, vcc_lo +// GFX1250: v_cvt_off_f32_i4_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8e,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, vcc_hi +// GFX1250: v_cvt_off_f32_i4_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8e,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, ttmp15 +// GFX1250: v_cvt_off_f32_i4_e64 v5, ttmp15 ; 
encoding: [0x05,0x00,0x8e,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, m0 +// GFX1250: v_cvt_off_f32_i4_e64 v5, m0 ; encoding: [0x05,0x00,0x8e,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, exec_lo +// GFX1250: v_cvt_off_f32_i4_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8e,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, exec_hi +// GFX1250: v_cvt_off_f32_i4_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8e,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, null +// GFX1250: v_cvt_off_f32_i4_e64 v5, null ; encoding: [0x05,0x00,0x8e,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, -1 +// GFX1250: v_cvt_off_f32_i4_e64 v5, -1 ; encoding: [0x05,0x00,0x8e,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_off_f32_i4_e64 v5, 0.5 mul:2 +// GFX1250: v_cvt_off_f32_i4_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8e,0xd5,0xf0,0x00,0x00,0x08] + +v_cvt_off_f32_i4_e64 v5, src_scc mul:4 +// GFX1250: v_cvt_off_f32_i4_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8e,0xd5,0xfd,0x00,0x00,0x10] + +v_cvt_off_f32_i4_e64 v255, 0x4f clamp div:2 +// GFX1250: v_cvt_off_f32_i4_e64 v255, 0x4f clamp div:2 ; encoding: [0xff,0x80,0x8e,0xd5,0xff,0x00,0x00,0x18,0x4f,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, v1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, v255 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8c,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, s1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, s105 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8c,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, vcc_lo +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x6b,0x00,0x00,0x00] + 
+v_cvt_rpi_i32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8c,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, m0 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8c,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, null +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8c,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, -1 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8c,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8c,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v5, src_scc +// GFX1250: v_cvt_nearest_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8c,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_rpi_i32_f32_e64 v255, -|0xaf123456| +// GFX1250: v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8c,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_u16_f16_e64 v5, v1 +// GFX1250: v_cvt_u16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_u16_f16_e64 v5, v255 +// GFX1250: v_cvt_u16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd2,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_u16_f16_e64 v5, s1 +// GFX1250: v_cvt_u16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, s105 +// GFX1250: v_cvt_u16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd2,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, vcc_lo +// GFX1250: v_cvt_u16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, vcc_hi +// GFX1250: v_cvt_u16_f16_e64 v5, vcc_hi ; encoding: 
[0x05,0x00,0xd2,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, ttmp15 +// GFX1250: v_cvt_u16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd2,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, m0 +// GFX1250: v_cvt_u16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd2,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, exec_lo +// GFX1250: v_cvt_u16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, exec_hi +// GFX1250: v_cvt_u16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, null +// GFX1250: v_cvt_u16_f16_e64 v5, null ; encoding: [0x05,0x00,0xd2,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, -1 +// GFX1250: v_cvt_u16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd2,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, 0.5 +// GFX1250: v_cvt_u16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xd2,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v5, src_scc +// GFX1250: v_cvt_u16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xd2,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_u16_f16_e64 v255, -|0xfe0b| clamp +// GFX1250: v_cvt_u16_f16_e64 v255, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd2,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +v_cvt_u16_f16 v1.l, v128.l +// GFX1250: v_cvt_u16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xd2,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_u16_f16 v1.l, v128.h +// GFX1250: v_cvt_u16_f16_e64 v1.l, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xd2,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_u32_f32_e64 v5, v1 +// GFX1250: v_cvt_u32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x87,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_u32_f32_e64 v5, v255 +// GFX1250: v_cvt_u32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x87,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_u32_f32_e64 v5, s1 +// GFX1250: v_cvt_u32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x87,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, s105 +// GFX1250: v_cvt_u32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x87,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, 
vcc_lo +// GFX1250: v_cvt_u32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x87,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, vcc_hi +// GFX1250: v_cvt_u32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x87,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, ttmp15 +// GFX1250: v_cvt_u32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x87,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, m0 +// GFX1250: v_cvt_u32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x87,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, exec_lo +// GFX1250: v_cvt_u32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x87,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, exec_hi +// GFX1250: v_cvt_u32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x87,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, null +// GFX1250: v_cvt_u32_f32_e64 v5, null ; encoding: [0x05,0x00,0x87,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, -1 +// GFX1250: v_cvt_u32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x87,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, 0.5 +// GFX1250: v_cvt_u32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x87,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v5, src_scc +// GFX1250: v_cvt_u32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x87,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_u32_f32_e64 v255, -|0xaf123456| clamp +// GFX1250: v_cvt_u32_f32_e64 v255, -|0xaf123456| clamp ; encoding: [0xff,0x81,0x87,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_cvt_u32_f64_e64 v5, v[2:3] +// GFX1250: v_cvt_u32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0x95,0xd5,0x02,0x01,0x00,0x00] + +v_cvt_u32_f64_e64 v5, v[254:255] +// GFX1250: v_cvt_u32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x95,0xd5,0xfe,0x01,0x00,0x00] + +v_cvt_u32_f64_e64 v5, s[2:3] +// GFX1250: v_cvt_u32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x95,0xd5,0x02,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, s[104:105] +// GFX1250: v_cvt_u32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x95,0xd5,0x68,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, vcc +// GFX1250: 
v_cvt_u32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x95,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, ttmp[14:15] +// GFX1250: v_cvt_u32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x95,0xd5,0x7a,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, exec +// GFX1250: v_cvt_u32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x95,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, null +// GFX1250: v_cvt_u32_f64_e64 v5, null ; encoding: [0x05,0x00,0x95,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, -1 +// GFX1250: v_cvt_u32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x95,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, 0.5 +// GFX1250: v_cvt_u32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0x95,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_u32_f64_e64 v5, -|src_scc| +// GFX1250: v_cvt_u32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0x95,0xd5,0xfd,0x00,0x00,0x20] + +v_cvt_u32_f64_e64 v255, 0xaf123456 clamp +// GFX1250: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_cvt_u32_u16_e64 v5, v1 +// GFX1250: v_cvt_u32_u16_e64 v5, v1 ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00] + +v_cvt_u32_u16_e64 v5, v255 +// GFX1250: v_cvt_u32_u16_e64 v5, v255 ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00] + +v_cvt_u32_u16_e64 v5, s1 +// GFX1250: v_cvt_u32_u16_e64 v5, s1 ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, s105 +// GFX1250: v_cvt_u32_u16_e64 v5, s105 ; encoding: [0x05,0x00,0xeb,0xd5,0x69,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, vcc_lo +// GFX1250: v_cvt_u32_u16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xeb,0xd5,0x6a,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, vcc_hi +// GFX1250: v_cvt_u32_u16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xeb,0xd5,0x6b,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, ttmp15 +// GFX1250: v_cvt_u32_u16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xeb,0xd5,0x7b,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, m0 +// GFX1250: v_cvt_u32_u16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xeb,0xd5,0x7d,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, exec_lo +// GFX1250: v_cvt_u32_u16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xeb,0xd5,0x7e,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, exec_hi +// GFX1250: v_cvt_u32_u16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xeb,0xd5,0x7f,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, null +// GFX1250: v_cvt_u32_u16_e64 v5, null ; encoding: [0x05,0x00,0xeb,0xd5,0x7c,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, -1 +// GFX1250: v_cvt_u32_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xeb,0xd5,0xc1,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, 0.5 +// GFX1250: v_cvt_u32_u16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xeb,0xd5,0xf0,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v5, src_scc +// GFX1250: v_cvt_u32_u16_e64 v5, src_scc ; encoding: [0x05,0x00,0xeb,0xd5,0xfd,0x00,0x00,0x00] + +v_cvt_u32_u16_e64 v255, 0xfe0b +// GFX1250: v_cvt_u32_u16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_cvt_u32_u16 v1, v128.l +// GFX1250: v_cvt_u32_u16_e64 v1, v128.l ; encoding: [0x01,0x00,0xeb,0xd5,0x80,0x01,0x00,0x00] + +v_cvt_u32_u16 v1, v128.h +// GFX1250: v_cvt_u32_u16_e64 v1, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xeb,0xd5,0x80,0x01,0x00,0x00] + +v_exp_f16_e64 v5, v1 +// GFX1250: v_exp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] + +v_exp_f16_e64 v5, v255 +// GFX1250: v_exp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] + +v_exp_f16_e64 v5, s1 +// GFX1250: v_exp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] + +v_exp_f16_e64 v5, s105 +// GFX1250: v_exp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] + +v_exp_f16_e64 v5, vcc_lo +// GFX1250: v_exp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] + +v_exp_f16_e64 v5, vcc_hi +// GFX1250: v_exp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] + +v_exp_f16_e64 v5, ttmp15 +// GFX1250: v_exp_f16_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] + +v_exp_f16_e64 v5, m0 +// GFX1250: v_exp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] + +v_exp_f16_e64 v5, exec_lo +// GFX1250: v_exp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] + +v_exp_f16_e64 v5, exec_hi +// GFX1250: v_exp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] + +v_exp_f16_e64 v5, null +// GFX1250: v_exp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] + +v_exp_f16_e64 v5, -1 +// GFX1250: v_exp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] + +v_exp_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_exp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] + +v_exp_f16_e64 v5, src_scc mul:4 +// GFX1250: v_exp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] + +v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_exp_f16 v1.h, v128.l +// GFX1250: v_exp_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd8,0xd5,0x80,0x01,0x00,0x00] + +v_exp_f16 v1.h, v128.h +// GFX1250: v_exp_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd8,0xd5,0x80,0x01,0x00,0x00] + +v_exp_f32_e64 v5, v1 +// GFX1250: v_exp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa5,0xd5,0x01,0x01,0x00,0x00] + +v_exp_f32_e64 v5, v255 +// GFX1250: v_exp_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa5,0xd5,0xff,0x01,0x00,0x00] + +v_exp_f32_e64 v5, s1 +// GFX1250: v_exp_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa5,0xd5,0x01,0x00,0x00,0x00] + +v_exp_f32_e64 v5, s105 +// GFX1250: v_exp_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa5,0xd5,0x69,0x00,0x00,0x00] + +v_exp_f32_e64 v5, vcc_lo +// GFX1250: v_exp_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa5,0xd5,0x6a,0x00,0x00,0x00] + +v_exp_f32_e64 v5, vcc_hi +// GFX1250: v_exp_f32_e64 v5, vcc_hi ; encoding: 
[0x05,0x00,0xa5,0xd5,0x6b,0x00,0x00,0x00] + +v_exp_f32_e64 v5, ttmp15 +// GFX1250: v_exp_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa5,0xd5,0x7b,0x00,0x00,0x00] + +v_exp_f32_e64 v5, m0 +// GFX1250: v_exp_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa5,0xd5,0x7d,0x00,0x00,0x00] + +v_exp_f32_e64 v5, exec_lo +// GFX1250: v_exp_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa5,0xd5,0x7e,0x00,0x00,0x00] + +v_exp_f32_e64 v5, exec_hi +// GFX1250: v_exp_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa5,0xd5,0x7f,0x00,0x00,0x00] + +v_exp_f32_e64 v5, null +// GFX1250: v_exp_f32_e64 v5, null ; encoding: [0x05,0x00,0xa5,0xd5,0x7c,0x00,0x00,0x00] + +v_exp_f32_e64 v5, -1 +// GFX1250: v_exp_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa5,0xd5,0xc1,0x00,0x00,0x00] + +v_exp_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_exp_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa5,0xd5,0xf0,0x00,0x00,0x08] + +v_exp_f32_e64 v5, src_scc mul:4 +// GFX1250: v_exp_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa5,0xd5,0xfd,0x00,0x00,0x10] + +v_exp_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_exp_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_ffbh_i32_e64 v5, v1 +// GFX1250: v_cls_i32_e64 v5, v1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x01,0x00,0x00] + +v_ffbh_i32_e64 v5, v255 +// GFX1250: v_cls_i32_e64 v5, v255 ; encoding: [0x05,0x00,0xbb,0xd5,0xff,0x01,0x00,0x00] + +v_ffbh_i32_e64 v5, s1 +// GFX1250: v_cls_i32_e64 v5, s1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, s105 +// GFX1250: v_cls_i32_e64 v5, s105 ; encoding: [0x05,0x00,0xbb,0xd5,0x69,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, vcc_lo +// GFX1250: v_cls_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x6a,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, vcc_hi +// GFX1250: v_cls_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x6b,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, ttmp15 +// GFX1250: v_cls_i32_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xbb,0xd5,0x7b,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, m0 +// GFX1250: v_cls_i32_e64 v5, m0 ; encoding: [0x05,0x00,0xbb,0xd5,0x7d,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, exec_lo +// GFX1250: v_cls_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x7e,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, exec_hi +// GFX1250: v_cls_i32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x7f,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, null +// GFX1250: v_cls_i32_e64 v5, null ; encoding: [0x05,0x00,0xbb,0xd5,0x7c,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, -1 +// GFX1250: v_cls_i32_e64 v5, -1 ; encoding: [0x05,0x00,0xbb,0xd5,0xc1,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, 0.5 +// GFX1250: v_cls_i32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbb,0xd5,0xf0,0x00,0x00,0x00] + +v_ffbh_i32_e64 v5, src_scc +// GFX1250: v_cls_i32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbb,0xd5,0xfd,0x00,0x00,0x00] + +v_ffbh_i32_e64 v255, 0xaf123456 +// GFX1250: v_cls_i32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbb,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_ffbh_u32_e64 v5, v1 +// GFX1250: v_clz_i32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x01,0x00,0x00] + +v_ffbh_u32_e64 v5, v255 +// GFX1250: v_clz_i32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0xb9,0xd5,0xff,0x01,0x00,0x00] + +v_ffbh_u32_e64 v5, s1 +// GFX1250: v_clz_i32_u32_e64 v5, s1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, s105 +// GFX1250: v_clz_i32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0xb9,0xd5,0x69,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, vcc_lo +// GFX1250: v_clz_i32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x6a,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, vcc_hi +// GFX1250: v_clz_i32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x6b,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, ttmp15 +// GFX1250: v_clz_i32_u32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb9,0xd5,0x7b,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, m0 +// GFX1250: v_clz_i32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0xb9,0xd5,0x7d,0x00,0x00,0x00] + 
+v_ffbh_u32_e64 v5, exec_lo +// GFX1250: v_clz_i32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x7e,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, exec_hi +// GFX1250: v_clz_i32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x7f,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, null +// GFX1250: v_clz_i32_u32_e64 v5, null ; encoding: [0x05,0x00,0xb9,0xd5,0x7c,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, -1 +// GFX1250: v_clz_i32_u32_e64 v5, -1 ; encoding: [0x05,0x00,0xb9,0xd5,0xc1,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, 0.5 +// GFX1250: v_clz_i32_u32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb9,0xd5,0xf0,0x00,0x00,0x00] + +v_ffbh_u32_e64 v5, src_scc +// GFX1250: v_clz_i32_u32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb9,0xd5,0xfd,0x00,0x00,0x00] + +v_ffbh_u32_e64 v255, 0xaf123456 +// GFX1250: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_ffbl_b32_e64 v5, v1 +// GFX1250: v_ctz_i32_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x01,0x00,0x00] + +v_ffbl_b32_e64 v5, v255 +// GFX1250: v_ctz_i32_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xba,0xd5,0xff,0x01,0x00,0x00] + +v_ffbl_b32_e64 v5, s1 +// GFX1250: v_ctz_i32_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, s105 +// GFX1250: v_ctz_i32_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xba,0xd5,0x69,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, vcc_lo +// GFX1250: v_ctz_i32_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xba,0xd5,0x6a,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, vcc_hi +// GFX1250: v_ctz_i32_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xba,0xd5,0x6b,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, ttmp15 +// GFX1250: v_ctz_i32_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xba,0xd5,0x7b,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, m0 +// GFX1250: v_ctz_i32_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xba,0xd5,0x7d,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, exec_lo +// GFX1250: v_ctz_i32_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xba,0xd5,0x7e,0x00,0x00,0x00] + 
+v_ffbl_b32_e64 v5, exec_hi +// GFX1250: v_ctz_i32_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xba,0xd5,0x7f,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, null +// GFX1250: v_ctz_i32_b32_e64 v5, null ; encoding: [0x05,0x00,0xba,0xd5,0x7c,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, -1 +// GFX1250: v_ctz_i32_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xba,0xd5,0xc1,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, 0.5 +// GFX1250: v_ctz_i32_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xba,0xd5,0xf0,0x00,0x00,0x00] + +v_ffbl_b32_e64 v5, src_scc +// GFX1250: v_ctz_i32_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xba,0xd5,0xfd,0x00,0x00,0x00] + +v_ffbl_b32_e64 v255, 0xaf123456 +// GFX1250: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_floor_f16_e64 v5, v1 +// GFX1250: v_floor_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] + +v_floor_f16_e64 v5, v255 +// GFX1250: v_floor_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] + +v_floor_f16_e64 v5, s1 +// GFX1250: v_floor_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] + +v_floor_f16_e64 v5, s105 +// GFX1250: v_floor_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] + +v_floor_f16_e64 v5, vcc_lo +// GFX1250: v_floor_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] + +v_floor_f16_e64 v5, vcc_hi +// GFX1250: v_floor_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] + +v_floor_f16_e64 v5, ttmp15 +// GFX1250: v_floor_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] + +v_floor_f16_e64 v5, m0 +// GFX1250: v_floor_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] + +v_floor_f16_e64 v5, exec_lo +// GFX1250: v_floor_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] + +v_floor_f16_e64 v5, exec_hi +// GFX1250: v_floor_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] + +v_floor_f16_e64 
v5, null +// GFX1250: v_floor_f16_e64 v5, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] + +v_floor_f16_e64 v5, -1 +// GFX1250: v_floor_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] + +v_floor_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_floor_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] + +v_floor_f16_e64 v5, src_scc mul:4 +// GFX1250: v_floor_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] + +v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_floor_f16 v1.h, v128.l +// GFX1250: v_floor_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xdb,0xd5,0x80,0x01,0x00,0x00] + +v_floor_f16 v1.h, v128.h +// GFX1250: v_floor_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xdb,0xd5,0x80,0x01,0x00,0x00] + +v_floor_f32_e64 v5, v1 +// GFX1250: v_floor_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa4,0xd5,0x01,0x01,0x00,0x00] + +v_floor_f32_e64 v5, v255 +// GFX1250: v_floor_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa4,0xd5,0xff,0x01,0x00,0x00] + +v_floor_f32_e64 v5, s1 +// GFX1250: v_floor_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa4,0xd5,0x01,0x00,0x00,0x00] + +v_floor_f32_e64 v5, s105 +// GFX1250: v_floor_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa4,0xd5,0x69,0x00,0x00,0x00] + +v_floor_f32_e64 v5, vcc_lo +// GFX1250: v_floor_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa4,0xd5,0x6a,0x00,0x00,0x00] + +v_floor_f32_e64 v5, vcc_hi +// GFX1250: v_floor_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa4,0xd5,0x6b,0x00,0x00,0x00] + +v_floor_f32_e64 v5, ttmp15 +// GFX1250: v_floor_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa4,0xd5,0x7b,0x00,0x00,0x00] + +v_floor_f32_e64 v5, m0 +// GFX1250: v_floor_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa4,0xd5,0x7d,0x00,0x00,0x00] + +v_floor_f32_e64 v5, exec_lo +// GFX1250: v_floor_f32_e64 v5, exec_lo ; encoding: 
[0x05,0x00,0xa4,0xd5,0x7e,0x00,0x00,0x00] + +v_floor_f32_e64 v5, exec_hi +// GFX1250: v_floor_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa4,0xd5,0x7f,0x00,0x00,0x00] + +v_floor_f32_e64 v5, null +// GFX1250: v_floor_f32_e64 v5, null ; encoding: [0x05,0x00,0xa4,0xd5,0x7c,0x00,0x00,0x00] + +v_floor_f32_e64 v5, -1 +// GFX1250: v_floor_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa4,0xd5,0xc1,0x00,0x00,0x00] + +v_floor_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_floor_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa4,0xd5,0xf0,0x00,0x00,0x08] + +v_floor_f32_e64 v5, src_scc mul:4 +// GFX1250: v_floor_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa4,0xd5,0xfd,0x00,0x00,0x10] + +v_floor_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_floor_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa4,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_floor_f64_e64 v[6:7], v[2:3] +// GFX1250: v_floor_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x9a,0xd5,0x02,0x01,0x00,0x00] + +v_floor_f64_e64 v[6:7], v[254:255] +// GFX1250: v_floor_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x9a,0xd5,0xfe,0x01,0x00,0x00] + +v_floor_f64_e64 v[6:7], s[2:3] +// GFX1250: v_floor_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x9a,0xd5,0x02,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], s[104:105] +// GFX1250: v_floor_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x9a,0xd5,0x68,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], vcc +// GFX1250: v_floor_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x9a,0xd5,0x6a,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_floor_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x9a,0xd5,0x7a,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], exec +// GFX1250: v_floor_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x9a,0xd5,0x7e,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], null +// GFX1250: v_floor_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x9a,0xd5,0x7c,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], -1 +// GFX1250: v_floor_f64_e64 v[6:7], -1 ; 
encoding: [0x06,0x00,0x9a,0xd5,0xc1,0x00,0x00,0x00] + +v_floor_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_floor_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x9a,0xd5,0xf0,0x00,0x00,0x08] + +v_floor_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_floor_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x9a,0xd5,0xfd,0x00,0x00,0x30] + +v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_fract_f16_e64 v5, v1 +// GFX1250: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f16_e64 v5, v255 +// GFX1250: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] + +v_fract_f16_e64 v5, s1 +// GFX1250: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] + +v_fract_f16_e64 v5, s105 +// GFX1250: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] + +v_fract_f16_e64 v5, vcc_lo +// GFX1250: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] + +v_fract_f16_e64 v5, vcc_hi +// GFX1250: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] + +v_fract_f16_e64 v5, ttmp15 +// GFX1250: v_fract_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] + +v_fract_f16_e64 v5, m0 +// GFX1250: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] + +v_fract_f16_e64 v5, exec_lo +// GFX1250: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] + +v_fract_f16_e64 v5, exec_hi +// GFX1250: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] + +v_fract_f16_e64 v5, null +// GFX1250: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] + +v_fract_f16_e64 v5, -1 +// GFX1250: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] + 
+v_fract_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] + +v_fract_f16_e64 v5, src_scc mul:4 +// GFX1250: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] + +v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_fract_f16 v1.h, v128.l +// GFX1250: v_fract_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xdf,0xd5,0x80,0x01,0x00,0x00] + +v_fract_f16 v1.h, v128.h +// GFX1250: v_fract_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xdf,0xd5,0x80,0x01,0x00,0x00] + +v_fract_f32_e64 v5, v1 +// GFX1250: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] + +v_fract_f32_e64 v5, v255 +// GFX1250: v_fract_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa0,0xd5,0xff,0x01,0x00,0x00] + +v_fract_f32_e64 v5, s1 +// GFX1250: v_fract_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x00,0x00,0x00] + +v_fract_f32_e64 v5, s105 +// GFX1250: v_fract_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa0,0xd5,0x69,0x00,0x00,0x00] + +v_fract_f32_e64 v5, vcc_lo +// GFX1250: v_fract_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa0,0xd5,0x6a,0x00,0x00,0x00] + +v_fract_f32_e64 v5, vcc_hi +// GFX1250: v_fract_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa0,0xd5,0x6b,0x00,0x00,0x00] + +v_fract_f32_e64 v5, ttmp15 +// GFX1250: v_fract_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa0,0xd5,0x7b,0x00,0x00,0x00] + +v_fract_f32_e64 v5, m0 +// GFX1250: v_fract_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa0,0xd5,0x7d,0x00,0x00,0x00] + +v_fract_f32_e64 v5, exec_lo +// GFX1250: v_fract_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa0,0xd5,0x7e,0x00,0x00,0x00] + +v_fract_f32_e64 v5, exec_hi +// GFX1250: v_fract_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa0,0xd5,0x7f,0x00,0x00,0x00] + +v_fract_f32_e64 v5, null +// GFX1250: v_fract_f32_e64 v5, null ; 
encoding: [0x05,0x00,0xa0,0xd5,0x7c,0x00,0x00,0x00] + +v_fract_f32_e64 v5, -1 +// GFX1250: v_fract_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa0,0xd5,0xc1,0x00,0x00,0x00] + +v_fract_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_fract_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa0,0xd5,0xf0,0x00,0x00,0x08] + +v_fract_f32_e64 v5, src_scc mul:4 +// GFX1250: v_fract_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa0,0xd5,0xfd,0x00,0x00,0x10] + +v_fract_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_fract_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa0,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_fract_f64_e64 v[6:7], v[2:3] +// GFX1250: v_fract_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xbe,0xd5,0x02,0x01,0x00,0x00] + +v_fract_f64_e64 v[6:7], v[254:255] +// GFX1250: v_fract_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xbe,0xd5,0xfe,0x01,0x00,0x00] + +v_fract_f64_e64 v[6:7], s[2:3] +// GFX1250: v_fract_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xbe,0xd5,0x02,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], s[104:105] +// GFX1250: v_fract_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xbe,0xd5,0x68,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], vcc +// GFX1250: v_fract_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xbe,0xd5,0x6a,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_fract_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xbe,0xd5,0x7a,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], exec +// GFX1250: v_fract_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xbe,0xd5,0x7e,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], null +// GFX1250: v_fract_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xbe,0xd5,0x7c,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], -1 +// GFX1250: v_fract_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xbe,0xd5,0xc1,0x00,0x00,0x00] + +v_fract_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_fract_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xbe,0xd5,0xf0,0x00,0x00,0x08] + +v_fract_f64_e64 v[6:7], -|src_scc| mul:4 +// 
GFX1250: v_fract_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xbe,0xd5,0xfd,0x00,0x00,0x30] + +v_fract_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_fract_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbe,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_frexp_exp_i16_f16_e64 v5, v1 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, v255 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xda,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, s1 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, s105 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xda,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, vcc_lo +// GFX1250: v_frexp_exp_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xda,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, vcc_hi +// GFX1250: v_frexp_exp_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xda,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, ttmp15 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xda,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, m0 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xda,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, exec_lo +// GFX1250: v_frexp_exp_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xda,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, exec_hi +// GFX1250: v_frexp_exp_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xda,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, null +// GFX1250: v_frexp_exp_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xda,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, -1 +// GFX1250: v_frexp_exp_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xda,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, 0.5 +// GFX1250: 
v_frexp_exp_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xda,0xd5,0xf0,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v5, src_scc +// GFX1250: v_frexp_exp_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xda,0xd5,0xfd,0x00,0x00,0x00] + +v_frexp_exp_i16_f16_e64 v255, -|0xfe0b| +// GFX1250: v_frexp_exp_i16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xda,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +v_frexp_exp_i16_f16 v1.h, v128.l +// GFX1250: v_frexp_exp_i16_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xda,0xd5,0x80,0x01,0x00,0x00] + +v_frexp_exp_i16_f16 v1.h, v128.h +// GFX1250: v_frexp_exp_i16_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xda,0xd5,0x80,0x01,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, v1 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xbf,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, v255 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xbf,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, s1 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xbf,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, s105 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xbf,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, vcc_lo +// GFX1250: v_frexp_exp_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbf,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, vcc_hi +// GFX1250: v_frexp_exp_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xbf,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, ttmp15 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xbf,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, m0 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xbf,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, exec_lo +// GFX1250: v_frexp_exp_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbf,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, exec_hi +// GFX1250: 
v_frexp_exp_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xbf,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, null +// GFX1250: v_frexp_exp_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0xbf,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, -1 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xbf,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, 0.5 +// GFX1250: v_frexp_exp_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbf,0xd5,0xf0,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v5, src_scc +// GFX1250: v_frexp_exp_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbf,0xd5,0xfd,0x00,0x00,0x00] + +v_frexp_exp_i32_f32_e64 v255, -|0xaf123456| +// GFX1250: v_frexp_exp_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0xbf,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +v_frexp_exp_i32_f64_e64 v5, v[2:3] +// GFX1250: v_frexp_exp_i32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0xbc,0xd5,0x02,0x01,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, v[254:255] +// GFX1250: v_frexp_exp_i32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0xbc,0xd5,0xfe,0x01,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, s[2:3] +// GFX1250: v_frexp_exp_i32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0xbc,0xd5,0x02,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, s[104:105] +// GFX1250: v_frexp_exp_i32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0xbc,0xd5,0x68,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, vcc +// GFX1250: v_frexp_exp_i32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0xbc,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, ttmp[14:15] +// GFX1250: v_frexp_exp_i32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0xbc,0xd5,0x7a,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, exec +// GFX1250: v_frexp_exp_i32_f64_e64 v5, exec ; encoding: [0x05,0x00,0xbc,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, null +// GFX1250: v_frexp_exp_i32_f64_e64 v5, null ; encoding: [0x05,0x00,0xbc,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, -1 +// GFX1250: 
v_frexp_exp_i32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0xbc,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, 0.5 +// GFX1250: v_frexp_exp_i32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbc,0xd5,0xf0,0x00,0x00,0x00] + +v_frexp_exp_i32_f64_e64 v5, -|src_scc| +// GFX1250: v_frexp_exp_i32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0xbc,0xd5,0xfd,0x00,0x00,0x20] + +v_frexp_exp_i32_f64_e64 v255, 0xaf123456 +// GFX1250: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_frexp_mant_f16_e64 v5, v1 +// GFX1250: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v5, v255 +// GFX1250: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_mant_f16_e64 v5, s1 +// GFX1250: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, s105 +// GFX1250: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, vcc_lo +// GFX1250: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, vcc_hi +// GFX1250: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, ttmp15 +// GFX1250: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, m0 +// GFX1250: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, exec_lo +// GFX1250: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, exec_hi +// GFX1250: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, null +// GFX1250: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] + 
+v_frexp_mant_f16_e64 v5, -1 +// GFX1250: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_mant_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] + +v_frexp_mant_f16_e64 v5, src_scc mul:4 +// GFX1250: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] + +v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_frexp_mant_f16 v1.h, v128.l +// GFX1250: v_frexp_mant_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd9,0xd5,0x80,0x01,0x00,0x00] + +v_frexp_mant_f16 v1.h, v128.h +// GFX1250: v_frexp_mant_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd9,0xd5,0x80,0x01,0x00,0x00] + +v_frexp_mant_f32_e64 v5, v1 +// GFX1250: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] + +v_frexp_mant_f32_e64 v5, v255 +// GFX1250: v_frexp_mant_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xc0,0xd5,0xff,0x01,0x00,0x00] + +v_frexp_mant_f32_e64 v5, s1 +// GFX1250: v_frexp_mant_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, s105 +// GFX1250: v_frexp_mant_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xc0,0xd5,0x69,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, vcc_lo +// GFX1250: v_frexp_mant_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xc0,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, vcc_hi +// GFX1250: v_frexp_mant_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xc0,0xd5,0x6b,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, ttmp15 +// GFX1250: v_frexp_mant_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xc0,0xd5,0x7b,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, m0 +// GFX1250: v_frexp_mant_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xc0,0xd5,0x7d,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, exec_lo +// GFX1250: 
v_frexp_mant_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xc0,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, exec_hi +// GFX1250: v_frexp_mant_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xc0,0xd5,0x7f,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, null +// GFX1250: v_frexp_mant_f32_e64 v5, null ; encoding: [0x05,0x00,0xc0,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, -1 +// GFX1250: v_frexp_mant_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xc0,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_mant_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_frexp_mant_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xc0,0xd5,0xf0,0x00,0x00,0x08] + +v_frexp_mant_f32_e64 v5, src_scc mul:4 +// GFX1250: v_frexp_mant_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xc0,0xd5,0xfd,0x00,0x00,0x10] + +v_frexp_mant_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_frexp_mant_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xc0,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_frexp_mant_f64_e64 v[6:7], v[2:3] +// GFX1250: v_frexp_mant_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xbd,0xd5,0x02,0x01,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], v[254:255] +// GFX1250: v_frexp_mant_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xbd,0xd5,0xfe,0x01,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], s[2:3] +// GFX1250: v_frexp_mant_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xbd,0xd5,0x02,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], s[104:105] +// GFX1250: v_frexp_mant_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xbd,0xd5,0x68,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], vcc +// GFX1250: v_frexp_mant_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xbd,0xd5,0x6a,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_frexp_mant_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xbd,0xd5,0x7a,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], exec +// GFX1250: v_frexp_mant_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xbd,0xd5,0x7e,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 
v[6:7], null +// GFX1250: v_frexp_mant_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xbd,0xd5,0x7c,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], -1 +// GFX1250: v_frexp_mant_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xbd,0xd5,0xc1,0x00,0x00,0x00] + +v_frexp_mant_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_frexp_mant_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xbd,0xd5,0xf0,0x00,0x00,0x08] + +v_frexp_mant_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_frexp_mant_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xbd,0xd5,0xfd,0x00,0x00,0x30] + +v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbd,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_log_f16_e64 v5, v1 +// GFX1250: v_log_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] + +v_log_f16_e64 v5, v255 +// GFX1250: v_log_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] + +v_log_f16_e64 v5, s1 +// GFX1250: v_log_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] + +v_log_f16_e64 v5, s105 +// GFX1250: v_log_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] + +v_log_f16_e64 v5, vcc_lo +// GFX1250: v_log_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] + +v_log_f16_e64 v5, vcc_hi +// GFX1250: v_log_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] + +v_log_f16_e64 v5, ttmp15 +// GFX1250: v_log_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] + +v_log_f16_e64 v5, m0 +// GFX1250: v_log_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] + +v_log_f16_e64 v5, exec_lo +// GFX1250: v_log_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] + +v_log_f16_e64 v5, exec_hi +// GFX1250: v_log_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] + +v_log_f16_e64 v5, null +// GFX1250: v_log_f16_e64 v5, null ; 
encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] + +v_log_f16_e64 v5, -1 +// GFX1250: v_log_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] + +v_log_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_log_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] + +v_log_f16_e64 v5, src_scc mul:4 +// GFX1250: v_log_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] + +v_log_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_log_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_log_f16 v1.h, v128.l +// GFX1250: v_log_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd7,0xd5,0x80,0x01,0x00,0x00] + +v_log_f16 v1.h, v128.h +// GFX1250: v_log_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd7,0xd5,0x80,0x01,0x00,0x00] + +v_log_f32_e64 v5, v1 +// GFX1250: v_log_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa7,0xd5,0x01,0x01,0x00,0x00] + +v_log_f32_e64 v5, v255 +// GFX1250: v_log_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa7,0xd5,0xff,0x01,0x00,0x00] + +v_log_f32_e64 v5, s1 +// GFX1250: v_log_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa7,0xd5,0x01,0x00,0x00,0x00] + +v_log_f32_e64 v5, s105 +// GFX1250: v_log_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa7,0xd5,0x69,0x00,0x00,0x00] + +v_log_f32_e64 v5, vcc_lo +// GFX1250: v_log_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa7,0xd5,0x6a,0x00,0x00,0x00] + +v_log_f32_e64 v5, vcc_hi +// GFX1250: v_log_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa7,0xd5,0x6b,0x00,0x00,0x00] + +v_log_f32_e64 v5, ttmp15 +// GFX1250: v_log_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa7,0xd5,0x7b,0x00,0x00,0x00] + +v_log_f32_e64 v5, m0 +// GFX1250: v_log_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa7,0xd5,0x7d,0x00,0x00,0x00] + +v_log_f32_e64 v5, exec_lo +// GFX1250: v_log_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa7,0xd5,0x7e,0x00,0x00,0x00] + +v_log_f32_e64 v5, exec_hi +// GFX1250: v_log_f32_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xa7,0xd5,0x7f,0x00,0x00,0x00] + +v_log_f32_e64 v5, null +// GFX1250: v_log_f32_e64 v5, null ; encoding: [0x05,0x00,0xa7,0xd5,0x7c,0x00,0x00,0x00] + +v_log_f32_e64 v5, -1 +// GFX1250: v_log_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa7,0xd5,0xc1,0x00,0x00,0x00] + +v_log_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_log_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa7,0xd5,0xf0,0x00,0x00,0x08] + +v_log_f32_e64 v5, src_scc mul:4 +// GFX1250: v_log_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa7,0xd5,0xfd,0x00,0x00,0x10] + +v_log_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_log_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa7,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_mov_b32_e64 v5, v1 +// GFX1250: v_mov_b32_e64 v5, v1 ; encoding: [0x05,0x00,0x81,0xd5,0x01,0x01,0x00,0x00] + +v_mov_b32_e64 v5, v255 +// GFX1250: v_mov_b32_e64 v5, v255 ; encoding: [0x05,0x00,0x81,0xd5,0xff,0x01,0x00,0x00] + +v_mov_b32_e64 v5, s1 +// GFX1250: v_mov_b32_e64 v5, s1 ; encoding: [0x05,0x00,0x81,0xd5,0x01,0x00,0x00,0x00] + +v_mov_b32_e64 v5, s105 +// GFX1250: v_mov_b32_e64 v5, s105 ; encoding: [0x05,0x00,0x81,0xd5,0x69,0x00,0x00,0x00] + +v_mov_b32_e64 v5, vcc_lo +// GFX1250: v_mov_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x81,0xd5,0x6a,0x00,0x00,0x00] + +v_mov_b32_e64 v5, vcc_hi +// GFX1250: v_mov_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x81,0xd5,0x6b,0x00,0x00,0x00] + +v_mov_b32_e64 v5, ttmp15 +// GFX1250: v_mov_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x81,0xd5,0x7b,0x00,0x00,0x00] + +v_mov_b32_e64 v5, m0 +// GFX1250: v_mov_b32_e64 v5, m0 ; encoding: [0x05,0x00,0x81,0xd5,0x7d,0x00,0x00,0x00] + +v_mov_b32_e64 v5, exec_lo +// GFX1250: v_mov_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x81,0xd5,0x7e,0x00,0x00,0x00] + +v_mov_b32_e64 v5, exec_hi +// GFX1250: v_mov_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x81,0xd5,0x7f,0x00,0x00,0x00] + +v_mov_b32_e64 v5, null +// GFX1250: v_mov_b32_e64 v5, null ; encoding: [0x05,0x00,0x81,0xd5,0x7c,0x00,0x00,0x00] 
+ +v_mov_b32_e64 v5, -1 +// GFX1250: v_mov_b32_e64 v5, -1 ; encoding: [0x05,0x00,0x81,0xd5,0xc1,0x00,0x00,0x00] + +v_mov_b32_e64 v5, 0.5 +// GFX1250: v_mov_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x81,0xd5,0xf0,0x00,0x00,0x00] + +v_mov_b32_e64 v5, src_scc +// GFX1250: v_mov_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0x81,0xd5,0xfd,0x00,0x00,0x00] + +v_mov_b32_e64 v255, 0xaf123456 +// GFX1250: v_mov_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0x81,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_movreld_b32_e64 v5, v1 +// GFX1250: v_movreld_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc2,0xd5,0x01,0x01,0x00,0x00] + +v_movreld_b32_e64 v5, v255 +// GFX1250: v_movreld_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xc2,0xd5,0xff,0x01,0x00,0x00] + +v_movreld_b32_e64 v5, s1 +// GFX1250: v_movreld_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xc2,0xd5,0x01,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, s105 +// GFX1250: v_movreld_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xc2,0xd5,0x69,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, vcc_lo +// GFX1250: v_movreld_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xc2,0xd5,0x6a,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, vcc_hi +// GFX1250: v_movreld_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xc2,0xd5,0x6b,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, ttmp15 +// GFX1250: v_movreld_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xc2,0xd5,0x7b,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, m0 +// GFX1250: v_movreld_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xc2,0xd5,0x7d,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, exec_lo +// GFX1250: v_movreld_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xc2,0xd5,0x7e,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, exec_hi +// GFX1250: v_movreld_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xc2,0xd5,0x7f,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, null +// GFX1250: v_movreld_b32_e64 v5, null ; encoding: [0x05,0x00,0xc2,0xd5,0x7c,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, -1 +// GFX1250: v_movreld_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xc2,0xd5,0xc1,0x00,0x00,0x00] + 
+v_movreld_b32_e64 v5, 0.5 +// GFX1250: v_movreld_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xc2,0xd5,0xf0,0x00,0x00,0x00] + +v_movreld_b32_e64 v5, src_scc +// GFX1250: v_movreld_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xc2,0xd5,0xfd,0x00,0x00,0x00] + +v_movreld_b32_e64 v255, 0xaf123456 +// GFX1250: v_movreld_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xc2,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_movrels_b32_e64 v5, v1 +// GFX1250: v_movrels_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc3,0xd5,0x01,0x01,0x00,0x00] + +v_movrels_b32_e64 v255, v255 +// GFX1250: v_movrels_b32_e64 v255, v255 ; encoding: [0xff,0x00,0xc3,0xd5,0xff,0x01,0x00,0x00] + +v_movrelsd_2_b32_e64 v5, v1 +// GFX1250: v_movrelsd_2_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc8,0xd5,0x01,0x01,0x00,0x00] + +v_movrelsd_2_b32_e64 v255, v255 +// GFX1250: v_movrelsd_2_b32_e64 v255, v255 ; encoding: [0xff,0x00,0xc8,0xd5,0xff,0x01,0x00,0x00] + +v_movrelsd_b32_e64 v5, v1 +// GFX1250: v_movrelsd_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc4,0xd5,0x01,0x01,0x00,0x00] + +v_movrelsd_b32_e64 v255, v255 +// GFX1250: v_movrelsd_b32_e64 v255, v255 ; encoding: [0xff,0x00,0xc4,0xd5,0xff,0x01,0x00,0x00] + +v_nop_e64 +// GFX1250: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00] + +v_not_b16_e64 v5, v1 +// GFX1250: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +v_not_b16_e64 v5, v255 +// GFX1250: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] + +v_not_b16_e64 v5, s1 +// GFX1250: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] + +v_not_b16_e64 v5, s105 +// GFX1250: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] + +v_not_b16_e64 v5, vcc_lo +// GFX1250: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] + +v_not_b16_e64 v5, vcc_hi +// GFX1250: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] + +v_not_b16_e64 v5, ttmp15 +// GFX1250: 
v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] + +v_not_b16_e64 v5, m0 +// GFX1250: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] + +v_not_b16_e64 v5, exec_lo +// GFX1250: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] + +v_not_b16_e64 v5, exec_hi +// GFX1250: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] + +v_not_b16_e64 v5, null +// GFX1250: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] + +v_not_b16_e64 v5, -1 +// GFX1250: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] + +v_not_b16_e64 v5, 0.5 +// GFX1250: v_not_b16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe9,0xd5,0xf0,0x00,0x00,0x00] + +v_not_b16_e64 v5, src_scc +// GFX1250: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] + +v_not_b16_e64 v255, 0xfe0b +// GFX1250: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_not_b16 v1.h, v128.l +// GFX1250: v_not_b16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xe9,0xd5,0x80,0x01,0x00,0x00] + +v_not_b16 v1.h, v128.h +// GFX1250: v_not_b16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xe9,0xd5,0x80,0x01,0x00,0x00] + +v_not_b32_e64 v5, v1 +// GFX1250: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] + +v_not_b32_e64 v5, v255 +// GFX1250: v_not_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xb7,0xd5,0xff,0x01,0x00,0x00] + +v_not_b32_e64 v5, s1 +// GFX1250: v_not_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x00,0x00,0x00] + +v_not_b32_e64 v5, s105 +// GFX1250: v_not_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xb7,0xd5,0x69,0x00,0x00,0x00] + +v_not_b32_e64 v5, vcc_lo +// GFX1250: v_not_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb7,0xd5,0x6a,0x00,0x00,0x00] + +v_not_b32_e64 v5, vcc_hi +// GFX1250: v_not_b32_e64 v5, vcc_hi ; encoding: 
[0x05,0x00,0xb7,0xd5,0x6b,0x00,0x00,0x00] + +v_not_b32_e64 v5, ttmp15 +// GFX1250: v_not_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb7,0xd5,0x7b,0x00,0x00,0x00] + +v_not_b32_e64 v5, m0 +// GFX1250: v_not_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xb7,0xd5,0x7d,0x00,0x00,0x00] + +v_not_b32_e64 v5, exec_lo +// GFX1250: v_not_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb7,0xd5,0x7e,0x00,0x00,0x00] + +v_not_b32_e64 v5, exec_hi +// GFX1250: v_not_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb7,0xd5,0x7f,0x00,0x00,0x00] + +v_not_b32_e64 v5, null +// GFX1250: v_not_b32_e64 v5, null ; encoding: [0x05,0x00,0xb7,0xd5,0x7c,0x00,0x00,0x00] + +v_not_b32_e64 v5, -1 +// GFX1250: v_not_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xb7,0xd5,0xc1,0x00,0x00,0x00] + +v_not_b32_e64 v5, 0.5 +// GFX1250: v_not_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb7,0xd5,0xf0,0x00,0x00,0x00] + +v_not_b32_e64 v5, src_scc +// GFX1250: v_not_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb7,0xd5,0xfd,0x00,0x00,0x00] + +v_not_b32_e64 v255, 0xaf123456 +// GFX1250: v_not_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb7,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +v_pipeflush_e64 +// GFX1250: v_pipeflush ; encoding: [0x00,0x00,0x9b,0xd5,0x00,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, v1 +// GFX1250: v_rcp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] + +v_rcp_f16_e64 v5, v255 +// GFX1250: v_rcp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] + +v_rcp_f16_e64 v5, s1 +// GFX1250: v_rcp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, s105 +// GFX1250: v_rcp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, vcc_lo +// GFX1250: v_rcp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, vcc_hi +// GFX1250: v_rcp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, ttmp15 +// GFX1250: v_rcp_f16_e64 v5, ttmp15 ; 
encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, m0 +// GFX1250: v_rcp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, exec_lo +// GFX1250: v_rcp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, exec_hi +// GFX1250: v_rcp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, null +// GFX1250: v_rcp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, -1 +// GFX1250: v_rcp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] + +v_rcp_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_rcp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] + +v_rcp_f16_e64 v5, src_scc mul:4 +// GFX1250: v_rcp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] + +v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_rcp_f16 v1.h, v128.l +// GFX1250: v_rcp_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd4,0xd5,0x80,0x01,0x00,0x00] + +v_rcp_f16 v1.h, v128.h +// GFX1250: v_rcp_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd4,0xd5,0x80,0x01,0x00,0x00] + +v_rcp_f32_e64 v5, v1 +// GFX1250: v_rcp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xaa,0xd5,0x01,0x01,0x00,0x00] + +v_rcp_f32_e64 v5, v255 +// GFX1250: v_rcp_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xaa,0xd5,0xff,0x01,0x00,0x00] + +v_rcp_f32_e64 v5, s1 +// GFX1250: v_rcp_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xaa,0xd5,0x01,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, s105 +// GFX1250: v_rcp_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xaa,0xd5,0x69,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, vcc_lo +// GFX1250: v_rcp_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xaa,0xd5,0x6a,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, vcc_hi +// GFX1250: v_rcp_f32_e64 v5, vcc_hi ; encoding: 
[0x05,0x00,0xaa,0xd5,0x6b,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, ttmp15 +// GFX1250: v_rcp_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xaa,0xd5,0x7b,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, m0 +// GFX1250: v_rcp_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xaa,0xd5,0x7d,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, exec_lo +// GFX1250: v_rcp_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xaa,0xd5,0x7e,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, exec_hi +// GFX1250: v_rcp_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xaa,0xd5,0x7f,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, null +// GFX1250: v_rcp_f32_e64 v5, null ; encoding: [0x05,0x00,0xaa,0xd5,0x7c,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, -1 +// GFX1250: v_rcp_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xaa,0xd5,0xc1,0x00,0x00,0x00] + +v_rcp_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_rcp_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xaa,0xd5,0xf0,0x00,0x00,0x08] + +v_rcp_f32_e64 v5, src_scc mul:4 +// GFX1250: v_rcp_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xaa,0xd5,0xfd,0x00,0x00,0x10] + +v_rcp_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_rcp_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xaa,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_rcp_f64_e64 v[6:7], v[2:3] +// GFX1250: v_rcp_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xaf,0xd5,0x02,0x01,0x00,0x00] + +v_rcp_f64_e64 v[6:7], v[254:255] +// GFX1250: v_rcp_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xaf,0xd5,0xfe,0x01,0x00,0x00] + +v_rcp_f64_e64 v[6:7], s[2:3] +// GFX1250: v_rcp_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xaf,0xd5,0x02,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], s[104:105] +// GFX1250: v_rcp_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xaf,0xd5,0x68,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], vcc +// GFX1250: v_rcp_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xaf,0xd5,0x6a,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_rcp_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xaf,0xd5,0x7a,0x00,0x00,0x00] + +v_rcp_f64_e64 
v[6:7], exec +// GFX1250: v_rcp_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xaf,0xd5,0x7e,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], null +// GFX1250: v_rcp_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xaf,0xd5,0x7c,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], -1 +// GFX1250: v_rcp_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xaf,0xd5,0xc1,0x00,0x00,0x00] + +v_rcp_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_rcp_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xaf,0xd5,0xf0,0x00,0x00,0x08] + +v_rcp_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_rcp_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xaf,0xd5,0xfd,0x00,0x00,0x30] + +v_rcp_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_rcp_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xaf,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_rcp_iflag_f32_e64 v5, v1 +// GFX1250: v_rcp_iflag_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xab,0xd5,0x01,0x01,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, v255 +// GFX1250: v_rcp_iflag_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xab,0xd5,0xff,0x01,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, s1 +// GFX1250: v_rcp_iflag_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xab,0xd5,0x01,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, s105 +// GFX1250: v_rcp_iflag_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xab,0xd5,0x69,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, vcc_lo +// GFX1250: v_rcp_iflag_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xab,0xd5,0x6a,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, vcc_hi +// GFX1250: v_rcp_iflag_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xab,0xd5,0x6b,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, ttmp15 +// GFX1250: v_rcp_iflag_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xab,0xd5,0x7b,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, m0 +// GFX1250: v_rcp_iflag_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xab,0xd5,0x7d,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, exec_lo +// GFX1250: v_rcp_iflag_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xab,0xd5,0x7e,0x00,0x00,0x00] + 
+v_rcp_iflag_f32_e64 v5, exec_hi +// GFX1250: v_rcp_iflag_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xab,0xd5,0x7f,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, null +// GFX1250: v_rcp_iflag_f32_e64 v5, null ; encoding: [0x05,0x00,0xab,0xd5,0x7c,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, -1 +// GFX1250: v_rcp_iflag_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xab,0xd5,0xc1,0x00,0x00,0x00] + +v_rcp_iflag_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_rcp_iflag_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xab,0xd5,0xf0,0x00,0x00,0x08] + +v_rcp_iflag_f32_e64 v5, src_scc mul:4 +// GFX1250: v_rcp_iflag_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xab,0xd5,0xfd,0x00,0x00,0x10] + +v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_rndne_f16_e64 v5, v1 +// GFX1250: v_rndne_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00] + +v_rndne_f16_e64 v5, v255 +// GFX1250: v_rndne_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00] + +v_rndne_f16_e64 v5, s1 +// GFX1250: v_rndne_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, s105 +// GFX1250: v_rndne_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, vcc_lo +// GFX1250: v_rndne_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, vcc_hi +// GFX1250: v_rndne_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, ttmp15 +// GFX1250: v_rndne_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, m0 +// GFX1250: v_rndne_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, exec_lo +// GFX1250: v_rndne_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, exec_hi +// GFX1250: 
v_rndne_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, null +// GFX1250: v_rndne_f16_e64 v5, null ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, -1 +// GFX1250: v_rndne_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00] + +v_rndne_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_rndne_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08] + +v_rndne_f16_e64 v5, src_scc mul:4 +// GFX1250: v_rndne_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10] + +v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_rndne_f16 v1.h, v128.l +// GFX1250: v_rndne_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xde,0xd5,0x80,0x01,0x00,0x00] + +v_rndne_f16 v1.h, v128.h +// GFX1250: v_rndne_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xde,0xd5,0x80,0x01,0x00,0x00] + +v_rndne_f32_e64 v5, v1 +// GFX1250: v_rndne_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00] + +v_rndne_f32_e64 v5, v255 +// GFX1250: v_rndne_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa3,0xd5,0xff,0x01,0x00,0x00] + +v_rndne_f32_e64 v5, s1 +// GFX1250: v_rndne_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, s105 +// GFX1250: v_rndne_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa3,0xd5,0x69,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, vcc_lo +// GFX1250: v_rndne_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa3,0xd5,0x6a,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, vcc_hi +// GFX1250: v_rndne_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa3,0xd5,0x6b,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, ttmp15 +// GFX1250: v_rndne_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa3,0xd5,0x7b,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, m0 +// GFX1250: v_rndne_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa3,0xd5,0x7d,0x00,0x00,0x00] + 
+v_rndne_f32_e64 v5, exec_lo +// GFX1250: v_rndne_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa3,0xd5,0x7e,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, exec_hi +// GFX1250: v_rndne_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa3,0xd5,0x7f,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, null +// GFX1250: v_rndne_f32_e64 v5, null ; encoding: [0x05,0x00,0xa3,0xd5,0x7c,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, -1 +// GFX1250: v_rndne_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa3,0xd5,0xc1,0x00,0x00,0x00] + +v_rndne_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_rndne_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa3,0xd5,0xf0,0x00,0x00,0x08] + +v_rndne_f32_e64 v5, src_scc mul:4 +// GFX1250: v_rndne_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa3,0xd5,0xfd,0x00,0x00,0x10] + +v_rndne_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_rndne_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa3,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_rndne_f64_e64 v[6:7], v[2:3] +// GFX1250: v_rndne_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x99,0xd5,0x02,0x01,0x00,0x00] + +v_rndne_f64_e64 v[6:7], v[254:255] +// GFX1250: v_rndne_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x99,0xd5,0xfe,0x01,0x00,0x00] + +v_rndne_f64_e64 v[6:7], s[2:3] +// GFX1250: v_rndne_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x99,0xd5,0x02,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], s[104:105] +// GFX1250: v_rndne_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x99,0xd5,0x68,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], vcc +// GFX1250: v_rndne_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x99,0xd5,0x6a,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_rndne_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x99,0xd5,0x7a,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], exec +// GFX1250: v_rndne_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x99,0xd5,0x7e,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], null +// GFX1250: v_rndne_f64_e64 v[6:7], null ; encoding: 
[0x06,0x00,0x99,0xd5,0x7c,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], -1 +// GFX1250: v_rndne_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x99,0xd5,0xc1,0x00,0x00,0x00] + +v_rndne_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_rndne_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x99,0xd5,0xf0,0x00,0x00,0x08] + +v_rndne_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_rndne_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x99,0xd5,0xfd,0x00,0x00,0x30] + +v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x99,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_rsq_f16_e64 v5, v1 +// GFX1250: v_rsq_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] + +v_rsq_f16_e64 v5, v255 +// GFX1250: v_rsq_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] + +v_rsq_f16_e64 v5, s1 +// GFX1250: v_rsq_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, s105 +// GFX1250: v_rsq_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, vcc_lo +// GFX1250: v_rsq_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, vcc_hi +// GFX1250: v_rsq_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, ttmp15 +// GFX1250: v_rsq_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, m0 +// GFX1250: v_rsq_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, exec_lo +// GFX1250: v_rsq_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, exec_hi +// GFX1250: v_rsq_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, null +// GFX1250: v_rsq_f16_e64 v5, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, -1 +// GFX1250: v_rsq_f16_e64 v5, -1 ; 
encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] + +v_rsq_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_rsq_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] + +v_rsq_f16_e64 v5, src_scc mul:4 +// GFX1250: v_rsq_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] + +v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] -v_tanh_bf16_e64 v5, v255 -// GFX1250: v_tanh_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xca,0xd5,0xff,0x01,0x00,0x00] +v_rsq_f16 v1.h, v128.l +// GFX1250: v_rsq_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd6,0xd5,0x80,0x01,0x00,0x00] -v_tanh_bf16_e64 v5, s1 -// GFX1250: v_tanh_bf16_e64 v5, s1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x00,0x00,0x00] +v_rsq_f16 v1.h, v128.h +// GFX1250: v_rsq_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd6,0xd5,0x80,0x01,0x00,0x00] -v_tanh_bf16_e64 v5, s105 -// GFX1250: v_tanh_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00] +v_rsq_f32_e64 v5, v1 +// GFX1250: v_rsq_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xae,0xd5,0x01,0x01,0x00,0x00] -v_tanh_bf16_e64 v5, vcc_lo -// GFX1250: v_tanh_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xca,0xd5,0x6a,0x00,0x00,0x00] +v_rsq_f32_e64 v5, v255 +// GFX1250: v_rsq_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xae,0xd5,0xff,0x01,0x00,0x00] -v_tanh_bf16_e64 v5, vcc_hi -// GFX1250: v_tanh_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xca,0xd5,0x6b,0x00,0x00,0x00] +v_rsq_f32_e64 v5, s1 +// GFX1250: v_rsq_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xae,0xd5,0x01,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, ttmp15 -// GFX1250: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00] +v_rsq_f32_e64 v5, s105 +// GFX1250: v_rsq_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xae,0xd5,0x69,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, m0 -// GFX1250: v_tanh_bf16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xca,0xd5,0x7d,0x00,0x00,0x00] +v_rsq_f32_e64 v5, vcc_lo +// GFX1250: v_rsq_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xae,0xd5,0x6a,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, exec_lo -// GFX1250: v_tanh_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xca,0xd5,0x7e,0x00,0x00,0x00] +v_rsq_f32_e64 v5, vcc_hi +// GFX1250: v_rsq_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xae,0xd5,0x6b,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, exec_hi -// GFX1250: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00] +v_rsq_f32_e64 v5, ttmp15 +// GFX1250: v_rsq_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xae,0xd5,0x7b,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, null -// GFX1250: v_tanh_bf16_e64 v5, null ; encoding: [0x05,0x00,0xca,0xd5,0x7c,0x00,0x00,0x00] +v_rsq_f32_e64 v5, m0 +// GFX1250: v_rsq_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xae,0xd5,0x7d,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, -1 -// GFX1250: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] +v_rsq_f32_e64 v5, exec_lo +// GFX1250: v_rsq_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xae,0xd5,0x7e,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, 0.5 mul:2 -// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] +v_rsq_f32_e64 v5, exec_hi +// GFX1250: v_rsq_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xae,0xd5,0x7f,0x00,0x00,0x00] -v_tanh_bf16_e64 v5, src_scc mul:4 -// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] +v_rsq_f32_e64 v5, null +// GFX1250: v_rsq_f32_e64 v5, null ; encoding: [0x05,0x00,0xae,0xd5,0x7c,0x00,0x00,0x00] -v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 -// GFX1250: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] +v_rsq_f32_e64 v5, -1 +// GFX1250: v_rsq_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xae,0xd5,0xc1,0x00,0x00,0x00] -v_tanh_bf16 v5.l, v128.h -// GFX1250: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: 
[0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00] +v_rsq_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_rsq_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xae,0xd5,0xf0,0x00,0x00,0x08] -v_cvt_f32_bf8_e64 v1, s3 -// GFX1250: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] +v_rsq_f32_e64 v5, src_scc mul:4 +// GFX1250: v_rsq_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xae,0xd5,0xfd,0x00,0x00,0x10] -v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 -// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] +v_rsq_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_rsq_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xae,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] -v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 -// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] +v_rsq_f64_e64 v[6:7], v[2:3] +// GFX1250: v_rsq_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xb1,0xd5,0x02,0x01,0x00,0x00] -v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 -// GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] +v_rsq_f64_e64 v[6:7], v[254:255] +// GFX1250: v_rsq_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xb1,0xd5,0xfe,0x01,0x00,0x00] -v_cvt_f32_bf8_e64 v1, 3 -// GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] +v_rsq_f64_e64 v[6:7], s[2:3] +// GFX1250: v_rsq_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xb1,0xd5,0x02,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 -// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] +v_rsq_f64_e64 v[6:7], s[104:105] +// GFX1250: v_rsq_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xb1,0xd5,0x68,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 -// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] +v_rsq_f64_e64 v[6:7], vcc +// GFX1250: v_rsq_f64_e64 v[6:7], vcc ; 
encoding: [0x06,0x00,0xb1,0xd5,0x6a,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 -// GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] +v_rsq_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_rsq_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xb1,0xd5,0x7a,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, v3 -// GFX1250: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] +v_rsq_f64_e64 v[6:7], exec +// GFX1250: v_rsq_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xb1,0xd5,0x7e,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 -// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] +v_rsq_f64_e64 v[6:7], null +// GFX1250: v_rsq_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xb1,0xd5,0x7c,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 -// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] +v_rsq_f64_e64 v[6:7], -1 +// GFX1250: v_rsq_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xb1,0xd5,0xc1,0x00,0x00,0x00] -v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 -// GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] +v_rsq_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_rsq_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xb1,0xd5,0xf0,0x00,0x00,0x08] -v_cvt_f32_fp8_e64 v1, s3 -// GFX1250: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] +v_rsq_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_rsq_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xb1,0xd5,0xfd,0x00,0x00,0x30] -v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 -// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] +v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] -v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 -// GFX1250: 
v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, v1 +// GFX1250: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] -v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 -// GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, v255 +// GFX1250: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] -v_cvt_f32_fp8_e64 v1, 3 -// GFX1250: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, s1 +// GFX1250: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 -// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, s105 +// GFX1250: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 -// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, vcc_lo +// GFX1250: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 -// GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, vcc_hi +// GFX1250: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, v3 -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, ttmp15 +// GFX1250: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, m0 +// GFX1250: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, exec_lo +// GFX1250: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, exec_hi +// GFX1250: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] -v_cvt_f32_fp8_e64 v1, v3 clamp ; encoding: [0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00] -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 clamp ; encoding: [0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, null +// GFX1250: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] -v_cvt_f32_fp8 v1, v3 byte_sel:1 clamp -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 clamp ; encoding: [0x01,0x90,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, -1 +// GFX1250: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] -v_cvt_f32_fp8 v1, v3 byte_sel:2 clamp -// GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00] +v_sat_pk_u8_i16_e64 v5, 0.5 +// GFX1250: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] + +v_sat_pk_u8_i16_e64 v5, src_scc +// GFX1250: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] + +v_sat_pk_u8_i16_e64 v255, 0xfe0b +// GFX1250: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_sat_pk_u8_i16 v128.l, v1 +// GFX1250: v_sat_pk_u8_i16_e64 v128.l, v1 ; encoding: [0x80,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] + +v_sat_pk_u8_i16 v128.h, v1 +// GFX1250: v_sat_pk_u8_i16_e64 v128.h, v1 op_sel:[0,1] ; encoding: [0x80,0x40,0xe2,0xd5,0x01,0x01,0x00,0x00] + 
+v_sin_f16_e64 v5, v1 +// GFX1250: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f16_e64 v5, v255 +// GFX1250: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] + +v_sin_f16_e64 v5, s1 +// GFX1250: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] + +v_sin_f16_e64 v5, s105 +// GFX1250: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] + +v_sin_f16_e64 v5, vcc_lo +// GFX1250: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] + +v_sin_f16_e64 v5, vcc_hi +// GFX1250: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] + +v_sin_f16_e64 v5, ttmp15 +// GFX1250: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] + +v_sin_f16_e64 v5, m0 +// GFX1250: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] + +v_sin_f16_e64 v5, exec_lo +// GFX1250: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] + +v_sin_f16_e64 v5, exec_hi +// GFX1250: v_sin_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] + +v_sin_f16_e64 v5, null +// GFX1250: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] + +v_sin_f16_e64 v5, -1 +// GFX1250: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] + +v_sin_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] + +v_sin_f16_e64 v5, src_scc mul:4 +// GFX1250: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] + +v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_sin_f16 v1.h, v128.l +// GFX1250: v_sin_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xe0,0xd5,0x80,0x01,0x00,0x00] + +v_sin_f16 v1.h, v128.h +// 
GFX1250: v_sin_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xe0,0xd5,0x80,0x01,0x00,0x00] + +v_sin_f32_e64 v5, v1 +// GFX1250: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] + +v_sin_f32_e64 v5, v255 +// GFX1250: v_sin_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb5,0xd5,0xff,0x01,0x00,0x00] + +v_sin_f32_e64 v5, s1 +// GFX1250: v_sin_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x00,0x00,0x00] + +v_sin_f32_e64 v5, s105 +// GFX1250: v_sin_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb5,0xd5,0x69,0x00,0x00,0x00] + +v_sin_f32_e64 v5, vcc_lo +// GFX1250: v_sin_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb5,0xd5,0x6a,0x00,0x00,0x00] + +v_sin_f32_e64 v5, vcc_hi +// GFX1250: v_sin_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb5,0xd5,0x6b,0x00,0x00,0x00] + +v_sin_f32_e64 v5, ttmp15 +// GFX1250: v_sin_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb5,0xd5,0x7b,0x00,0x00,0x00] + +v_sin_f32_e64 v5, m0 +// GFX1250: v_sin_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb5,0xd5,0x7d,0x00,0x00,0x00] + +v_sin_f32_e64 v5, exec_lo +// GFX1250: v_sin_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb5,0xd5,0x7e,0x00,0x00,0x00] + +v_sin_f32_e64 v5, exec_hi +// GFX1250: v_sin_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb5,0xd5,0x7f,0x00,0x00,0x00] + +v_sin_f32_e64 v5, null +// GFX1250: v_sin_f32_e64 v5, null ; encoding: [0x05,0x00,0xb5,0xd5,0x7c,0x00,0x00,0x00] + +v_sin_f32_e64 v5, -1 +// GFX1250: v_sin_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb5,0xd5,0xc1,0x00,0x00,0x00] + +v_sin_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_sin_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb5,0xd5,0xf0,0x00,0x00,0x08] + +v_sin_f32_e64 v5, src_scc mul:4 +// GFX1250: v_sin_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb5,0xd5,0xfd,0x00,0x00,0x10] + +v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_sqrt_f16_e64 v5, v1 +// GFX1250: v_sqrt_f16_e64 
v5, v1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] + +v_sqrt_f16_e64 v5, v255 +// GFX1250: v_sqrt_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] + +v_sqrt_f16_e64 v5, s1 +// GFX1250: v_sqrt_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] + +v_sqrt_f16_e64 v5, s105 +// GFX1250: v_sqrt_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] + +v_sqrt_f16_e64 v5, vcc_lo +// GFX1250: v_sqrt_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] + +v_sqrt_f16_e64 v5, vcc_hi +// GFX1250: v_sqrt_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] + +v_sqrt_f16_e64 v5, ttmp15 +// GFX1250: v_sqrt_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] + +v_sqrt_f16_e64 v5, m0 +// GFX1250: v_sqrt_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] + +v_sqrt_f16_e64 v5, exec_lo +// GFX1250: v_sqrt_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] + +v_sqrt_f16_e64 v5, exec_hi +// GFX1250: v_sqrt_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] + +v_sqrt_f16_e64 v5, null +// GFX1250: v_sqrt_f16_e64 v5, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] + +v_sqrt_f16_e64 v5, -1 +// GFX1250: v_sqrt_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] + +v_sqrt_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_sqrt_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] + +v_sqrt_f16_e64 v5, src_scc mul:4 +// GFX1250: v_sqrt_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] + +v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_sqrt_f16 v1.h, v128.l +// GFX1250: v_sqrt_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd5,0xd5,0x80,0x01,0x00,0x00] + +v_sqrt_f16 v1.h, v128.h +// GFX1250: 
v_sqrt_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd5,0xd5,0x80,0x01,0x00,0x00] + +v_sqrt_f32_e64 v5, v1 +// GFX1250: v_sqrt_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb3,0xd5,0x01,0x01,0x00,0x00] + +v_sqrt_f32_e64 v5, v255 +// GFX1250: v_sqrt_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb3,0xd5,0xff,0x01,0x00,0x00] + +v_sqrt_f32_e64 v5, s1 +// GFX1250: v_sqrt_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xb3,0xd5,0x01,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, s105 +// GFX1250: v_sqrt_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb3,0xd5,0x69,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, vcc_lo +// GFX1250: v_sqrt_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb3,0xd5,0x6a,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, vcc_hi +// GFX1250: v_sqrt_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb3,0xd5,0x6b,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, ttmp15 +// GFX1250: v_sqrt_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb3,0xd5,0x7b,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, m0 +// GFX1250: v_sqrt_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb3,0xd5,0x7d,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, exec_lo +// GFX1250: v_sqrt_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb3,0xd5,0x7e,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, exec_hi +// GFX1250: v_sqrt_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb3,0xd5,0x7f,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, null +// GFX1250: v_sqrt_f32_e64 v5, null ; encoding: [0x05,0x00,0xb3,0xd5,0x7c,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, -1 +// GFX1250: v_sqrt_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb3,0xd5,0xc1,0x00,0x00,0x00] + +v_sqrt_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_sqrt_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb3,0xd5,0xf0,0x00,0x00,0x08] + +v_sqrt_f32_e64 v5, src_scc mul:4 +// GFX1250: v_sqrt_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb3,0xd5,0xfd,0x00,0x00,0x10] + +v_sqrt_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_sqrt_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb3,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_sqrt_f64_e64 v[6:7], v[2:3] 
+// GFX1250: v_sqrt_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xb4,0xd5,0x02,0x01,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], v[254:255] +// GFX1250: v_sqrt_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xb4,0xd5,0xfe,0x01,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], s[2:3] +// GFX1250: v_sqrt_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xb4,0xd5,0x02,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], s[104:105] +// GFX1250: v_sqrt_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xb4,0xd5,0x68,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], vcc +// GFX1250: v_sqrt_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xb4,0xd5,0x6a,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_sqrt_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xb4,0xd5,0x7a,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], exec +// GFX1250: v_sqrt_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xb4,0xd5,0x7e,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], null +// GFX1250: v_sqrt_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xb4,0xd5,0x7c,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], -1 +// GFX1250: v_sqrt_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xb4,0xd5,0xc1,0x00,0x00,0x00] + +v_sqrt_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_sqrt_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xb4,0xd5,0xf0,0x00,0x00,0x08] + +v_sqrt_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_sqrt_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xb4,0xd5,0xfd,0x00,0x00,0x30] + +v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +v_trunc_f16_e64 v5, v1 +// GFX1250: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f16_e64 v5, v255 +// GFX1250: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] + +v_trunc_f16_e64 v5, s1 +// GFX1250: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, s105 
+// GFX1250: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, vcc_lo +// GFX1250: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, vcc_hi +// GFX1250: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, ttmp15 +// GFX1250: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, m0 +// GFX1250: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, exec_lo +// GFX1250: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, exec_hi +// GFX1250: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, null +// GFX1250: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, -1 +// GFX1250: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] + +v_trunc_f16_e64 v5, 0.5 mul:2 +// GFX1250: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] + +v_trunc_f16_e64 v5, src_scc mul:4 +// GFX1250: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] + +v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 +// GFX1250: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +v_trunc_f16 v1.h, v128.l +// GFX1250: v_trunc_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xdd,0xd5,0x80,0x01,0x00,0x00] + +v_trunc_f16 v1.h, v128.h +// GFX1250: v_trunc_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xdd,0xd5,0x80,0x01,0x00,0x00] + +v_trunc_f32_e64 v5, v1 +// GFX1250: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] + +v_trunc_f32_e64 v5, v255 +// GFX1250: v_trunc_f32_e64 v5, v255 ; encoding: 
[0x05,0x00,0xa1,0xd5,0xff,0x01,0x00,0x00] + +v_trunc_f32_e64 v5, s1 +// GFX1250: v_trunc_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, s105 +// GFX1250: v_trunc_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa1,0xd5,0x69,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, vcc_lo +// GFX1250: v_trunc_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd5,0x6a,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, vcc_hi +// GFX1250: v_trunc_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd5,0x6b,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, ttmp15 +// GFX1250: v_trunc_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa1,0xd5,0x7b,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, m0 +// GFX1250: v_trunc_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa1,0xd5,0x7d,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, exec_lo +// GFX1250: v_trunc_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa1,0xd5,0x7e,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, exec_hi +// GFX1250: v_trunc_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa1,0xd5,0x7f,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, null +// GFX1250: v_trunc_f32_e64 v5, null ; encoding: [0x05,0x00,0xa1,0xd5,0x7c,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, -1 +// GFX1250: v_trunc_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa1,0xd5,0xc1,0x00,0x00,0x00] + +v_trunc_f32_e64 v5, 0.5 mul:2 +// GFX1250: v_trunc_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa1,0xd5,0xf0,0x00,0x00,0x08] + +v_trunc_f32_e64 v5, src_scc mul:4 +// GFX1250: v_trunc_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa1,0xd5,0xfd,0x00,0x00,0x10] + +v_trunc_f32_e64 v255, -|0xaf123456| clamp div:2 +// GFX1250: v_trunc_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa1,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +v_trunc_f64_e64 v[6:7], v[2:3] +// GFX1250: v_trunc_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x97,0xd5,0x02,0x01,0x00,0x00] + +v_trunc_f64_e64 v[6:7], v[254:255] +// GFX1250: v_trunc_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x97,0xd5,0xfe,0x01,0x00,0x00] + +v_trunc_f64_e64 
v[6:7], s[2:3] +// GFX1250: v_trunc_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x97,0xd5,0x02,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], s[104:105] +// GFX1250: v_trunc_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x97,0xd5,0x68,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], vcc +// GFX1250: v_trunc_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x97,0xd5,0x6a,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], ttmp[14:15] +// GFX1250: v_trunc_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x97,0xd5,0x7a,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], exec +// GFX1250: v_trunc_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x97,0xd5,0x7e,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], null +// GFX1250: v_trunc_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x97,0xd5,0x7c,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], -1 +// GFX1250: v_trunc_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x97,0xd5,0xc1,0x00,0x00,0x00] + +v_trunc_f64_e64 v[6:7], 0.5 mul:2 +// GFX1250: v_trunc_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x97,0xd5,0xf0,0x00,0x00,0x08] + +v_trunc_f64_e64 v[6:7], -|src_scc| mul:4 +// GFX1250: v_trunc_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x97,0xd5,0xfd,0x00,0x00,0x30] + +v_trunc_f64_e64 v[254:255], 0xaf123456 clamp div:2 +// GFX1250: v_trunc_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x97,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] v_tanh_f32_e64 v5, v1 // GFX1250: v_tanh_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x9e,0xd5,0x01,0x01,0x00,0x00] @@ -223,6 +3916,54 @@ v_tanh_f16_e64 v255, -|0x8000| clamp div:2 v_tanh_f16 v5.l, v128.h // GFX1250: v_tanh_f16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0x9f,0xd5,0x80,0x01,0x00,0x00] +v_tanh_bf16_e64 v5, v1 +// GFX1250: v_tanh_bf16_e64 v5, v1 ; encoding: [0x05,0x00,0xca,0xd5,0x01,0x01,0x00,0x00] + +v_tanh_bf16_e64 v5, v255 +// GFX1250: v_tanh_bf16_e64 v5, v255 ; encoding: [0x05,0x00,0xca,0xd5,0xff,0x01,0x00,0x00] + +v_tanh_bf16_e64 v5, s1 +// GFX1250: v_tanh_bf16_e64 v5, s1 ; encoding: 
[0x05,0x00,0xca,0xd5,0x01,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, s105 +// GFX1250: v_tanh_bf16_e64 v5, s105 ; encoding: [0x05,0x00,0xca,0xd5,0x69,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, vcc_lo +// GFX1250: v_tanh_bf16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xca,0xd5,0x6a,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, vcc_hi +// GFX1250: v_tanh_bf16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xca,0xd5,0x6b,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, ttmp15 +// GFX1250: v_tanh_bf16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xca,0xd5,0x7b,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, m0 +// GFX1250: v_tanh_bf16_e64 v5, m0 ; encoding: [0x05,0x00,0xca,0xd5,0x7d,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, exec_lo +// GFX1250: v_tanh_bf16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xca,0xd5,0x7e,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, exec_hi +// GFX1250: v_tanh_bf16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xca,0xd5,0x7f,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, null +// GFX1250: v_tanh_bf16_e64 v5, null ; encoding: [0x05,0x00,0xca,0xd5,0x7c,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, -1 +// GFX1250: v_tanh_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xca,0xd5,0xc1,0x00,0x00,0x00] + +v_tanh_bf16_e64 v5, 0.5 mul:2 +// GFX1250: v_tanh_bf16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xca,0xd5,0xf0,0x00,0x00,0x08] + +v_tanh_bf16_e64 v5, src_scc mul:4 +// GFX1250: v_tanh_bf16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xca,0xd5,0xfd,0x00,0x00,0x10] + +v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 +// GFX1250: v_tanh_bf16_e64 v255, -|0x8000| clamp div:2 ; encoding: [0xff,0x81,0xca,0xd5,0xff,0x00,0x00,0x38,0x00,0x80,0x00,0x00] + +v_tanh_bf16 v5.l, v128.h +// GFX1250: v_tanh_bf16_e64 v5.l, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xca,0xd5,0x80,0x01,0x00,0x00] + v_prng_b32_e64 v5, v1 // GFX1250: v_prng_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xcb,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s index 
b2c2943e2a182..78afa10b984cb 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8-fake16.s @@ -186,10 +186,6 @@ v_cvt_f16_bf8 v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f16_bf8_e64_dpp v1, v2 byte_sel:1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x10,0xf8,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f16_bf8 v150, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cvt_f16_bf8_e64_dpp v150, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf8,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cvt_f16_bf8 v150, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX1250: v_cvt_f16_bf8_e64_dpp v150, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x96,0x18,0xf8,0xd5,0xea,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s index e3c7c0f8cbc81..0414421f0a906 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop3_from_vop1_dpp8.s @@ -202,18 +202,6 @@ v_cos_bf16_e64_dpp v5.h, v128.h dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU -v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cvt_f32_bf16_e64_dpp v5, v128.h dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v128.h 
op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - -v_cvt_f16_bf8 v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX1250: v_cvt_f16_bf8_e64_dpp v150.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x96,0x00,0xf8,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] -// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU - v_cvt_f16_bf8 v1.l, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f16_bf8_e64_dpp v1.l, v2 byte_sel:3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x18,0xf8,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU @@ -274,6 +262,14 @@ v_cvt_f16_fp8 v128.l, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f16_fp8_e64_dpp v128.l, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x80,0x00,0xf7,0xd5,0xe9,0x00,0x00,0x00,0x02,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU +v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + +v_cvt_f32_bf16_e64_dpp v5, v128.h dpp8:[7,6,5,4,3,2,1,0] +// GFX1250: v_cvt_f32_bf16_e64_dpp v5, v128.h op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU + v_cvt_f32_fp8 v1, v3 clamp dpp8:[7,6,5,4,3,2,1,0] // GFX1250: v_cvt_f32_fp8_e64_dpp v1, v3 clamp dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x80,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] // GFX12-ERR: :[[@LINE-2]]:22: error: invalid operand for instruction diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt index 
5b905820844af..07dbbddcdc2f9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop1.txt @@ -810,6 +810,52 @@ 0x03,0xd9,0x02,0x7e # GFX1250: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] +0x03,0xde,0x04,0x7e +# GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e] + +0x83,0xde,0x04,0x7e +# GFX1250: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e] + +0x03,0xdf,0x04,0x7e +# GFX1250-REAL16: v_cvt_pk_f32_bf8_e32 v[2:3], v3.l ; encoding: [0x03,0xdf,0x04,0x7e] +# GFX1250-FAKE16: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e] + +0x05,0xde,0x08,0x7e +# GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], s5 ; encoding: [0x05,0xde,0x08,0x7e] + +0x83,0xde,0x08,0x7e +# GFX1250: v_cvt_pk_f32_bf8_e32 v[4:5], 3 ; encoding: [0x83,0xde,0x08,0x7e] + +0x03,0xdf,0x08,0x7e +# GFX1250-REAL16: v_cvt_pk_f32_bf8_e32 v[4:5], v3.l ; encoding: [0x03,0xdf,0x08,0x7e] +# GFX1250-FAKE16: v_cvt_pk_f32_bf8_e32 v[4:5], v3 ; encoding: [0x03,0xdf,0x08,0x7e] + +0xff,0xdf,0x08,0x7e +# GFX1250-REAL16: v_cvt_pk_f32_bf8_e32 v[4:5], v127.h ; encoding: [0xff,0xdf,0x08,0x7e] +# GFX1250-FAKE16: v_cvt_pk_f32_bf8_e32 v[4:5], v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xdf,0x08,0x7e] + +0x7f,0xdf,0x08,0x7e +# GFX1250-REAL16: v_cvt_pk_f32_bf8_e32 v[4:5], v127.l ; encoding: [0x7f,0xdf,0x08,0x7e] +# GFX1250-FAKE16: v_cvt_pk_f32_bf8_e32 v[4:5], v127 ; encoding: [0x7f,0xdf,0x08,0x7e] + +0x03,0xdc,0x04,0x7e +# GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e] + +0x83,0xdc,0x04,0x7e +# GFX1250: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e] + +0x03,0xdd,0x04,0x7e +# GFX1250-REAL16: v_cvt_pk_f32_fp8_e32 v[2:3], v3.l ; encoding: [0x03,0xdd,0x04,0x7e] +# GFX1250-FAKE16: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e] + +0xff,0xdd,0x08,0x7e +# GFX1250-REAL16: v_cvt_pk_f32_fp8_e32 v[4:5], v127.h ; 
encoding: [0xff,0xdd,0x08,0x7e] +# GFX1250-FAKE16: v_cvt_pk_f32_fp8_e32 v[4:5], v255/*Invalid register, operand has 'VS_32_Lo128' register class*/ ; encoding: [0xff,0xdd,0x08,0x7e] + +0x7f,0xdd,0x08,0x7e +# GFX1250-REAL16: v_cvt_pk_f32_fp8_e32 v[4:5], v127.l ; encoding: [0x7f,0xdd,0x08,0x7e] +# GFX1250-FAKE16: v_cvt_pk_f32_fp8_e32 v[4:5], v127 ; encoding: [0x7f,0xdd,0x08,0x7e] + 0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00 # GFX1250-REAL16: v_sat_pk4_i4_i8_e32 v1.l, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00] # GFX1250-FAKE16: v_sat_pk4_i4_i8_e32 v1, 0x1234 ; encoding: [0xff,0xe6,0x02,0x7e,0x34,0x12,0x00,0x00] @@ -845,9 +891,8 @@ # GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, v2 ; encoding: [0x02,0xe9,0x02,0x7e] # GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, v2 ; encoding: [0x02,0xe9,0x02,0x7e] -0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00 -# GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.l, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00] -# GFX1250-FAKE16: v_sat_pk4_u4_u8_e32 v1, 0x1234 ; encoding: [0xff,0xe8,0x02,0x7e,0x34,0x12,0x00,0x00] - 0x02,0xe9,0x02,0x7f # GFX1250-REAL16: v_sat_pk4_u4_u8_e32 v1.h, v2 ; encoding: [0x02,0xe9,0x02,0x7f] + +0x02,0x93,0x02,0x7e +# GFX1250: v_permlane16_swap_b32_e32 v1, v2 ; encoding: [0x02,0x93,0x02,0x7e] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt index 1719592c3dccd..67747a65ee52a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1.txt @@ -2,6 +2,4018 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-REAL16 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s +0xff,0x00,0xb8,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX1250: 
v_bfrev_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb8,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xb8,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xb8,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb8,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb8,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb8,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xb8,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, null ; encoding: [0x05,0x00,0xb8,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xb8,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb8,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb8,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb8,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xb8,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb8,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xb8,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_bfrev_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb8,0xd5,0x6a,0x00,0x00,0x00] + 
+0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdc,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdc,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdc,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdc,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, null ; encoding: [0x05,0x00,0xdc,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, s1 ; encoding: 
[0x05,0x00,0xdc,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdc,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdc,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdc,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdc,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdc,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdc,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdc,0xd5,0x6a,0x00,0x00,0x00] + +0x05,0x00,0xdc,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.l, v128.l ; encoding: [0x05,0x00,0xdc,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, v128 ; encoding: [0x05,0x00,0xdc,0xd5,0x80,0x01,0x00,0x00] + 
+0x05,0x48,0xdc,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_ceil_f16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xdc,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_ceil_f16_e64 v5, v128 ; encoding: [0x05,0x00,0xdc,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xa2,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_ceil_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa2,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xa2,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa2,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_ceil_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa2,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xa2,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa2,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa2,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa2,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, null ; encoding: [0x05,0x00,0xa2,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa2,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa2,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_ceil_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa2,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xa2,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa2,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa2,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, 
v255 ; encoding: [0x05,0x00,0xa2,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa2,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xa2,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_ceil_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa2,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0x98,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_ceil_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x98,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x06,0x00,0x98,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_ceil_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x98,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x01,0x98,0xd5,0xfd,0x00,0x00,0x30 +# GFX1250: v_ceil_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x98,0xd5,0xfd,0x00,0x00,0x30] + +0x06,0x00,0x98,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_ceil_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x98,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0x98,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_ceil_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x98,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0x98,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_ceil_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x98,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0x98,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_ceil_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x98,0xd5,0x68,0x00,0x00,0x00] + +0x06,0x00,0x98,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_ceil_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x98,0xd5,0x02,0x00,0x00,0x00] + +0x06,0x00,0x98,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_ceil_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x98,0xd5,0x7a,0x00,0x00,0x00] + +0x06,0x00,0x98,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_ceil_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x98,0xd5,0xfe,0x01,0x00,0x00] + +0x06,0x00,0x98,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_ceil_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x98,0xd5,0x02,0x01,0x00,0x00] + +0x06,0x00,0x98,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: 
v_ceil_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x98,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x00,0xbb,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX1250: v_cls_i32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbb,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xbb,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, -1 ; encoding: [0x05,0x00,0xbb,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbb,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, m0 ; encoding: [0x05,0x00,0xbb,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, null ; encoding: [0x05,0x00,0xbb,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, s1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, s105 ; encoding: [0x05,0x00,0xbb,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbb,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xbb,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, v1 ; encoding: [0x05,0x00,0xbb,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, v255 ; encoding: [0x05,0x00,0xbb,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xbb,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xbb,0xd5,0x6b,0x00,0x00,0x00] + 
+0x05,0x00,0xbb,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cls_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbb,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX1250: v_clz_i32_u32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb9,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xb9,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, -1 ; encoding: [0x05,0x00,0xb9,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb9,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0xb9,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, null ; encoding: [0x05,0x00,0xb9,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, s1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0xb9,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb9,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb9,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0xb9,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0xb9,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0x6b,0x00,0x00,0x00 +# 
GFX1250: v_clz_i32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb9,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xb9,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_clz_i32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb9,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe1,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe1,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_cos_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe1,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe1,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, null ; encoding: 
[0x05,0x00,0xe1,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe1,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_cos_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe1,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe1,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe1,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe1,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe1,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe1,0xd5,0x6a,0x00,0x00,0x00] + 
+0x05,0x00,0xe1,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.l, v128.l ; encoding: [0x05,0x00,0xe1,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, v128 ; encoding: [0x05,0x00,0xe1,0xd5,0x80,0x01,0x00,0x00] + +0x05,0x48,0xe1,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cos_f16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xe1,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cos_f16_e64 v5, v128 ; encoding: [0x05,0x00,0xe1,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xb6,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_cos_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb6,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xb6,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb6,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cos_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb6,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xb6,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb6,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb6,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb6,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, null ; encoding: [0x05,0x00,0xb6,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb6,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cos_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb6,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xb6,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xb6,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb6,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb6,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb6,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xb6,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cos_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb6,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX1250: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xba,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xba,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xba,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xba,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xba,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xba,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xba,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xba,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xba,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xba,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xba,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, null ; encoding: [0x05,0x00,0xba,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xba,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xba,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xba,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xba,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xba,0xd5,0xfd,0x00,0x00,0x00] + 
+0x05,0x00,0xba,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xba,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xba,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xba,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xba,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xba,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xba,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xba,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xba,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_ctz_i32_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xba,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0x8a,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250-REAL16: v_cvt_f16_f32_e64 v255.l, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x8a,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x8a,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x8a,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, -1 ; encoding: [0x05,0x00,0x8a,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8a,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0x8a,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8a,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x8a,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, exec_lo ; encoding: 
[0x05,0x00,0x8a,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, m0 ; encoding: [0x05,0x00,0x8a,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8a,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, null ; encoding: [0x05,0x00,0x8a,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, null ; encoding: [0x05,0x00,0x8a,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, s1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, s105 ; encoding: [0x05,0x00,0x8a,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8a,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0x8a,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8a,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x8a,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0x8a,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8a,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, v255 ; encoding: [0x05,0x00,0x8a,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, v255 ; encoding: 
[0x05,0x00,0x8a,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8a,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x8a,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8a,0xd5,0x6a,0x00,0x00,0x00] + +0x80,0x00,0x8a,0xd5,0x0f,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v128.l, v15 ; encoding: [0x80,0x00,0x8a,0xd5,0x0f,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v128, v15 ; encoding: [0x80,0x00,0x8a,0xd5,0x0f,0x01,0x00,0x00] + +0x80,0x40,0x8a,0xd5,0x0f,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_f32_e64 v128.h, v15 op_sel:[0,1] ; encoding: [0x80,0x40,0x8a,0xd5,0x0f,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_f32_e64 v128, v15 ; encoding: [0x80,0x00,0x8a,0xd5,0x0f,0x01,0x00,0x00] + +0xff,0x80,0xd1,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v255.l, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd1,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v255, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd1,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xd1,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0xff,0x00,0x00,0x08,0x00,0x38,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, 0x3800 mul:2 ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x00,0x00,0x08,0x00,0x38,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, 0x3800 mul:2 ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x00,0x00,0x08,0x00,0x38,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0x7f,0x00,0x00,0x00 +# 
GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd1,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xd1,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, null ; encoding: [0x05,0x00,0xd1,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, null ; encoding: [0x05,0x00,0xd1,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd1,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xd1,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd1,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd1,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xd1,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd1,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd1,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0x01,0x01,0x00,0x00 +# 
GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xd1,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xd1,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd1,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xd1,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd1,0xd5,0x6a,0x00,0x00,0x00] + +0x80,0x00,0xd1,0xd5,0x0f,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v128.l, v15.l ; encoding: [0x80,0x00,0xd1,0xd5,0x0f,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v128, v15 ; encoding: [0x80,0x00,0xd1,0xd5,0x0f,0x01,0x00,0x00] + +0x80,0x48,0xd1,0xd5,0x0f,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_i16_e64 v128.h, v15.h op_sel:[1,1] ; encoding: [0x80,0x48,0xd1,0xd5,0x0f,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_i16_e64 v128, v15 ; encoding: [0x80,0x00,0xd1,0xd5,0x0f,0x01,0x00,0x00] + +0xff,0x80,0xd0,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v255.l, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd0,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v255, 0xfe0b clamp div:2 ; encoding: [0xff,0x80,0xd0,0xd5,0xff,0x00,0x00,0x18,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, -1 ; encoding: 
[0x05,0x00,0xd0,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0xff,0x00,0x00,0x08,0x00,0x38,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, 0x3800 mul:2 ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x00,0x00,0x08,0x00,0x38,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, 0x3800 mul:2 ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x00,0x00,0x08,0x00,0x38,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd0,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, m0 ; encoding: [0x05,0x00,0xd0,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, null ; encoding: [0x05,0x00,0xd0,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, null ; encoding: [0x05,0x00,0xd0,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, s1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd0,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, s105 ; encoding: [0x05,0x00,0xd0,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd0,0xd5,0xfd,0x00,0x00,0x10] +# 
GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd0,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xd0,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd0,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd0,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, v1 ; encoding: [0x05,0x00,0xd0,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, v255 ; encoding: [0x05,0x00,0xd0,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd0,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xd0,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd0,0xd5,0x6a,0x00,0x00,0x00] + +0x80,0x00,0xd0,0xd5,0x0f,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v128.l, v15.l ; encoding: [0x80,0x00,0xd0,0xd5,0x0f,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v128, v15 ; encoding: [0x80,0x00,0xd0,0xd5,0x0f,0x01,0x00,0x00] + +0x80,0x48,0xd0,0xd5,0x0f,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_u16_e64 v128.h, v15.h op_sel:[1,1] ; encoding: [0x80,0x48,0xd0,0xd5,0x0f,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_u16_e64 v128, v15 ; encoding: [0x80,0x00,0xd0,0xd5,0x0f,0x01,0x00,0x00] + +0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] + 
+0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] + +0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] + +0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] + +0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] + +0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] + +0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] + +0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] + +0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] + +0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] + +0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] + +0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] + +0xff,0x81,0x8b,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0x8b,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v5, -1 ; encoding: [0x05,0x00,0x8b,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f32_f16_e64 v5, 0.5 mul:2 ; encoding: 
[0x05,0x00,0x8b,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x8b,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8b,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8b,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v5, m0 ; encoding: [0x05,0x00,0x8b,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v5, null ; encoding: [0x05,0x00,0x8b,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v5, s1 ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v5, s105 ; encoding: [0x05,0x00,0x8b,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_f32_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8b,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x8b,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8b,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f32_f16_e64 v5, v1.l ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f32_f16_e64 v5, v1 ; encoding: [0x05,0x00,0x8b,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f32_f16_e64 v5, v255.l ; encoding: [0x05,0x00,0x8b,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f32_f16_e64 v5, v255 ; encoding: [0x05,0x00,0x8b,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8b,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x8b,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8b,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x00,0x8b,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f32_f16_e64 v1, v128.l ; 
encoding: [0x01,0x00,0x8b,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f32_f16_e64 v1, v128 ; encoding: [0x01,0x00,0x8b,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x08,0x8b,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f32_f16_e64 v1, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0x8b,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f32_f16_e64 v1, v128 ; encoding: [0x01,0x00,0x8b,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x80,0x8f,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_f32_f64_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x8f,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x8f,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x8f,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x01,0x8f,0xd5,0xfd,0x00,0x00,0x30 +# GFX1250: v_cvt_f32_f64_e64 v5, -|src_scc| mul:4 ; encoding: [0x05,0x01,0x8f,0xd5,0xfd,0x00,0x00,0x30] + +0x05,0x00,0x8f,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f32_f64_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8f,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x8f,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x8f,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x8f,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f64_e64 v5, null ; encoding: [0x05,0x00,0x8f,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x8f,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x8f,0xd5,0x68,0x00,0x00,0x00] + +0x05,0x00,0x8f,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x8f,0xd5,0x02,0x00,0x00,0x00] + +0x05,0x00,0x8f,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x8f,0xd5,0x7a,0x00,0x00,0x00] + +0x05,0x00,0x8f,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x8f,0xd5,0xfe,0x01,0x00,0x00] + +0x05,0x00,0x8f,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_f64_e64 v5, v[2:3] ; encoding: 
[0x05,0x00,0x8f,0xd5,0x02,0x01,0x00,0x00] + +0x05,0x00,0x8f,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x8f,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] + +0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] + +0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] + +0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] + +0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] + +0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] + +0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] + +0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] + +0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] + +0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] + +0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] + +0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] + +0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, v3 clamp ; encoding: 
[0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00] + +0x01,0x90,0xec,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 clamp ; encoding: [0x01,0x90,0xec,0xd5,0x03,0x01,0x00,0x00] + +0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00] + +0xff,0x80,0x85,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_f32_i32_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x85,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x85,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, -1 ; encoding: [0x05,0x00,0x85,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x85,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f32_i32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x85,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x85,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x85,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x85,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x85,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x85,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, m0 ; encoding: [0x05,0x00,0x85,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x85,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, null ; encoding: [0x05,0x00,0x85,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x85,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, s1 ; encoding: [0x05,0x00,0x85,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x85,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, s105 ; encoding: [0x05,0x00,0x85,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x85,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_f32_i32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x85,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x85,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x85,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x85,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, v1 ; 
encoding: [0x05,0x00,0x85,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x85,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, v255 ; encoding: [0x05,0x00,0x85,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x85,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x85,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x85,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_i32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x85,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x80,0x86,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_f32_u32_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x86,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x86,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, -1 ; encoding: [0x05,0x00,0x86,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x86,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f32_u32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x86,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x86,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x86,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x86,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x86,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x86,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, m0 ; encoding: [0x05,0x00,0x86,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x86,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, null ; encoding: [0x05,0x00,0x86,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x86,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, s1 ; encoding: [0x05,0x00,0x86,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x86,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, s105 ; encoding: [0x05,0x00,0x86,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x86,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_f32_u32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x86,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x86,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0x86,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x86,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, v1 ; encoding: [0x05,0x00,0x86,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x86,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, v255 ; encoding: [0x05,0x00,0x86,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x86,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x86,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x86,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_u32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x86,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x80,0x91,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_f32_ubyte0_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x91,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x91,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, -1 ; encoding: [0x05,0x00,0x91,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x91,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x91,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x91,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, exec_hi ; encoding: [0x05,0x00,0x91,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x91,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, exec_lo ; encoding: [0x05,0x00,0x91,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x91,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, m0 ; encoding: [0x05,0x00,0x91,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x91,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, null ; encoding: [0x05,0x00,0x91,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x91,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, s1 ; encoding: [0x05,0x00,0x91,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x91,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, s105 ; encoding: [0x05,0x00,0x91,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x91,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, src_scc mul:4 ; 
encoding: [0x05,0x00,0x91,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x91,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x91,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x91,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, v1 ; encoding: [0x05,0x00,0x91,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x91,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, v255 ; encoding: [0x05,0x00,0x91,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x91,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x91,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x91,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte0_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x91,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x80,0x92,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_f32_ubyte1_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x92,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x92,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, -1 ; encoding: [0x05,0x00,0x92,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x92,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x92,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x92,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, exec_hi ; encoding: [0x05,0x00,0x92,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x92,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, exec_lo ; encoding: [0x05,0x00,0x92,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x92,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, m0 ; encoding: [0x05,0x00,0x92,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x92,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, null ; encoding: [0x05,0x00,0x92,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x92,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, s1 ; encoding: [0x05,0x00,0x92,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x92,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: 
v_cvt_f32_ubyte1_e64 v5, s105 ; encoding: [0x05,0x00,0x92,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x92,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x92,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x92,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x92,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x92,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, v1 ; encoding: [0x05,0x00,0x92,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x92,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, v255 ; encoding: [0x05,0x00,0x92,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x92,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x92,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x92,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte1_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x92,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x80,0x93,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_f32_ubyte2_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x93,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x93,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, -1 ; encoding: [0x05,0x00,0x93,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x93,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x93,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x93,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, exec_hi ; encoding: [0x05,0x00,0x93,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x93,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, exec_lo ; encoding: [0x05,0x00,0x93,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x93,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, m0 ; encoding: [0x05,0x00,0x93,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x93,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, null ; encoding: [0x05,0x00,0x93,0xd5,0x7c,0x00,0x00,0x00] + 
+0x05,0x00,0x93,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, s1 ; encoding: [0x05,0x00,0x93,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x93,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, s105 ; encoding: [0x05,0x00,0x93,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x93,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x93,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x93,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x93,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x93,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, v1 ; encoding: [0x05,0x00,0x93,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x93,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, v255 ; encoding: [0x05,0x00,0x93,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x93,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x93,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x93,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte2_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x93,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x80,0x94,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_f32_ubyte3_e64 v255, 0xaf123456 clamp div:2 ; encoding: [0xff,0x80,0x94,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x94,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, -1 ; encoding: [0x05,0x00,0x94,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x94,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x94,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x94,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, exec_hi ; encoding: [0x05,0x00,0x94,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x94,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, exec_lo ; encoding: [0x05,0x00,0x94,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x94,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, m0 ; encoding: 
[0x05,0x00,0x94,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x94,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, null ; encoding: [0x05,0x00,0x94,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x94,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, s1 ; encoding: [0x05,0x00,0x94,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x94,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, s105 ; encoding: [0x05,0x00,0x94,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x94,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x94,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x94,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x94,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x94,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, v1 ; encoding: [0x05,0x00,0x94,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x94,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, v255 ; encoding: [0x05,0x00,0x94,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x94,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x94,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x94,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_ubyte3_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x94,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x81,0x90,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_f64_f32_e64 v[254:255], -|0xaf123456| clamp div:2 ; encoding: [0xfe,0x81,0x90,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x06,0x00,0x90,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x90,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x00,0x90,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x90,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0x90,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], exec_hi ; encoding: [0x06,0x00,0x90,0xd5,0x7f,0x00,0x00,0x00] + +0x06,0x00,0x90,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: 
v_cvt_f64_f32_e64 v[6:7], exec_lo ; encoding: [0x06,0x00,0x90,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0x90,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], m0 ; encoding: [0x06,0x00,0x90,0xd5,0x7d,0x00,0x00,0x00] + +0x06,0x00,0x90,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], null ; encoding: [0x06,0x00,0x90,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0x90,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], s1 ; encoding: [0x06,0x00,0x90,0xd5,0x01,0x00,0x00,0x00] + +0x06,0x00,0x90,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], s105 ; encoding: [0x06,0x00,0x90,0xd5,0x69,0x00,0x00,0x00] + +0x06,0x00,0x90,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], src_scc mul:4 ; encoding: [0x06,0x00,0x90,0xd5,0xfd,0x00,0x00,0x10] + +0x06,0x00,0x90,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], ttmp15 ; encoding: [0x06,0x00,0x90,0xd5,0x7b,0x00,0x00,0x00] + +0x06,0x00,0x90,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], v1 ; encoding: [0x06,0x00,0x90,0xd5,0x01,0x01,0x00,0x00] + +0x06,0x00,0x90,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], v255 ; encoding: [0x06,0x00,0x90,0xd5,0xff,0x01,0x00,0x00] + +0x06,0x00,0x90,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], vcc_hi ; encoding: [0x06,0x00,0x90,0xd5,0x6b,0x00,0x00,0x00] + +0x06,0x00,0x90,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_f32_e64 v[6:7], vcc_lo ; encoding: [0x06,0x00,0x90,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0x84,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_f64_i32_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x84,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x06,0x00,0x84,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x84,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x00,0x84,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x84,0xd5,0xf0,0x00,0x00,0x08] + 
+0x06,0x00,0x84,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], exec_hi ; encoding: [0x06,0x00,0x84,0xd5,0x7f,0x00,0x00,0x00] + +0x06,0x00,0x84,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], exec_lo ; encoding: [0x06,0x00,0x84,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0x84,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], m0 ; encoding: [0x06,0x00,0x84,0xd5,0x7d,0x00,0x00,0x00] + +0x06,0x00,0x84,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], null ; encoding: [0x06,0x00,0x84,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0x84,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], s1 ; encoding: [0x06,0x00,0x84,0xd5,0x01,0x00,0x00,0x00] + +0x06,0x00,0x84,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], s105 ; encoding: [0x06,0x00,0x84,0xd5,0x69,0x00,0x00,0x00] + +0x06,0x00,0x84,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], src_scc mul:4 ; encoding: [0x06,0x00,0x84,0xd5,0xfd,0x00,0x00,0x10] + +0x06,0x00,0x84,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], ttmp15 ; encoding: [0x06,0x00,0x84,0xd5,0x7b,0x00,0x00,0x00] + +0x06,0x00,0x84,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], v1 ; encoding: [0x06,0x00,0x84,0xd5,0x01,0x01,0x00,0x00] + +0x06,0x00,0x84,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], v255 ; encoding: [0x06,0x00,0x84,0xd5,0xff,0x01,0x00,0x00] + +0x06,0x00,0x84,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], vcc_hi ; encoding: [0x06,0x00,0x84,0xd5,0x6b,0x00,0x00,0x00] + +0x06,0x00,0x84,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_i32_e64 v[6:7], vcc_lo ; encoding: [0x06,0x00,0x84,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0x96,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_f64_u32_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x96,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x06,0x00,0x96,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], -1 ; encoding: 
[0x06,0x00,0x96,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x00,0x96,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x96,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0x96,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], exec_hi ; encoding: [0x06,0x00,0x96,0xd5,0x7f,0x00,0x00,0x00] + +0x06,0x00,0x96,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], exec_lo ; encoding: [0x06,0x00,0x96,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0x96,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], m0 ; encoding: [0x06,0x00,0x96,0xd5,0x7d,0x00,0x00,0x00] + +0x06,0x00,0x96,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], null ; encoding: [0x06,0x00,0x96,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0x96,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], s1 ; encoding: [0x06,0x00,0x96,0xd5,0x01,0x00,0x00,0x00] + +0x06,0x00,0x96,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], s105 ; encoding: [0x06,0x00,0x96,0xd5,0x69,0x00,0x00,0x00] + +0x06,0x00,0x96,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], src_scc mul:4 ; encoding: [0x06,0x00,0x96,0xd5,0xfd,0x00,0x00,0x10] + +0x06,0x00,0x96,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], ttmp15 ; encoding: [0x06,0x00,0x96,0xd5,0x7b,0x00,0x00,0x00] + +0x06,0x00,0x96,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], v1 ; encoding: [0x06,0x00,0x96,0xd5,0x01,0x01,0x00,0x00] + +0x06,0x00,0x96,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], v255 ; encoding: [0x06,0x00,0x96,0xd5,0xff,0x01,0x00,0x00] + +0x06,0x00,0x96,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], vcc_hi ; encoding: [0x06,0x00,0x96,0xd5,0x6b,0x00,0x00,0x00] + +0x06,0x00,0x96,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f64_u32_e64 v[6:7], vcc_lo ; encoding: [0x06,0x00,0x96,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x01,0x8d,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_floor_i32_f32_e64 v255, 
-|0xaf123456| ; encoding: [0xff,0x01,0x8d,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x8d,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8d,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8d,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8d,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8d,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8d,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8d,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8d,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8d,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8d,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_floor_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8d,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x8d,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: 
v_cvt_floor_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8d,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0xd3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v255.l, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v255, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd3,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd3,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xd3,0xd5,0xf0,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xd3,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd3,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd3,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd3,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xd3,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, s1 ; encoding: 
[0x05,0x00,0xd3,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd3,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd3,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xd3,0xd5,0xfd,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xd3,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd3,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd3,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd3,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd3,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd3,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd3,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xd3,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd3,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x00,0xd3,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v1.l, v128.l ; encoding: 
[0x01,0x00,0xd3,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd3,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xd3,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_i16_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd3,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i16_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd3,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0x88,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_i32_f32_e64 v255, -|0xaf123456| clamp ; encoding: [0xff,0x81,0x88,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x88,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x88,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x88,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x88,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0x88,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x88,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x88,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x88,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x88,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x88,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x88,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x88,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x88,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x88,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x88,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x88,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x88,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x88,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0x88,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x88,0xd5,0x7b,0x00,0x00,0x00] + 
+0x05,0x00,0x88,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x88,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x88,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x88,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x88,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x88,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x88,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x88,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_i32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x83,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x83,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x83,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x01,0x83,0xd5,0xfd,0x00,0x00,0x20 +# GFX1250: v_cvt_i32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0x83,0xd5,0xfd,0x00,0x00,0x20] + +0x05,0x00,0x83,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0x83,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0x83,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x83,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x83,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f64_e64 v5, null ; encoding: [0x05,0x00,0x83,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x83,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x83,0xd5,0x68,0x00,0x00,0x00] + +0x05,0x00,0x83,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x83,0xd5,0x02,0x00,0x00,0x00] + +0x05,0x00,0x83,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x83,0xd5,0x7a,0x00,0x00,0x00] + +0x05,0x00,0x83,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_cvt_i32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x83,0xd5,0xfe,0x01,0x00,0x00] + 
+0x05,0x00,0x83,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_cvt_i32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0x83,0xd5,0x02,0x01,0x00,0x00] + +0x05,0x00,0x83,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x83,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xea,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xea,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] + +0x05,0x00,0xea,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xea,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xea,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xea,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xea,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xea,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xea,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, null ; encoding: [0x05,0x00,0xea,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xea,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xea,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xea,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xea,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xea,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xea,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_i32_i16_e64 v5, v1.l ; encoding: 
[0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i32_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xea,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_i32_i16_e64 v5, v255.l ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i32_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xea,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xea,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xea,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xea,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_i32_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xea,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x00,0xea,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_i32_i16_e64 v1, v128.l ; encoding: [0x01,0x00,0xea,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i32_i16_e64 v1, v128 ; encoding: [0x01,0x00,0xea,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x08,0xea,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_i32_i16_e64 v1, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xea,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_i32_i16_e64 v1, v128 ; encoding: [0x01,0x00,0xea,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x01,0x8c,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_nearest_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0x8c,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x8c,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x8c,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x8c,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: 
v_cvt_nearest_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x8c,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0x8c,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x8c,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x8c,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8c,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8c,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x8c,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8c,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x8c,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_nearest_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8c,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x01,0xe3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v255.l, -|0xfe0b| ; encoding: [0xff,0x01,0xe3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xe3,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe3,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe3,0xd5,0xc1,0x00,0x00,0x00] + 
+0x05,0x00,0xe3,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe3,0xd5,0xf0,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe3,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe3,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe3,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe3,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xe3,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe3,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe3,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe3,0xd5,0xfd,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, src_scc ; 
encoding: [0x05,0x00,0xe3,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe3,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe3,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe3,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe3,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe3,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe3,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xe3,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe3,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x00,0xe3,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xe3,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xe3,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x08,0xe3,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_i16_f16_e64 v1.l, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xe3,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_i16_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xe3,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x01,0xe4,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v255.l, -|0xfe0b| ; 
encoding: [0xff,0x01,0xe4,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xe4,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe4,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe4,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe4,0xd5,0xf0,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe4,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe4,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe4,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe4,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, null ; encoding: [0x05,0x00,0xe4,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x00,0x00,0x00] + 
+0x05,0x00,0xe4,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe4,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe4,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe4,0xd5,0xfd,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe4,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe4,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe4,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe4,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe4,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe4,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe4,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xe4,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe4,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x00,0xe4,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xe4,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v1, 
v128 ; encoding: [0x01,0x00,0xe4,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x08,0xe4,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_norm_u16_f16_e64 v1.l, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xe4,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_norm_u16_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xe4,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x80,0x8e,0xd5,0xff,0x00,0x00,0x18,0x4f,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v255, 0x4f clamp div:2 ; encoding: [0xff,0x80,0x8e,0xd5,0xff,0x00,0x00,0x18,0x4f,0x00,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, -1 ; encoding: [0x05,0x00,0x8e,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_cvt_off_f32_i4_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0x8e,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0x8e,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, exec_hi ; encoding: [0x05,0x00,0x8e,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, exec_lo ; encoding: [0x05,0x00,0x8e,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, m0 ; encoding: [0x05,0x00,0x8e,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, null ; encoding: [0x05,0x00,0x8e,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, s1 ; encoding: [0x05,0x00,0x8e,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, s105 ; encoding: [0x05,0x00,0x8e,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_cvt_off_f32_i4_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0x8e,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0x8e,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x8e,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: 
v_cvt_off_f32_i4_e64 v5, v1 ; encoding: [0x05,0x00,0x8e,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, v255 ; encoding: [0x05,0x00,0x8e,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x8e,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x8e,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_off_f32_i4_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x8e,0xd5,0x6a,0x00,0x00,0x00] + +0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] + +0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +0x02,0x08,0xef,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_bf8_e64 v[2:3], v3.l ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_bf8_e64 v[2:3], v3.h op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +0x04,0x08,0xef,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], 3 op_sel:[1,0] ; encoding: [0x04,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +0x04,0x00,0xef,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_bf8_e64 v[4:5], s3 ; encoding: [0x04,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +0x04,0x08,0xef,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_bf8_e64 
v[4:5], s3 op_sel:[1,0] ; encoding: [0x04,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +0x04,0x00,0xef,0xd5,0x03,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_bf8_e64 v[4:5], v3.l ; encoding: [0x04,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_bf8_e64 v[4:5], v3 ; encoding: [0x04,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +0x04,0x08,0xef,0xd5,0x03,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_bf8_e64 v[4:5], v3.h op_sel:[1,0] ; encoding: [0x04,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_bf8_e64 v[4:5], v3 op_sel:[1,0] ; encoding: [0x04,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +0x02,0x08,0xef,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_bf8_e64 v[2:3], v128.h op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_bf8_e64 v[2:3], v128 op_sel:[1,0] ; encoding: [0x02,0x08,0xef,0xd5,0x80,0x01,0x00,0x00] + +0x02,0x00,0xef,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_bf8_e64 v[2:3], v128.l ; encoding: [0x02,0x00,0xef,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_bf8_e64 v[2:3], v128 ; encoding: [0x02,0x00,0xef,0xd5,0x80,0x01,0x00,0x00] + +0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +0x02,0x08,0xee,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x00,0x00,0x00] + +0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_fp8_e64 v[2:3], v3.l ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: 
[0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_fp8_e64 v[2:3], v3.h op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +0x04,0x00,0xee,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], 3 ; encoding: [0x04,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +0x04,0x08,0xee,0xd5,0x83,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], 3 op_sel:[1,0] ; encoding: [0x04,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +0x04,0x00,0xee,0xd5,0x03,0x00,0x00,0x00 +# GFX1250: v_cvt_pk_f32_fp8_e64 v[4:5], s3 ; encoding: [0x04,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +0x04,0x00,0xee,0xd5,0x03,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_fp8_e64 v[4:5], v3.l ; encoding: [0x04,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_fp8_e64 v[4:5], v3 ; encoding: [0x04,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +0x04,0x08,0xee,0xd5,0x03,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_fp8_e64 v[4:5], v3.h op_sel:[1,0] ; encoding: [0x04,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_fp8_e64 v[4:5], v3 op_sel:[1,0] ; encoding: [0x04,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +0x02,0x08,0xee,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_fp8_e64 v[2:3], v128.h op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_fp8_e64 v[2:3], v128 op_sel:[1,0] ; encoding: [0x02,0x08,0xee,0xd5,0x80,0x01,0x00,0x00] + +0x02,0x00,0xee,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_pk_f32_fp8_e64 v[2:3], v128.l ; encoding: [0x02,0x00,0xee,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_pk_f32_fp8_e64 v[2:3], v128 ; encoding: [0x02,0x00,0xee,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xd2,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v255.l, -|0xfe0b| clamp ; encoding: 
[0xff,0x81,0xd2,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v255, -|0xfe0b| clamp ; encoding: [0xff,0x81,0xd2,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd2,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd2,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xd2,0xd5,0xf0,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xd2,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd2,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd2,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd2,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, null ; encoding: [0x05,0x00,0xd2,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, s105 
; encoding: [0x05,0x00,0xd2,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd2,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xd2,0xd5,0xfd,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xd2,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd2,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd2,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd2,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd2,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd2,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd2,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xd2,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd2,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x00,0xd2,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v1.l, v128.l ; encoding: [0x01,0x00,0xd2,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd2,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x08,0xd2,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_u16_f16_e64 v1.l, v128.h op_sel:[1,0] ; 
encoding: [0x01,0x08,0xd2,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u16_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd2,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0x87,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_u32_f32_e64 v255, -|0xaf123456| clamp ; encoding: [0xff,0x81,0x87,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x87,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0x87,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x87,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x87,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x87,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0x87,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, null ; encoding: [0x05,0x00,0x87,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0x87,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0x87,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0x87,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x87,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x87,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x87,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0x87,0xd5,0xff,0x01,0x00,0x00] + 
+0x05,0x00,0x87,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x87,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x87,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x87,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX1250: v_cvt_u32_f64_e64 v255, 0xaf123456 clamp ; encoding: [0xff,0x80,0x95,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x95,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0x95,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x01,0x95,0xd5,0xfd,0x00,0x00,0x20 +# GFX1250: v_cvt_u32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0x95,0xd5,0xfd,0x00,0x00,0x20] + +0x05,0x00,0x95,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0x95,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0x95,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f64_e64 v5, exec ; encoding: [0x05,0x00,0x95,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x95,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f64_e64 v5, null ; encoding: [0x05,0x00,0x95,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x95,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0x95,0xd5,0x68,0x00,0x00,0x00] + +0x05,0x00,0x95,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0x95,0xd5,0x02,0x00,0x00,0x00] + +0x05,0x00,0x95,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0x95,0xd5,0x7a,0x00,0x00,0x00] + +0x05,0x00,0x95,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_cvt_u32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0x95,0xd5,0xfe,0x01,0x00,0x00] + +0x05,0x00,0x95,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_cvt_u32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0x95,0xd5,0x02,0x01,0x00,0x00] + +0x05,0x00,0x95,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0x95,0xd5,0x6a,0x00,0x00,0x00] + 
+0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, -1 ; encoding: [0x05,0x00,0xeb,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xeb,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xeb,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, m0 ; encoding: [0x05,0x00,0xeb,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, null ; encoding: [0x05,0x00,0xeb,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, s1 ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, s105 ; encoding: [0x05,0x00,0xeb,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, src_scc ; encoding: [0x05,0x00,0xeb,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xeb,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_u32_u16_e64 v5, v1.l ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u32_u16_e64 v5, v1 ; encoding: [0x05,0x00,0xeb,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_u32_u16_e64 v5, v255.l ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: 
v_cvt_u32_u16_e64 v5, v255 ; encoding: [0x05,0x00,0xeb,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xeb,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xeb,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_u32_u16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xeb,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x00,0xeb,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_u32_u16_e64 v1, v128.l ; encoding: [0x01,0x00,0xeb,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u32_u16_e64 v1, v128 ; encoding: [0x01,0x00,0xeb,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x08,0xeb,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_u32_u16_e64 v1, v128.h op_sel:[1,0] ; encoding: [0x01,0x08,0xeb,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_u32_u16_e64 v1, v128 ; encoding: [0x01,0x00,0xeb,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd8,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd8,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_exp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd8,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, 
exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd8,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd8,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd8,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_exp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd8,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd8,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd8,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: 
v_exp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd8,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd8,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd8,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xd8,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd8,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd8,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xd8,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_exp_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd8,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_exp_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd8,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xa5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_exp_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xa5,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa5,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_exp_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa5,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xa5,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa5,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa5,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa5,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, 
null ; encoding: [0x05,0x00,0xa5,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa5,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa5,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_exp_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa5,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xa5,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa5,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa5,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa5,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa5,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xa5,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_exp_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa5,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdb,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdb,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_floor_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdb,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00 +# 
GFX1250-REAL16: v_floor_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdb,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, null ; encoding: [0x05,0x00,0xdb,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdb,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_floor_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdb,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdb,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, v1.l ; 
encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdb,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdb,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdb,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdb,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xdb,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xdb,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xdb,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xdb,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_floor_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xdb,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_floor_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xdb,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xa4,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_floor_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa4,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xa4,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa4,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xa4,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_floor_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa4,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xa4,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa4,0xd5,0x7f,0x00,0x00,0x00] + 
+0x05,0x00,0xa4,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa4,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xa4,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa4,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xa4,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, null ; encoding: [0x05,0x00,0xa4,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xa4,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa4,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xa4,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa4,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xa4,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_floor_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa4,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xa4,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa4,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xa4,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa4,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xa4,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa4,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xa4,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa4,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xa4,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_floor_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa4,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_floor_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x9a,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x06,0x00,0x9a,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_floor_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x9a,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x01,0x9a,0xd5,0xfd,0x00,0x00,0x30 +# GFX1250: v_floor_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x9a,0xd5,0xfd,0x00,0x00,0x30] + 
+0x06,0x00,0x9a,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_floor_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x9a,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0x9a,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_floor_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x9a,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0x9a,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_floor_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x9a,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0x9a,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_floor_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x9a,0xd5,0x68,0x00,0x00,0x00] + +0x06,0x00,0x9a,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_floor_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x9a,0xd5,0x02,0x00,0x00,0x00] + +0x06,0x00,0x9a,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_floor_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x9a,0xd5,0x7a,0x00,0x00,0x00] + +0x06,0x00,0x9a,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_floor_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x9a,0xd5,0xfe,0x01,0x00,0x00] + +0x06,0x00,0x9a,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_floor_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x9a,0xd5,0x02,0x01,0x00,0x00] + +0x06,0x00,0x9a,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_floor_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x9a,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdf,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdf,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] +# 
GFX1250-FAKE16: v_fract_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdf,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdf,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, null ; encoding: [0x05,0x00,0xdf,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdf,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_fract_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdf,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, ttmp15 
; encoding: [0x05,0x00,0xdf,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdf,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdf,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdf,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdf,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xdf,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xdf,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xdf,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xdf,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_fract_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xdf,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_fract_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xdf,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xa0,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_fract_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa0,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xa0,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa0,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_fract_f32_e64 v5, 0.5 mul:2 ; encoding: 
[0x05,0x00,0xa0,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xa0,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa0,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa0,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa0,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, null ; encoding: [0x05,0x00,0xa0,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa0,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_fract_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa0,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xa0,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa0,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa0,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa0,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa0,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xa0,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_fract_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa0,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0xbe,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_fract_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbe,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x06,0x00,0xbe,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_fract_f64_e64 v[6:7], -1 ; encoding: 
[0x06,0x00,0xbe,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x01,0xbe,0xd5,0xfd,0x00,0x00,0x30 +# GFX1250: v_fract_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xbe,0xd5,0xfd,0x00,0x00,0x30] + +0x06,0x00,0xbe,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_fract_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xbe,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0xbe,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_fract_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xbe,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0xbe,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_fract_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xbe,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0xbe,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_fract_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xbe,0xd5,0x68,0x00,0x00,0x00] + +0x06,0x00,0xbe,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_fract_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xbe,0xd5,0x02,0x00,0x00,0x00] + +0x06,0x00,0xbe,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_fract_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xbe,0xd5,0x7a,0x00,0x00,0x00] + +0x06,0x00,0xbe,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_fract_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xbe,0xd5,0xfe,0x01,0x00,0x00] + +0x06,0x00,0xbe,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_fract_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xbe,0xd5,0x02,0x01,0x00,0x00] + +0x06,0x00,0xbe,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_fract_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xbe,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x01,0xda,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v255.l, -|0xfe0b| ; encoding: [0xff,0x01,0xda,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v255, -|0xfe0b| ; encoding: [0xff,0x01,0xda,0xd5,0xff,0x00,0x00,0x20,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xda,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xda,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, -1 ; 
encoding: [0x05,0x00,0xda,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xda,0xd5,0xf0,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xda,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xda,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xda,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xda,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xda,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xda,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xda,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xda,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, null ; encoding: [0x05,0x00,0xda,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xda,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xda,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, src_scc ; encoding: 
[0x05,0x00,0xda,0xd5,0xfd,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, src_scc ; encoding: [0x05,0x00,0xda,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xda,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xda,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xda,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xda,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xda,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xda,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xda,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xda,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xda,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xda,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xda,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xda,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xda,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xda,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xda,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xda,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_frexp_exp_i16_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xda,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_frexp_exp_i16_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xda,0xd5,0x80,0x01,0x00,0x00] + 
+0xff,0x01,0xbf,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf +# GFX1250: v_frexp_exp_i32_f32_e64 v255, -|0xaf123456| ; encoding: [0xff,0x01,0xbf,0xd5,0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xbf,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xbf,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbf,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xbf,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xbf,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xbf,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, null ; encoding: [0x05,0x00,0xbf,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xbf,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xbf,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, src_scc ; encoding: [0x05,0x00,0xbf,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xbf,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xbf,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xbf,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, vcc_hi ; encoding: 
[0x05,0x00,0xbf,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xbf,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xbf,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX1250: v_frexp_exp_i32_f64_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xbc,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xbc,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, -1 ; encoding: [0x05,0x00,0xbc,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x01,0xbc,0xd5,0xfd,0x00,0x00,0x20 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, -|src_scc| ; encoding: [0x05,0x01,0xbc,0xd5,0xfd,0x00,0x00,0x20] + +0x05,0x00,0xbc,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, 0.5 ; encoding: [0x05,0x00,0xbc,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xbc,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, exec ; encoding: [0x05,0x00,0xbc,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xbc,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, null ; encoding: [0x05,0x00,0xbc,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xbc,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, s[104:105] ; encoding: [0x05,0x00,0xbc,0xd5,0x68,0x00,0x00,0x00] + +0x05,0x00,0xbc,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, s[2:3] ; encoding: [0x05,0x00,0xbc,0xd5,0x02,0x00,0x00,0x00] + +0x05,0x00,0xbc,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, ttmp[14:15] ; encoding: [0x05,0x00,0xbc,0xd5,0x7a,0x00,0x00,0x00] + +0x05,0x00,0xbc,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, v[254:255] ; encoding: [0x05,0x00,0xbc,0xd5,0xfe,0x01,0x00,0x00] + +0x05,0x00,0xbc,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, v[2:3] ; encoding: [0x05,0x00,0xbc,0xd5,0x02,0x01,0x00,0x00] + +0x05,0x00,0xbc,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_frexp_exp_i32_f64_e64 v5, vcc ; encoding: [0x05,0x00,0xbc,0xd5,0x6a,0x00,0x00,0x00] + 
+0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd9,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd9,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd9,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd9,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, null ; encoding: [0x05,0x00,0xd9,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, s1 ; encoding: 
[0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd9,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd9,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd9,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd9,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd9,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd9,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd9,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xd9,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: 
v_frexp_mant_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd9,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd9,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xd9,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_frexp_mant_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd9,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_frexp_mant_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd9,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xc0,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_frexp_mant_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xc0,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xc0,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xc0,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_frexp_mant_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xc0,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xc0,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xc0,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xc0,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xc0,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, null ; encoding: [0x05,0x00,0xc0,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xc0,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_frexp_mant_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xc0,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xc0,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: 
v_frexp_mant_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xc0,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xc0,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xc0,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xc0,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xc0,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xc0,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0xbd,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_frexp_mant_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xbd,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x06,0x00,0xbd,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xbd,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x01,0xbd,0xd5,0xfd,0x00,0x00,0x30 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xbd,0xd5,0xfd,0x00,0x00,0x30] + +0x06,0x00,0xbd,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xbd,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0xbd,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xbd,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0xbd,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xbd,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0xbd,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xbd,0xd5,0x68,0x00,0x00,0x00] + +0x06,0x00,0xbd,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xbd,0xd5,0x02,0x00,0x00,0x00] + +0x06,0x00,0xbd,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], ttmp[14:15] ; encoding: 
[0x06,0x00,0xbd,0xd5,0x7a,0x00,0x00,0x00] + +0x06,0x00,0xbd,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xbd,0xd5,0xfe,0x01,0x00,0x00] + +0x06,0x00,0xbd,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xbd,0xd5,0x02,0x01,0x00,0x00] + +0x06,0x00,0xbd,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_frexp_mant_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xbd,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd7,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd7,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_log_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_log_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd7,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xd7,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, null ; encoding: [0x05,0x00,0xd7,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd7,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_log_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_log_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd7,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd7,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd7,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd7,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd7,0xd5,0x6b,0x00,0x00,0x00] + 
+0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd7,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xd7,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd7,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd7,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xd7,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_log_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd7,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_log_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd7,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xa7,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_log_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa7,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xa7,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa7,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xa7,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_log_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa7,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xa7,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa7,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xa7,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa7,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xa7,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa7,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xa7,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, null ; encoding: [0x05,0x00,0xa7,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xa7,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa7,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xa7,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa7,0xd5,0x69,0x00,0x00,0x00] 
+ +0x05,0x00,0xa7,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_log_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa7,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xa7,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa7,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xa7,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa7,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xa7,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa7,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xa7,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa7,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xa7,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_log_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa7,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x00,0x81,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX1250: v_mov_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0x81,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +0x05,0x00,0x81,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, -1 ; encoding: [0x05,0x00,0x81,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0x81,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0x81,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0x81,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, m0 ; encoding: [0x05,0x00,0x81,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, null ; encoding: [0x05,0x00,0x81,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, s1 ; encoding: [0x05,0x00,0x81,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, s105 ; encoding: 
[0x05,0x00,0x81,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0x81,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0x81,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, v1 ; encoding: [0x05,0x00,0x81,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0x81,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, v255 ; encoding: [0x05,0x00,0x81,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0x81,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0x81,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0x81,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_mov_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0x81,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x00,0xc2,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX1250: v_movreld_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xc2,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xc2,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xc2,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xc2,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xc2,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xc2,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xc2,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, null ; encoding: [0x05,0x00,0xc2,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xc2,0xd5,0x01,0x00,0x00,0x00] + 
+0x05,0x00,0xc2,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xc2,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xc2,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xc2,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc2,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xc2,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xc2,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xc2,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_movreld_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xc2,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x00,0xc3,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_movrels_b32_e64 v255, v255 ; encoding: [0xff,0x00,0xc3,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xc3,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_movrels_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc3,0xd5,0x01,0x01,0x00,0x00] + +0xff,0x00,0xc8,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_movrelsd_2_b32_e64 v255, v255 ; encoding: [0xff,0x00,0xc8,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xc8,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_movrelsd_2_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc8,0xd5,0x01,0x01,0x00,0x00] + +0xff,0x00,0xc4,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_movrelsd_b32_e64 v255, v255 ; encoding: [0xff,0x00,0xc4,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xc4,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_movrelsd_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xc4,0xd5,0x01,0x01,0x00,0x00] + +0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00 +# GFX1250: v_nop ; encoding: [0x00,0x00,0x80,0xd5,0x00,0x00,0x00,0x00] + +0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v255.l, 
0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, -1 ; encoding: [0x05,0x00,0xe9,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, 0x3800 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x00,0x00,0x00,0x00,0x38,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, m0 ; encoding: [0x05,0x00,0xe9,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, null ; encoding: [0x05,0x00,0xe9,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, s1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 
v5.l, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, s105 ; encoding: [0x05,0x00,0xe9,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe9,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe9,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, v1 ; encoding: [0x05,0x00,0xe9,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, v255 ; encoding: [0x05,0x00,0xe9,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe9,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe9,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xe9,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xe9,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v1, v128 ; encoding: [0x01,0x00,0xe9,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xe9,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_not_b16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: 
[0x01,0x48,0xe9,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_not_b16_e64 v1, v128 ; encoding: [0x01,0x00,0xe9,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x00,0xb7,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX1250: v_not_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xb7,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xb7,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, -1 ; encoding: [0x05,0x00,0xb7,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, 0.5 ; encoding: [0x05,0x00,0xb7,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb7,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb7,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, m0 ; encoding: [0x05,0x00,0xb7,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, null ; encoding: [0x05,0x00,0xb7,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, s1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, s105 ; encoding: [0x05,0x00,0xb7,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, src_scc ; encoding: [0x05,0x00,0xb7,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb7,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, v1 ; encoding: [0x05,0x00,0xb7,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, v255 ; encoding: [0x05,0x00,0xb7,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, vcc_hi ; encoding: 
[0x05,0x00,0xb7,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xb7,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_not_b32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb7,0xd5,0x6a,0x00,0x00,0x00] + +0x00,0x00,0x9b,0xd5,0x00,0x00,0x00,0x00 +# GFX1250: v_pipeflush ; encoding: [0x00,0x00,0x9b,0xd5,0x00,0x00,0x00,0x00] + +0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd4,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd4,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd4,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd4,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: 
v_rcp_f16_e64 v5, null ; encoding: [0x05,0x00,0xd4,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd4,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd4,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd4,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd4,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd4,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd4,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd4,0xd5,0x6a,0x00,0x00,0x00] 
+ +0x01,0x40,0xd4,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd4,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd4,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xd4,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_rcp_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd4,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rcp_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd4,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xaa,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_rcp_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xaa,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xaa,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xaa,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_rcp_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xaa,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xaa,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xaa,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xaa,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xaa,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, null ; encoding: [0x05,0x00,0xaa,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xaa,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xaa,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_rcp_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xaa,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xaa,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xaa,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xaa,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xaa,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xaa,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xaa,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_rcp_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xaa,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0xaf,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_rcp_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xaf,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x06,0x00,0xaf,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_rcp_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xaf,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x01,0xaf,0xd5,0xfd,0x00,0x00,0x30 +# GFX1250: v_rcp_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xaf,0xd5,0xfd,0x00,0x00,0x30] + +0x06,0x00,0xaf,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_rcp_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xaf,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0xaf,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_rcp_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xaf,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0xaf,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_rcp_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xaf,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0xaf,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_rcp_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xaf,0xd5,0x68,0x00,0x00,0x00] + +0x06,0x00,0xaf,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_rcp_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xaf,0xd5,0x02,0x00,0x00,0x00] + +0x06,0x00,0xaf,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_rcp_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xaf,0xd5,0x7a,0x00,0x00,0x00] + +0x06,0x00,0xaf,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_rcp_f64_e64 v[6:7], v[254:255] ; encoding: 
[0x06,0x00,0xaf,0xd5,0xfe,0x01,0x00,0x00] + +0x06,0x00,0xaf,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_rcp_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xaf,0xd5,0x02,0x01,0x00,0x00] + +0x06,0x00,0xaf,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_rcp_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xaf,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_rcp_iflag_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xab,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xab,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xab,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xab,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_rcp_iflag_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xab,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xab,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xab,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xab,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xab,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xab,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xab,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xab,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, null ; encoding: [0x05,0x00,0xab,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xab,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xab,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xab,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xab,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xab,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_rcp_iflag_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xab,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xab,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xab,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xab,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, v1 ; 
encoding: [0x05,0x00,0xab,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xab,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xab,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xab,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xab,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xab,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_rcp_iflag_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xab,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xde,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xde,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xde,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xde,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xde,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, m0 ; encoding: 
[0x05,0x00,0xde,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, null ; encoding: [0x05,0x00,0xde,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xde,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xde,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xde,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xde,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xde,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xde,0xd5,0x6b,0x00,0x00,0x00] + 
+0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xde,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xde,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xde,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xde,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xde,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_rndne_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xde,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rndne_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xde,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xa3,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_rndne_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa3,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xa3,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa3,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_rndne_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa3,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xa3,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa3,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa3,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa3,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, null ; encoding: [0x05,0x00,0xa3,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, s105 ; encoding: 
[0x05,0x00,0xa3,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_rndne_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa3,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xa3,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa3,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa3,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa3,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa3,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xa3,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_rndne_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa3,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0x99,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_rndne_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x99,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x06,0x00,0x99,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_rndne_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x99,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x01,0x99,0xd5,0xfd,0x00,0x00,0x30 +# GFX1250: v_rndne_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x99,0xd5,0xfd,0x00,0x00,0x30] + +0x06,0x00,0x99,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_rndne_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x99,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0x99,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_rndne_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x99,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0x99,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_rndne_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x99,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0x99,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_rndne_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x99,0xd5,0x68,0x00,0x00,0x00] + +0x06,0x00,0x99,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_rndne_f64_e64 v[6:7], s[2:3] ; 
encoding: [0x06,0x00,0x99,0xd5,0x02,0x00,0x00,0x00] + +0x06,0x00,0x99,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_rndne_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x99,0xd5,0x7a,0x00,0x00,0x00] + +0x06,0x00,0x99,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_rndne_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x99,0xd5,0xfe,0x01,0x00,0x00] + +0x06,0x00,0x99,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_rndne_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x99,0xd5,0x02,0x01,0x00,0x00] + +0x06,0x00,0x99,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_rndne_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x99,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd6,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd6,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd6,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 
v5.l, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd6,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, null ; encoding: [0x05,0x00,0xd6,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd6,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd6,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xd6,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd6,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd6,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: 
v_rsq_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd6,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd6,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xd6,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd6,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd6,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xd6,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_rsq_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd6,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_rsq_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd6,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xae,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_rsq_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xae,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xae,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xae,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xae,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_rsq_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xae,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xae,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xae,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xae,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xae,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xae,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xae,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xae,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, null ; encoding: [0x05,0x00,0xae,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xae,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xae,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xae,0xd5,0x69,0x00,0x00,0x00 +# 
GFX1250: v_rsq_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xae,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xae,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_rsq_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xae,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xae,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xae,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xae,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xae,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xae,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xae,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xae,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xae,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xae,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_rsq_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xae,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_rsq_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb1,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x06,0x00,0xb1,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_rsq_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xb1,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x01,0xb1,0xd5,0xfd,0x00,0x00,0x30 +# GFX1250: v_rsq_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xb1,0xd5,0xfd,0x00,0x00,0x30] + +0x06,0x00,0xb1,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_rsq_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xb1,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0xb1,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_rsq_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xb1,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0xb1,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_rsq_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xb1,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0xb1,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_rsq_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xb1,0xd5,0x68,0x00,0x00,0x00] + +0x06,0x00,0xb1,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_rsq_f64_e64 
v[6:7], s[2:3] ; encoding: [0x06,0x00,0xb1,0xd5,0x02,0x00,0x00,0x00] + +0x06,0x00,0xb1,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_rsq_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xb1,0xd5,0x7a,0x00,0x00,0x00] + +0x06,0x00,0xb1,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_rsq_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xb1,0xd5,0xfe,0x01,0x00,0x00] + +0x06,0x00,0xb1,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_rsq_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xb1,0xd5,0x02,0x01,0x00,0x00] + +0x06,0x00,0xb1,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_rsq_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xb1,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v255.l, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v255, 0xfe0b ; encoding: [0xff,0x00,0xe2,0xd5,0xff,0x00,0x00,0x00,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, -1 ; encoding: [0x05,0x00,0xe2,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, 0.5 ; encoding: [0x05,0x00,0xe2,0xd5,0xf0,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00 +# 
GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, m0 ; encoding: [0x05,0x00,0xe2,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, null ; encoding: [0x05,0x00,0xe2,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, s1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, s105 ; encoding: [0x05,0x00,0xe2,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, src_scc ; encoding: [0x05,0x00,0xe2,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe2,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, v1 ; encoding: [0x05,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, v255 ; encoding: [0x05,0x00,0xe2,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00 +# 
GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe2,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe2,0xd5,0x6a,0x00,0x00,0x00] + +0x80,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v128.l, v1 ; encoding: [0x80,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v128, v1 ; encoding: [0x80,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] + +0x80,0x40,0xe2,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_sat_pk_u8_i16_e64 v128.h, v1 op_sel:[0,1] ; encoding: [0x80,0x40,0xe2,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sat_pk_u8_i16_e64 v128, v1 ; encoding: [0x80,0x00,0xe2,0xd5,0x01,0x01,0x00,0x00] + +0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xe0,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xe0,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_sin_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xe0,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, exec_hi ; encoding: 
[0x05,0x00,0xe0,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xe0,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, null ; encoding: [0x05,0x00,0xe0,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xe0,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_sin_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xe0,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xe0,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xe0,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00 
+# GFX1250-REAL16: v_sin_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xe0,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xe0,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xe0,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xe0,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xe0,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xe0,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xe0,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_sin_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xe0,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sin_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xe0,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xb5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_sin_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb5,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xb5,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb5,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_sin_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb5,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xb5,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb5,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb5,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, m0 ; 
encoding: [0x05,0x00,0xb5,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, null ; encoding: [0x05,0x00,0xb5,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb5,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_sin_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb5,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xb5,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb5,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb5,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb5,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb5,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xb5,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_sin_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb5,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xd5,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xd5,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: 
v_sqrt_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xd5,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xd5,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, null ; encoding: [0x05,0x00,0xd5,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xd5,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xd5,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, ttmp15 ; encoding: [0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, ttmp15 ; encoding: 
[0x05,0x00,0xd5,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xd5,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xd5,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xd5,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xd5,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xd5,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xd5,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd5,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xd5,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_sqrt_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xd5,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xd5,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xb3,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_sqrt_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xb3,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xb3,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xb3,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_sqrt_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xb3,0xd5,0xf0,0x00,0x00,0x08] + 
+0x05,0x00,0xb3,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xb3,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xb3,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xb3,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, null ; encoding: [0x05,0x00,0xb3,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xb3,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xb3,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_sqrt_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xb3,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xb3,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xb3,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xb3,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xb3,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xb3,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xb3,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_sqrt_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xb3,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_sqrt_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0xb4,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + +0x06,0x00,0xb4,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_sqrt_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0xb4,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x01,0xb4,0xd5,0xfd,0x00,0x00,0x30 +# GFX1250: 
v_sqrt_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0xb4,0xd5,0xfd,0x00,0x00,0x30] + +0x06,0x00,0xb4,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_sqrt_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0xb4,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0xb4,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_sqrt_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0xb4,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0xb4,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_sqrt_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0xb4,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0xb4,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_sqrt_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0xb4,0xd5,0x68,0x00,0x00,0x00] + +0x06,0x00,0xb4,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_sqrt_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0xb4,0xd5,0x02,0x00,0x00,0x00] + +0x06,0x00,0xb4,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_sqrt_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0xb4,0xd5,0x7a,0x00,0x00,0x00] + +0x06,0x00,0xb4,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_sqrt_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0xb4,0xd5,0xfe,0x01,0x00,0x00] + +0x06,0x00,0xb4,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_sqrt_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0xb4,0xd5,0x02,0x01,0x00,0x00] + +0x06,0x00,0xb4,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_sqrt_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0xb4,0xd5,0x6a,0x00,0x00,0x00] + +0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v255.l, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v255, -|0xfe0b| clamp div:2 ; encoding: [0xff,0x81,0xdd,0xd5,0xff,0x00,0x00,0x38,0x0b,0xfe,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, -1 ; encoding: [0x05,0x00,0xdd,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250-REAL16: 
v_trunc_f16_e64 v5.l, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xdd,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, exec_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, exec_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, m0 ; encoding: [0x05,0x00,0xdd,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, null ; encoding: [0x05,0x00,0xdd,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, s1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, s105 ; encoding: [0x05,0x00,0xdd,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xdd,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, ttmp15 ; 
encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xdd,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, v1.l ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, v1 ; encoding: [0x05,0x00,0xdd,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, v255.l ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, v255 ; encoding: [0x05,0x00,0xdd,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xdd,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v5.l, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xdd,0xd5,0x6a,0x00,0x00,0x00] + +0x01,0x40,0xdd,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v1.h, v128.l op_sel:[0,1] ; encoding: [0x01,0x40,0xdd,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xdd,0xd5,0x80,0x01,0x00,0x00] + +0x01,0x48,0xdd,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_trunc_f16_e64 v1.h, v128.h op_sel:[1,1] ; encoding: [0x01,0x48,0xdd,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_trunc_f16_e64 v1, v128 ; encoding: [0x01,0x00,0xdd,0xd5,0x80,0x01,0x00,0x00] + +0xff,0x81,0xa1,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf +# GFX1250: v_trunc_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0xa1,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] + +0x05,0x00,0xa1,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, -1 ; encoding: [0x05,0x00,0xa1,0xd5,0xc1,0x00,0x00,0x00] + 
+0x05,0x00,0xa1,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_trunc_f32_e64 v5, 0.5 mul:2 ; encoding: [0x05,0x00,0xa1,0xd5,0xf0,0x00,0x00,0x08] + +0x05,0x00,0xa1,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, exec_hi ; encoding: [0x05,0x00,0xa1,0xd5,0x7f,0x00,0x00,0x00] + +0x05,0x00,0xa1,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, exec_lo ; encoding: [0x05,0x00,0xa1,0xd5,0x7e,0x00,0x00,0x00] + +0x05,0x00,0xa1,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, m0 ; encoding: [0x05,0x00,0xa1,0xd5,0x7d,0x00,0x00,0x00] + +0x05,0x00,0xa1,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, null ; encoding: [0x05,0x00,0xa1,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x00,0xa1,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, s1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x00,0x00,0x00] + +0x05,0x00,0xa1,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, s105 ; encoding: [0x05,0x00,0xa1,0xd5,0x69,0x00,0x00,0x00] + +0x05,0x00,0xa1,0xd5,0xfd,0x00,0x00,0x10 +# GFX1250: v_trunc_f32_e64 v5, src_scc mul:4 ; encoding: [0x05,0x00,0xa1,0xd5,0xfd,0x00,0x00,0x10] + +0x05,0x00,0xa1,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, ttmp15 ; encoding: [0x05,0x00,0xa1,0xd5,0x7b,0x00,0x00,0x00] + +0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, v1 ; encoding: [0x05,0x00,0xa1,0xd5,0x01,0x01,0x00,0x00] + +0x05,0x00,0xa1,0xd5,0xff,0x01,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, v255 ; encoding: [0x05,0x00,0xa1,0xd5,0xff,0x01,0x00,0x00] + +0x05,0x00,0xa1,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, vcc_hi ; encoding: [0x05,0x00,0xa1,0xd5,0x6b,0x00,0x00,0x00] + +0x05,0x00,0xa1,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_trunc_f32_e64 v5, vcc_lo ; encoding: [0x05,0x00,0xa1,0xd5,0x6a,0x00,0x00,0x00] + +0xfe,0x80,0x97,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf +# GFX1250: v_trunc_f64_e64 v[254:255], 0xaf123456 clamp div:2 ; encoding: [0xfe,0x80,0x97,0xd5,0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf] + 
+0x06,0x00,0x97,0xd5,0xc1,0x00,0x00,0x00 +# GFX1250: v_trunc_f64_e64 v[6:7], -1 ; encoding: [0x06,0x00,0x97,0xd5,0xc1,0x00,0x00,0x00] + +0x06,0x01,0x97,0xd5,0xfd,0x00,0x00,0x30 +# GFX1250: v_trunc_f64_e64 v[6:7], -|src_scc| mul:4 ; encoding: [0x06,0x01,0x97,0xd5,0xfd,0x00,0x00,0x30] + +0x06,0x00,0x97,0xd5,0xf0,0x00,0x00,0x08 +# GFX1250: v_trunc_f64_e64 v[6:7], 0.5 mul:2 ; encoding: [0x06,0x00,0x97,0xd5,0xf0,0x00,0x00,0x08] + +0x06,0x00,0x97,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_trunc_f64_e64 v[6:7], exec ; encoding: [0x06,0x00,0x97,0xd5,0x7e,0x00,0x00,0x00] + +0x06,0x00,0x97,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_trunc_f64_e64 v[6:7], null ; encoding: [0x06,0x00,0x97,0xd5,0x7c,0x00,0x00,0x00] + +0x06,0x00,0x97,0xd5,0x68,0x00,0x00,0x00 +# GFX1250: v_trunc_f64_e64 v[6:7], s[104:105] ; encoding: [0x06,0x00,0x97,0xd5,0x68,0x00,0x00,0x00] + +0x06,0x00,0x97,0xd5,0x02,0x00,0x00,0x00 +# GFX1250: v_trunc_f64_e64 v[6:7], s[2:3] ; encoding: [0x06,0x00,0x97,0xd5,0x02,0x00,0x00,0x00] + +0x06,0x00,0x97,0xd5,0x7a,0x00,0x00,0x00 +# GFX1250: v_trunc_f64_e64 v[6:7], ttmp[14:15] ; encoding: [0x06,0x00,0x97,0xd5,0x7a,0x00,0x00,0x00] + +0x06,0x00,0x97,0xd5,0xfe,0x01,0x00,0x00 +# GFX1250: v_trunc_f64_e64 v[6:7], v[254:255] ; encoding: [0x06,0x00,0x97,0xd5,0xfe,0x01,0x00,0x00] + +0x06,0x00,0x97,0xd5,0x02,0x01,0x00,0x00 +# GFX1250: v_trunc_f64_e64 v[6:7], v[2:3] ; encoding: [0x06,0x00,0x97,0xd5,0x02,0x01,0x00,0x00] + +0x06,0x00,0x97,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_trunc_f64_e64 v[6:7], vcc ; encoding: [0x06,0x00,0x97,0xd5,0x6a,0x00,0x00,0x00] + 0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf # GFX1250: v_tanh_f32_e64 v255, -|0xaf123456| clamp div:2 ; encoding: [0xff,0x81,0x9e,0xd5,0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf] @@ -659,86 +4671,114 @@ # GFX1250-REAL16: v_cos_bf16_e64 v5.h, v128.h op_sel:[1,1] ; encoding: [0x05,0x48,0xff,0xd5,0x80,0x01,0x00,0x00] # GFX1250-FAKE16: v_cos_bf16_e64 v5, v128 ; encoding: [0x05,0x00,0xff,0xd5,0x80,0x01,0x00,0x00] 
-0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] +0x01,0x10,0xf8,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_bf8_e64 v1.l, v2 byte_sel:1 ; encoding: [0x01,0x10,0xf8,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v1, v2 byte_sel:1 ; encoding: [0x01,0x10,0xf8,0xd5,0x02,0x01,0x00,0x00] -0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] +0x01,0x08,0xf8,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_bf8_e64 v1.l, v2 byte_sel:2 ; encoding: [0x01,0x08,0xf8,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v1, v2 byte_sel:2 ; encoding: [0x01,0x08,0xf8,0xd5,0x02,0x01,0x00,0x00] -0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] +0x01,0x18,0xf8,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_bf8_e64 v1.l, v2 byte_sel:3 ; encoding: [0x01,0x18,0xf8,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v1, v2 byte_sel:3 ; encoding: [0x01,0x18,0xf8,0xd5,0x02,0x01,0x00,0x00] -0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] +0x96,0x00,0xf8,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_bf8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf8,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf8,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] -0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] +0x96,0x00,0xf8,0xd5,0x82,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_bf8_e64 v150.l, 2 ; encoding: [0x96,0x00,0xf8,0xd5,0x82,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v150, 2 ; encoding: 
[0x96,0x00,0xf8,0xd5,0x82,0x00,0x00,0x00] -0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] +0x96,0x00,0xf8,0xd5,0x02,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_bf8_e64 v150.l, s2 ; encoding: [0x96,0x00,0xf8,0xd5,0x02,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v150, s2 ; encoding: [0x96,0x00,0xf8,0xd5,0x02,0x00,0x00,0x00] -0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] +0x96,0x00,0xf8,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_bf8_e64 v150.l, v2 ; encoding: [0x96,0x00,0xf8,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v150, v2 ; encoding: [0x96,0x00,0xf8,0xd5,0x02,0x01,0x00,0x00] -0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] +0x80,0x40,0xf8,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_bf8_e64 v128.h, v2 op_sel:[0,1] ; encoding: [0x80,0x40,0xf8,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v128, v2 op_sel:[0,1] ; encoding: [0x80,0x40,0xf8,0xd5,0x02,0x01,0x00,0x00] -0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] +0x01,0x48,0xf8,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_bf8_e64 v1.h, v2 op_sel:[0,1] byte_sel:2 ; encoding: [0x01,0x48,0xf8,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v1, v2 op_sel:[0,1] byte_sel:2 ; encoding: [0x01,0x48,0xf8,0xd5,0x02,0x01,0x00,0x00] -0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] +0x01,0x10,0xf7,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_fp8_e64 v1.l, v2 byte_sel:1 ; encoding: [0x01,0x10,0xf7,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v1, v2 byte_sel:1 ; 
encoding: [0x01,0x10,0xf7,0xd5,0x02,0x01,0x00,0x00] -0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] +0x01,0x08,0xf7,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_fp8_e64 v1.l, v2 byte_sel:2 ; encoding: [0x01,0x08,0xf7,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v1, v2 byte_sel:2 ; encoding: [0x01,0x08,0xf7,0xd5,0x02,0x01,0x00,0x00] -0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_bf8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] +0x01,0x18,0xf7,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_fp8_e64 v1.l, v2 byte_sel:3 ; encoding: [0x01,0x18,0xf7,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v1, v2 byte_sel:3 ; encoding: [0x01,0x18,0xf7,0xd5,0x02,0x01,0x00,0x00] -0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] +0x96,0x00,0xf7,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_fp8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf7,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf7,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] -0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] +0x96,0x00,0xf7,0xd5,0x82,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_fp8_e64 v150.l, 2 ; encoding: [0x96,0x00,0xf7,0xd5,0x82,0x00,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v150, 2 ; encoding: [0x96,0x00,0xf7,0xd5,0x82,0x00,0x00,0x00] -0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] +0x96,0x00,0xf7,0xd5,0x02,0x00,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_fp8_e64 v150.l, s2 ; encoding: [0x96,0x00,0xf7,0xd5,0x02,0x00,0x00,0x00] +# GFX1250-FAKE16: 
v_cvt_f16_fp8_e64 v150, s2 ; encoding: [0x96,0x00,0xf7,0xd5,0x02,0x00,0x00,0x00] -0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, 3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] +0x96,0x00,0xf7,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_fp8_e64 v150.l, v2 ; encoding: [0x96,0x00,0xf7,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v150, v2 ; encoding: [0x96,0x00,0xf7,0xd5,0x02,0x01,0x00,0x00] -0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] +0x80,0x40,0xf7,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_fp8_e64 v128.h, v2 op_sel:[0,1] ; encoding: [0x80,0x40,0xf7,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v128, v2 op_sel:[0,1] ; encoding: [0x80,0x40,0xf7,0xd5,0x02,0x01,0x00,0x00] -0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] +0x01,0x48,0xf7,0xd5,0x02,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f16_fp8_e64 v1.h, v2 op_sel:[0,1] byte_sel:2 ; encoding: [0x01,0x48,0xf7,0xd5,0x02,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v1, v2 op_sel:[0,1] byte_sel:2 ; encoding: [0x01,0x48,0xf7,0xd5,0x02,0x01,0x00,0x00] -0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] +0x05,0x08,0xf2,0xd5,0x01,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, s1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x00,0x00,0x00] -0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, s3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] +0x05,0x08,0xf2,0xd5,0x69,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, s105 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x69,0x00,0x00,0x00] -0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, v3 ; encoding: 
[0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] +0x05,0x08,0xf2,0xd5,0x6a,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, vcc_lo op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x6a,0x00,0x00,0x00] -0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 ; encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] +0x05,0x08,0xf2,0xd5,0x6b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, vcc_hi op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x6b,0x00,0x00,0x00] -0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 ; encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] +0x05,0x08,0xf2,0xd5,0x7b,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, ttmp15 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7b,0x00,0x00,0x00] -0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:3 ; encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] +0x05,0x08,0xf2,0xd5,0x7d,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, m0 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7d,0x00,0x00,0x00] -0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, v3 clamp ; encoding: [0x01,0x80,0xec,0xd5,0x03,0x01,0x00,0x00] +0x05,0x08,0xf2,0xd5,0x7e,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, exec_lo op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7e,0x00,0x00,0x00] -0x01,0x90,0xec,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:1 clamp ; encoding: [0x01,0x90,0xec,0xd5,0x03,0x01,0x00,0x00] +0x05,0x08,0xf2,0xd5,0x7f,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, exec_hi op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7f,0x00,0x00,0x00] -0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00 -# GFX1250: v_cvt_f32_fp8_e64 v1, v3 byte_sel:2 clamp ; encoding: [0x01,0x88,0xec,0xd5,0x03,0x01,0x00,0x00] +0x05,0x08,0xf2,0xd5,0x7c,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, null op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7c,0x00,0x00,0x00] + +0x05,0x08,0xf2,0xd5,0xc1,0x00,0x00,0x00 +# 
GFX1250: v_cvt_f32_bf16_e64 v5, -1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xc1,0x00,0x00,0x00] + +0x05,0x08,0xf2,0xd5,0xfd,0x00,0x00,0x00 +# GFX1250: v_cvt_f32_bf16_e64 v5, src_scc op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xfd,0x00,0x00,0x00] + +0x05,0x08,0xf2,0xd5,0x80,0x01,0x00,0x00 +# GFX1250-REAL16: v_cvt_f32_bf16_e64 v5, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x80,0x01,0x00,0x00] +# GFX1250-FAKE16: v_cvt_f32_bf16_e64 v5, v128 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x80,0x01,0x00,0x00] 0x05,0x00,0xf2,0xd5,0xc1,0x00,0x00,0x00 # GFX1250: v_cvt_f32_bf16_e64 v5, -1 ; encoding: [0x05,0x00,0xf2,0xd5,0xc1,0x00,0x00,0x00] @@ -786,115 +4826,6 @@ # GFX1250-REAL16: v_cvt_f32_bf16_e64 v5, v255.h op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xff,0x01,0x00,0x00] # GFX1250-FAKE16: v_cvt_f32_bf16_e64 v5, v255 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xff,0x01,0x00,0x00] -0x05,0x08,0xf2,0xd5,0x01,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, s1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x01,0x00,0x00,0x00] - -0x05,0x08,0xf2,0xd5,0x69,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, s105 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x69,0x00,0x00,0x00] - -0x05,0x08,0xf2,0xd5,0x6a,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, vcc_lo op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x6a,0x00,0x00,0x00] - -0x05,0x08,0xf2,0xd5,0x6b,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, vcc_hi op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x6b,0x00,0x00,0x00] - -0x05,0x08,0xf2,0xd5,0x7b,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, ttmp15 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7b,0x00,0x00,0x00] - -0x05,0x08,0xf2,0xd5,0x7d,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, m0 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7d,0x00,0x00,0x00] - -0x05,0x08,0xf2,0xd5,0x7e,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, exec_lo op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7e,0x00,0x00,0x00] - 
-0x05,0x08,0xf2,0xd5,0x7f,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, exec_hi op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7f,0x00,0x00,0x00] - -0x05,0x08,0xf2,0xd5,0x7c,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, null op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x7c,0x00,0x00,0x00] - -0x05,0x08,0xf2,0xd5,0xc1,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, -1 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xc1,0x00,0x00,0x00] - -0x05,0x08,0xf2,0xd5,0xfd,0x00,0x00,0x00 -# GFX1250: v_cvt_f32_bf16_e64 v5, src_scc op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0xfd,0x00,0x00,0x00] - -0x05,0x08,0xf2,0xd5,0x80,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f32_bf16_e64 v5, v128.h op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x80,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f32_bf16_e64 v5, v128 op_sel:[1,0] ; encoding: [0x05,0x08,0xf2,0xd5,0x80,0x01,0x00,0x00] - -0x01,0x10,0xf8,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_bf8_e64 v1.l, v2 byte_sel:1 ; encoding: [0x01,0x10,0xf8,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v1, v2 byte_sel:1 ; encoding: [0x01,0x10,0xf8,0xd5,0x02,0x01,0x00,0x00] - -0x01,0x08,0xf8,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_bf8_e64 v1.l, v2 byte_sel:2 ; encoding: [0x01,0x08,0xf8,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v1, v2 byte_sel:2 ; encoding: [0x01,0x08,0xf8,0xd5,0x02,0x01,0x00,0x00] - -0x01,0x18,0xf8,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_bf8_e64 v1.l, v2 byte_sel:3 ; encoding: [0x01,0x18,0xf8,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v1, v2 byte_sel:3 ; encoding: [0x01,0x18,0xf8,0xd5,0x02,0x01,0x00,0x00] - -0x96,0x00,0xf8,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_bf8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf8,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf8,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] - 
-0x96,0x00,0xf8,0xd5,0x82,0x00,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_bf8_e64 v150.l, 2 ; encoding: [0x96,0x00,0xf8,0xd5,0x82,0x00,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v150, 2 ; encoding: [0x96,0x00,0xf8,0xd5,0x82,0x00,0x00,0x00] - -0x96,0x00,0xf8,0xd5,0x02,0x00,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_bf8_e64 v150.l, s2 ; encoding: [0x96,0x00,0xf8,0xd5,0x02,0x00,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v150, s2 ; encoding: [0x96,0x00,0xf8,0xd5,0x02,0x00,0x00,0x00] - -0x96,0x00,0xf8,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_bf8_e64 v150.l, v2 ; encoding: [0x96,0x00,0xf8,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v150, v2 ; encoding: [0x96,0x00,0xf8,0xd5,0x02,0x01,0x00,0x00] - -0x80,0x40,0xf8,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_bf8_e64 v128.h, v2 op_sel:[0,1] ; encoding: [0x80,0x40,0xf8,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v128, v2 op_sel:[0,1] ; encoding: [0x80,0x40,0xf8,0xd5,0x02,0x01,0x00,0x00] - -0x01,0x48,0xf8,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_bf8_e64 v1.h, v2 op_sel:[0,1] byte_sel:2 ; encoding: [0x01,0x48,0xf8,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_bf8_e64 v1, v2 op_sel:[0,1] byte_sel:2 ; encoding: [0x01,0x48,0xf8,0xd5,0x02,0x01,0x00,0x00] - -0x01,0x10,0xf7,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_fp8_e64 v1.l, v2 byte_sel:1 ; encoding: [0x01,0x10,0xf7,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v1, v2 byte_sel:1 ; encoding: [0x01,0x10,0xf7,0xd5,0x02,0x01,0x00,0x00] - -0x01,0x08,0xf7,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_fp8_e64 v1.l, v2 byte_sel:2 ; encoding: [0x01,0x08,0xf7,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v1, v2 byte_sel:2 ; encoding: [0x01,0x08,0xf7,0xd5,0x02,0x01,0x00,0x00] - -0x01,0x18,0xf7,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_fp8_e64 v1.l, v2 byte_sel:3 ; encoding: [0x01,0x18,0xf7,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: 
v_cvt_f16_fp8_e64 v1, v2 byte_sel:3 ; encoding: [0x01,0x18,0xf7,0xd5,0x02,0x01,0x00,0x00] - -0x96,0x00,0xf7,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_fp8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf7,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf7,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] - -0x96,0x00,0xf7,0xd5,0x82,0x00,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_fp8_e64 v150.l, 2 ; encoding: [0x96,0x00,0xf7,0xd5,0x82,0x00,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v150, 2 ; encoding: [0x96,0x00,0xf7,0xd5,0x82,0x00,0x00,0x00] - -0x96,0x00,0xf7,0xd5,0x02,0x00,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_fp8_e64 v150.l, s2 ; encoding: [0x96,0x00,0xf7,0xd5,0x02,0x00,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v150, s2 ; encoding: [0x96,0x00,0xf7,0xd5,0x02,0x00,0x00,0x00] - -0x96,0x00,0xf7,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_fp8_e64 v150.l, v2 ; encoding: [0x96,0x00,0xf7,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v150, v2 ; encoding: [0x96,0x00,0xf7,0xd5,0x02,0x01,0x00,0x00] - -0x80,0x40,0xf7,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_fp8_e64 v128.h, v2 op_sel:[0,1] ; encoding: [0x80,0x40,0xf7,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v128, v2 op_sel:[0,1] ; encoding: [0x80,0x40,0xf7,0xd5,0x02,0x01,0x00,0x00] - -0x01,0x48,0xf7,0xd5,0x02,0x01,0x00,0x00 -# GFX1250-REAL16: v_cvt_f16_fp8_e64 v1.h, v2 op_sel:[0,1] byte_sel:2 ; encoding: [0x01,0x48,0xf7,0xd5,0x02,0x01,0x00,0x00] -# GFX1250-FAKE16: v_cvt_f16_fp8_e64 v1, v2 op_sel:[0,1] byte_sel:2 ; encoding: [0x01,0x48,0xf7,0xd5,0x02,0x01,0x00,0x00] - 0x01,0x08,0xf6,0xd5,0x02,0x00,0x00,0x00 # GFX1250: v_cvt_pk_f16_bf8 v1, s2 op_sel:[1,0] ; encoding: [0x01,0x08,0xf6,0xd5,0x02,0x00,0x00,0x00] @@ -961,10 +4892,6 @@ # GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00] # GFX1250-FAKE16: 
v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00] -0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00 -# GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.l, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] -# GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, 0x1234 ; encoding: [0x96,0x00,0xf4,0xd5,0xff,0x00,0x00,0x00,0x34,0x12,0x00,0x00] - 0x96,0x40,0xf4,0xd5,0x02,0x01,0x00,0x00 # GFX1250-REAL16: v_sat_pk4_u4_u8_e64 v150.h, v2 op_sel:[0,1] ; encoding: [0x96,0x40,0xf4,0xd5,0x02,0x01,0x00,0x00] # GFX1250-FAKE16: v_sat_pk4_u4_u8_e64 v150, v2 ; encoding: [0x96,0x00,0xf4,0xd5,0x02,0x01,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt index 34d2104a660d8..7c29f8ab01a1b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp16.txt @@ -104,6 +104,66 @@ # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x9f,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# 
GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + 0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250: v_prng_b32_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] @@ -137,6 +197,126 @@ 0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff # GFX1250: v_prng_b32_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xcb,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + +0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_half_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] + +0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] + +0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] + 0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfb,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] @@ -437,186 +617,6 @@ # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xff,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xf9,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] - -0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -# GFX1250-FAKE16: 
v_rcp_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] - -0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf9,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] - -0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xfa,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] - -0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -# GFX1250-FAKE16: 
v_sqrt_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] - -0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xfa,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] - -0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xff,0x81,0xca,0xd5,0xfa,0x00,0x00,0x38,0xff,0x6f,0x05,0x30] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x08,0x01,0x5f,0x01,0x01] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x10,0x01,0x60,0x09,0x13] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x41,0x01,0xff] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x40,0x01,0xff] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x21,0x01,0xff] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x2f,0x01,0xff] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x50,0x01,0xff] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x01,0x01,0xff] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x0f,0x01,0xff] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x11,0x01,0xff] - -0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] -# GFX1250-FAKE16: 
v_tanh_bf16_e64_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1f,0x01,0xff] - -0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x48,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v128 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xca,0xd5,0xfa,0x00,0x00,0x00,0x80,0x1b,0x00,0xff] - 0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff # GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] # GFX1250-FAKE16: v_cvt_f32_bf16_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0xf2,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt index 867fee512b424..d26bc46a1f272 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3_from_vop1_dpp8.txt @@ -34,9 +34,69 @@ # GFX1250-REAL16: v_tanh_f16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_tanh_f16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x9f,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + 0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250: v_prng_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xcb,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| 
clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + +0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + +0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] + 
+0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] + +0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] + +0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] + +0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 +# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] +# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] + 0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 # GFX1250-REAL16: v_rsq_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] # GFX1250-FAKE16: v_rsq_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfb,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] @@ -137,66 +197,6 @@ # GFX1250-REAL16: v_cos_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x48,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] # GFX1250-FAKE16: v_cos_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xff,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xf9,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - -0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] - -0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xf9,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - -0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 -# GFX1250-REAL16: v_rcp_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -# GFX1250-FAKE16: v_rcp_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0xf9,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] - -0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xfa,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - -0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xfa,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - -0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 -# GFX1250-REAL16: v_sqrt_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -# GFX1250-FAKE16: v_sqrt_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xfa,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] - -0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00 -# GFX1250-REAL16: 
v_tanh_bf16_e64_dpp v255.l, -|v255.l| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v255, -|v255| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x81,0xca,0xd5,0xe9,0x00,0x00,0x38,0xff,0x00,0x00,0x00] - -0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] - -0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x08,0x01,0x77,0x39,0x05] - -0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.l, v1.l mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v1 mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x00,0xca,0xd5,0xea,0x00,0x00,0x10,0x01,0x77,0x39,0x05] - -0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05 -# GFX1250-REAL16: v_tanh_bf16_e64_dpp v5.h, v128.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x48,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] -# GFX1250-FAKE16: v_tanh_bf16_e64_dpp v5, v128 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xca,0xd5,0xe9,0x00,0x00,0x00,0x80,0x77,0x39,0x05] - 0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05 # GFX1250-REAL16: v_cvt_f32_bf16_e64_dpp v5, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] # GFX1250-FAKE16: 
v_cvt_f32_bf16_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0xf2,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] From d46de86ca4060fe2c631c08728af7c48ce9458fd Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Fri, 18 Jul 2025 17:04:34 -0400 Subject: [PATCH 404/813] [NFC][AMDGPU] Re-enable two tests previously disabled due to missing upstream features (#149568) This PR re-enables two tests that were previously disabled because they depended on features not yet upstreamed. --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll | 3 +-- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll | 3 +-- llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll index 3c49d0b9c01b1..199494d1c3473 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll @@ -1,10 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; xUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefix=SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefix=SDAG-TRUE16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefix=SDAG-FAKE16 %s ; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefix=GI-TRUE16 %s ; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefix=GI-FAKE16 %s -; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select. 
; FIXME: GlobalISel does not work with bf16 declare bfloat @llvm.amdgcn.rcp.bf16(bfloat) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll index 0a8a90422d1f2..42d12fd0fb3c9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll @@ -1,10 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; xUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=SDAG-REAL16 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=SDAG-FAKE16 %s ; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GISEL-REAL16 %s ; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GISEL-FAKE16 %s -; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select. 
; FIXME: GlobalISel does not work with bf16 declare bfloat @llvm.amdgcn.rsq.bf16(bfloat) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll index 47b2b68f05abc..dcf01f744945f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll @@ -2,8 +2,6 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s -; FIXME: t16 doesn't work at the moment because the store of s16 under t16 mode fails to select. - declare bfloat @llvm.sqrt.bf16(bfloat %a) declare <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a) From 3be44e25804e776d3ff071740a60ae6d2f3ef4a7 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 18 Jul 2025 22:05:54 +0100 Subject: [PATCH 405/813] [TableGen] Add some -time-phases support in CodeGenRegisters (#149309) --- llvm/utils/TableGen/Common/CodeGenRegisters.cpp | 11 ++++++++++- llvm/utils/TableGen/Common/CodeGenRegisters.h | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index 28b542f09e8c0..f78427940b276 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -30,6 +30,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" +#include "llvm/TableGen/TGTimer.h" #include #include #include @@ -1130,7 +1131,7 @@ CodeGenRegisterCategory::CodeGenRegisterCategory(CodeGenRegBank &RegBank, CodeGenRegBank::CodeGenRegBank(const RecordKeeper &Records, const CodeGenHwModes &Modes) - : CGH(Modes) { + : Records(Records), CGH(Modes) { // Configure 
register Sets to understand register classes and tuples. Sets.addFieldExpander("RegisterClass", "MemberList"); Sets.addFieldExpander("CalleeSavedRegs", "SaveList"); @@ -2202,7 +2203,9 @@ void CodeGenRegBank::computeDerivedInfo() { // Compute a weight for each register unit created during getSubRegs. // This may create adopted register units (with unit # >= NumNativeRegUnits). + Records.getTimer().startTimer("Compute reg unit weights"); computeRegUnitWeights(); + Records.getTimer().stopTimer(); // Compute a unique set of RegUnitSets. One for each RegClass and inferred // supersets for the union of overlapping sets. @@ -2446,6 +2449,8 @@ void CodeGenRegBank::computeInferredRegisterClasses() { // and assigned EnumValues yet. That means getSubClasses(), // getSuperClasses(), and hasSubClass() functions are defunct. + Records.getTimer().startTimer("Compute inferred register classes"); + // Use one-before-the-end so it doesn't move forward when new elements are // added. auto FirstNewRC = std::prev(RegClasses.end()); @@ -2481,6 +2486,8 @@ void CodeGenRegBank::computeInferredRegisterClasses() { } } + Records.getTimer().startTimer("Extend super-register classes"); + // Compute the transitive closure for super-register classes. // // By iterating over sub-register indices in topological order, we only ever @@ -2491,6 +2498,8 @@ void CodeGenRegBank::computeInferredRegisterClasses() { for (CodeGenRegisterClass &SubRC : RegClasses) SubRC.extendSuperRegClasses(SubIdx); } + + Records.getTimer().stopTimer(); } /// getRegisterClassForRegister - Find the register class that contains the diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.h b/llvm/utils/TableGen/Common/CodeGenRegisters.h index 5e6fff0f775ea..81aa663b8f11e 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.h +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.h @@ -607,6 +607,8 @@ typedef SmallVector TopoSigId; // CodeGenRegBank - Represent a target's registers and the relations between // them. 
class CodeGenRegBank { + const RecordKeeper &Records; + SetTheory Sets; const CodeGenHwModes &CGH; From d63ab5467dcae0492e2f4def336ddbb73ce10dc5 Mon Sep 17 00:00:00 2001 From: Princeton Ferro Date: Fri, 18 Jul 2025 14:11:31 -0700 Subject: [PATCH 406/813] [NVPTX] don't erase CopyToRegs when folding movs into loads (#149393) We may still need to keep CopyToReg even after folding uses into vector loads, since the original register may be used in other blocks. Partially reverts 1fdbe6984976d9e85ab3b1a93e8de434a85c5646 --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 12 +- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 591 +++++++++------- llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 643 +++++++++++------- llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 233 ++++--- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 40 +- llvm/test/CodeGen/NVPTX/pr126337.ll | 41 ++ .../CodeGen/NVPTX/reduction-intrinsics.ll | 226 +++--- 7 files changed, 1095 insertions(+), 691 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/pr126337.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 7aa06f9079b09..31b236a6126ad 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -5060,12 +5060,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return !U.getUser()->use_empty(); } - // Handle CopyToReg nodes that will become dead after our replacement - if (U.getUser()->getOpcode() == ISD::CopyToReg) { - DeadCopyToRegs.push_back(U.getUser()); - return true; - } - // Otherwise, this use prevents us from splitting a value. 
return false; })) @@ -5132,10 +5126,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs)) Results.push_back(NewLoad.getValue(NewNumOutputs + I)); - // Remove dead CopyToReg nodes by folding them into the chain they reference - for (SDNode *CTR : DeadCopyToRegs) - DCI.CombineTo(CTR, CTR->getOperand(0)); - return DCI.DAG.getMergeValues(Results, DL); } @@ -6544,4 +6534,4 @@ void NVPTXTargetLowering::computeKnownBitsForTargetNode( default: break; } -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index d0e2c1817f696..3baefde072be7 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -45,11 +45,12 @@ define <2 x half> @test_ret_const() #0 { define half @test_extract_0(<2 x half> %a) #0 { ; CHECK-LABEL: test_extract_0( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; %e = extractelement <2 x half> %a, i32 0 @@ -59,12 +60,13 @@ define half @test_extract_0(<2 x half> %a) #0 { define half @test_extract_1(<2 x half> %a) #0 { ; CHECK-LABEL: test_extract_1( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0]; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } +; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; 
; CHECK-NEXT: ret; %e = extractelement <2 x half> %a, i32 1 ret half %e @@ -80,8 +82,9 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; ; CHECK-NEXT: setp.eq.b64 %p1, %rd1, 0; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs3; ; CHECK-NEXT: ret; @@ -107,14 +110,16 @@ define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fadd_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fadd_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: add.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NOF16-NEXT: add.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -143,7 +148,8 @@ define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_0_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_imm_0_param_0]; +; 
CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -175,7 +181,8 @@ define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_1_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fadd_imm_1_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -207,14 +214,16 @@ define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsub_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsub_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fsub_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fsub_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: sub.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NOF16-NEXT: sub.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -242,7 +251,8 @@ define <2 x half> @test_fneg(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 
{%rs1, %rs2}, [test_fneg_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fneg_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: mov.b32 %r3, 0f00000000; ; CHECK-NOF16-NEXT: sub.rn.f32 %r4, %r3, %r2; @@ -275,14 +285,16 @@ define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<10>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmul_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmul_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmul_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmul_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: mul.rn.f32 %r5, %r4, %r3; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NOF16-NEXT: mul.rn.f32 %r8, %r7, %r6; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NOF16-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -299,14 +311,16 @@ define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; -; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NEXT: ld.param.b32 %r2, [test_fdiv_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fdiv_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; 
CHECK-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NEXT: div.rn.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NEXT: div.rn.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -331,10 +345,12 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_frem_param_0]; -; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_frem_param_1]; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NEXT: ld.param.b32 %r2, [test_frem_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_frem_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NEXT: div.rn.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; ; CHECK-NEXT: neg.f32 %r7, %r6; @@ -342,8 +358,8 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: testp.infinite.f32 %p1, %r3; ; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r9; -; CHECK-NEXT: cvt.f32.f16 %r10, %rs3; -; CHECK-NEXT: cvt.f32.f16 %r11, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r10, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r11, %rs3; ; CHECK-NEXT: div.rn.f32 %r12, %r11, %r10; ; CHECK-NEXT: cvt.rzi.f32.f32 %r13, %r12; ; CHECK-NEXT: neg.f32 %r14, %r13; @@ -535,11 +551,13 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; ; CHECK-F16-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; -; CHECK-F16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %r2, 
[test_select_cc_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; ; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r3, %r4; -; CHECK-F16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_1]; -; CHECK-F16-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; -; CHECK-F16-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; +; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-F16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-F16-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; CHECK-F16-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; ; CHECK-F16-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; CHECK-F16-NEXT: ret; ; @@ -550,18 +568,22 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, < ; CHECK-NOF16-NEXT: .reg .b32 %r<9>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_3]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_2]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs5; +; CHECK-NOF16-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r6, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs6; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs4; ; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r8, %r7; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1]; -; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs2, %rs8, %p2; -; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs1, 
%rs7, %p1; +; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; CHECK-NOF16-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; CHECK-NOF16-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; +; CHECK-NOF16-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs9}; ; CHECK-NOF16-NEXT: ret; %cc = fcmp une <2 x half> %c, %d @@ -579,11 +601,13 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-F16-NEXT: // %bb.0: ; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; -; CHECK-F16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0]; +; CHECK-F16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1]; +; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0]; ; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2; -; CHECK-F16-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f32_f16_param_1]; -; CHECK-F16-NEXT: selp.f32 %r7, %r4, %r6, %p2; -; CHECK-F16-NEXT: selp.f32 %r8, %r3, %r5, %p1; +; CHECK-F16-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-F16-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-F16-NEXT: selp.f32 %r7, %r6, %r4, %p2; +; CHECK-F16-NEXT: selp.f32 %r8, %r5, %r3, %p1; ; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-F16-NEXT: ret; ; @@ -595,18 +619,22 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-NOF16-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f32_f16_param_3]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f32_f16_param_2]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r6, %r5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs2; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs4; -; 
CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r8, %r7; -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r9, %r10}, [test_select_cc_f32_f16_param_1]; -; CHECK-NOF16-NEXT: selp.f32 %r11, %r4, %r10, %p2; -; CHECK-NOF16-NEXT: selp.f32 %r12, %r3, %r9, %p1; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; +; CHECK-NOF16-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1]; +; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs1; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs3; +; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs4; +; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r6, %r5; +; CHECK-NOF16-NEXT: mov.b64 {%r7, %r8}, %rd2; +; CHECK-NOF16-NEXT: mov.b64 {%r9, %r10}, %rd1; +; CHECK-NOF16-NEXT: selp.f32 %r11, %r10, %r8, %p2; +; CHECK-NOF16-NEXT: selp.f32 %r12, %r9, %r7, %p1; ; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r11}; ; CHECK-NOF16-NEXT: ret; <2 x half> %c, <2 x half> %d) #0 { @@ -624,14 +652,18 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f16_f32_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3]; -; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; -; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; -; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f16_f32_param_1]; -; CHECK-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; -; CHECK-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; +; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f16_f32_param_3]; +; CHECK-NEXT: ld.param.b64 %rd1, 
[test_select_cc_f16_f32_param_2]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0]; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NEXT: setp.neu.f32 %p1, %r5, %r3; +; CHECK-NEXT: setp.neu.f32 %p2, %r6, %r4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; CHECK-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; CHECK-NEXT: ret; <2 x float> %c, <2 x float> %d) #0 { @@ -664,13 +696,15 @@ define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_une_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_une_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_une_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_une_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.neu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -705,13 +739,15 @@ define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ueq_param_0]; -; 
CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ueq_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ueq_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ueq_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.equ.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.equ.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -746,13 +782,15 @@ define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ugt_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ugt_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ugt_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ugt_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.gtu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.gtu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -787,13 +825,15 @@ define <2 x i1> 
@test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uge_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uge_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_uge_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_uge_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.geu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.geu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -828,13 +868,15 @@ define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ult_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ult_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ult_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ult_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.ltu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: 
cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.ltu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -869,13 +911,15 @@ define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ule_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ule_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ule_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ule_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.leu.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.leu.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -911,13 +955,15 @@ define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uno_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uno_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_uno_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_uno_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, 
%rs4; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.nan.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -952,13 +998,15 @@ define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_one_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_one_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_one_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_one_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.ne.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.ne.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -993,13 +1041,15 @@ define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oeq_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oeq_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_oeq_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, 
[test_fcmp_oeq_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.eq.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1034,13 +1084,15 @@ define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ogt_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ogt_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ogt_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ogt_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.gt.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.gt.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1075,13 +1127,15 @@ define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oge_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, 
[test_fcmp_oge_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_oge_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_oge_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.ge.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.ge.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1116,13 +1170,15 @@ define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_olt_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_olt_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_olt_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_olt_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.lt.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.lt.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1157,13 +1213,15 @@ define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { ; 
CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ole_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ole_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ole_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ole_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.le.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.le.f32 %p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1198,13 +1256,15 @@ define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ord_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ord_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fcmp_ord_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fcmp_ord_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NOF16-NEXT: setp.num.f32 %p1, %r4, %r3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs3; ; CHECK-NOF16-NEXT: setp.num.f32 
%p2, %r6, %r5; ; CHECK-NOF16-NEXT: selp.b16 %rs5, -1, 0, %p2; ; CHECK-NOF16-NEXT: st.param.b8 [func_retval0], %rs5; @@ -1222,7 +1282,8 @@ define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fptosi_i32_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rzi.s32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.rzi.s32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1239,7 +1300,8 @@ define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_i64_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fptosi_i64_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rzi.s64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.rzi.s64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1255,7 +1317,8 @@ define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rzi.u32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.rzi.u32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1272,7 +1335,8 @@ define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rzi.u64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.rzi.u64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1369,16 +1433,17 
@@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1]; ; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs1, %r1; ; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs2, %r2; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_uitofp_2xi32_fadd_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; -; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r4, %r5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; +; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; -; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r7, %r8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r8, %r7; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r9; ; CHECK-NOF16-NEXT: mov.b32 %r10, {%rs6, %rs5}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r10; @@ -1411,16 +1476,17 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1]; ; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs1, %r1; ; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs2, %r2; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_sitofp_2xi32_fadd_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs2; -; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r4, %r5; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; 
+; CHECK-NOF16-NEXT: add.rn.f32 %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs5, %r6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; -; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r7, %r8; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; +; CHECK-NOF16-NEXT: add.rn.f32 %r9, %r8, %r7; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs6, %r9; ; CHECK-NOF16-NEXT: mov.b32 %r10, {%rs6, %rs5}; ; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %r10; @@ -1468,7 +1534,8 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fpext_2xfloat_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xfloat_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: cvt.f32.f16 %r3, %rs1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -1485,7 +1552,8 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f64.f16 %rd1, %rs2; ; CHECK-NEXT: cvt.f64.f16 %rd2, %rs1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -1578,7 +1646,8 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sqrt_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_sqrt_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1606,7 +1675,8 @@ define <2 x half> @test_sin(<2 x half> %a) #0 #1 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // 
%bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sin_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_sin_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: sin.approx.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1627,7 +1697,8 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_cos_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_cos_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NEXT: cos.approx.f32 %r3, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1703,17 +1774,20 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fma_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fma_param_2]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_fma_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fma_param_2]; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fma_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fma_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; +; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6; ; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 
%r9, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5; ; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11; ; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7}; @@ -1740,7 +1814,8 @@ define <2 x half> @test_fabs(<2 x half> %a) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fabs_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r2, %rs2; ; CHECK-NOF16-NEXT: abs.f32 %r3, %r2; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs3, %r3; @@ -1761,14 +1836,16 @@ define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_minnum_param_0]; -; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_minnum_param_1]; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; -; CHECK-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NEXT: ld.param.b32 %r2, [test_minnum_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_minnum_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NEXT: min.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NEXT: min.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -1785,14 +1862,16 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_maxnum_param_0]; -; CHECK-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_maxnum_param_1]; -; CHECK-NEXT: cvt.f32.f16 %r3, %rs4; -; 
CHECK-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NEXT: ld.param.b32 %r2, [test_maxnum_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_maxnum_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NEXT: cvt.f32.f16 %r3, %rs2; +; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; CHECK-NEXT: cvt.f32.f16 %r4, %rs4; ; CHECK-NEXT: max.f32 %r5, %r4, %r3; ; CHECK-NEXT: cvt.rn.f16.f32 %rs5, %r5; -; CHECK-NEXT: cvt.f32.f16 %r6, %rs3; -; CHECK-NEXT: cvt.f32.f16 %r7, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r6, %rs1; +; CHECK-NEXT: cvt.f32.f16 %r7, %rs3; ; CHECK-NEXT: max.f32 %r8, %r7, %r6; ; CHECK-NEXT: cvt.rn.f16.f32 %rs6, %r8; ; CHECK-NEXT: mov.b32 %r9, {%rs6, %rs5}; @@ -1822,13 +1901,15 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_copysign_param_1]; -; CHECK-NOF16-NEXT: and.b16 %rs5, %rs4, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs2, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs5; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs3, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs9, %rs1, 32767; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, -32768; +; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs5, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs1, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs4, 32767; ; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; ; CHECK-NOF16-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs7}; ; CHECK-NOF16-NEXT: ret; @@ -1844,8 +1925,9 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-F16-NEXT: .reg .b64 %rd<2>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // 
%bb.0: +; CHECK-F16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; -; CHECK-F16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; +; CHECK-F16-NEXT: mov.b64 {%r2, %r3}, %rd1; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %r3; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %r2; ; CHECK-F16-NEXT: mov.b32 %r4, {%rs2, %rs1}; @@ -1862,8 +1944,10 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_f32_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1]; +; CHECK-NOF16-NEXT: ld.param.b64 %rd1, [test_copysign_f32_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; +; CHECK-NOF16-NEXT: mov.b64 {%r2, %r3}, %rd1; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: and.b32 %r4, %r3, -2147483648; ; CHECK-NOF16-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r4; } @@ -1906,7 +1990,8 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: ; CHECK-NOF16-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_f64_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; ; CHECK-NOF16-NEXT: and.b64 %rd3, %rd2, -9223372036854775808; ; CHECK-NOF16-NEXT: shr.u64 %rd4, %rd3, 48; @@ -1948,13 +2033,15 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %r<5>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_extended_param_0]; -; 
CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_copysign_extended_param_1]; -; CHECK-NOF16-NEXT: and.b16 %rs5, %rs3, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs6, %rs1, 32767; -; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs5; -; CHECK-NOF16-NEXT: and.b16 %rs8, %rs4, -32768; -; CHECK-NOF16-NEXT: and.b16 %rs9, %rs2, 32767; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_copysign_extended_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_extended_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; CHECK-NOF16-NEXT: and.b16 %rs3, %rs1, -32768; +; CHECK-NOF16-NEXT: mov.b32 {%rs4, %rs5}, %r1; +; CHECK-NOF16-NEXT: and.b16 %rs6, %rs4, 32767; +; CHECK-NOF16-NEXT: or.b16 %rs7, %rs6, %rs3; +; CHECK-NOF16-NEXT: and.b16 %rs8, %rs2, -32768; +; CHECK-NOF16-NEXT: and.b16 %rs9, %rs5, 32767; ; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r3, %rs10; ; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs7; @@ -1972,7 +2059,8 @@ define <2 x half> @test_floor(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_floor_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_floor_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rmi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rmi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -1988,7 +2076,8 @@ define <2 x half> @test_ceil(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_ceil_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_ceil_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rpi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rpi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2004,7 +2093,8 @@ define <2 x half> @test_trunc(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.v2.b16 {%rs1, %rs2}, [test_trunc_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_trunc_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rzi.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2020,7 +2110,8 @@ define <2 x half> @test_rint(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_rint_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_rint_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2036,7 +2127,8 @@ define <2 x half> @test_nearbyint(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_nearbyint_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_nearbyint_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2052,7 +2144,8 @@ define <2 x half> @test_roundeven(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_roundeven_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_roundeven_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rni.f16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rni.f16.f16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2070,7 +2163,8 @@ define <2 x half> @test_round(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<21>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_round_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_round_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 
%r2, %rs2; ; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; ; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; @@ -2121,17 +2215,20 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 ; CHECK-NOF16-NEXT: .reg .b32 %r<13>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmuladd_param_0]; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmuladd_param_2]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs4; -; CHECK-NOF16-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_fmuladd_param_1]; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs6; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs2; +; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_fmuladd_param_2]; +; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_fmuladd_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_fmuladd_param_0]; +; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r4, %rs2; +; CHECK-NOF16-NEXT: mov.b32 {%rs3, %rs4}, %r2; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r5, %rs4; +; CHECK-NOF16-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r6, %rs6; ; CHECK-NOF16-NEXT: fma.rn.f32 %r7, %r6, %r5, %r4; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs7, %r7; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs3; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs5; -; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r8, %rs1; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r9, %rs3; +; CHECK-NOF16-NEXT: cvt.f32.f16 %r10, %rs5; ; CHECK-NOF16-NEXT: fma.rn.f32 %r11, %r10, %r9, %r8; ; CHECK-NOF16-NEXT: cvt.rn.f16.f32 %rs8, %r11; ; CHECK-NOF16-NEXT: mov.b32 %r12, {%rs8, %rs7}; @@ -2148,7 +2245,8 @@ define <2 x half> @test_shufflevector(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_shufflevector_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: st.param.v2.b16 
[func_retval0], {%rs2, %rs1}; ; CHECK-NEXT: ret; %s = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> @@ -2158,12 +2256,13 @@ define <2 x half> @test_shufflevector(<2 x half> %a) #0 { define <2 x half> @test_insertelement(<2 x half> %a, half %x) #0 { ; CHECK-LABEL: test_insertelement( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<4>; +; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; -; CHECK-NEXT: ld.param.v2.b16 {%rs2, %rs3}, [test_insertelement_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; CHECK-NEXT: ret; %i = insertelement <2 x half> %a, half %x, i64 1 @@ -2177,7 +2276,8 @@ define <2 x half> @test_sitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rn.f16.s16 %rs3, %rs2; ; CHECK-NEXT: cvt.rn.f16.s16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -2193,7 +2293,8 @@ define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_uitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rn.f16.u16 %rs3, %rs2; ; CHECK-NEXT: cvt.rn.f16.u16 %rs4, %rs1; ; CHECK-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index af3cb63082e78..da9e2d8cba139 100644 --- 
a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -28,29 +28,53 @@ define <2 x float> @test_ret_const() #0 { } define float @test_extract_0(<2 x float> %a) #0 { -; CHECK-LABEL: test_extract_0( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %r1; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_extract_0( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0]; +; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } +; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_extract_0( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<2>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_extract_0_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r1, _}, %rd1; +; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-F32X2-NEXT: ret; %e = extractelement <2 x float> %a, i32 0 ret float %e } define float @test_extract_1(<2 x float> %a) #0 { -; CHECK-LABEL: test_extract_1( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %r2; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_extract_1( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<2>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0]; +; CHECK-NOF32X2-NEXT: { .reg .b32 
tmp; mov.b64 {tmp, %r1}, %rd1; } +; CHECK-NOF32X2-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_extract_1( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<2>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // %bb.0: +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_extract_1_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {_, %r1}, %rd1; +; CHECK-F32X2-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-F32X2-NEXT: ret; %e = extractelement <2 x float> %a, i32 1 ret float %e } @@ -70,10 +94,12 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_param_1]; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fadd_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -98,7 +124,8 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -128,7 +155,8 @@ define <2 x float> @test_fadd_imm_1(<2 
x float> %a) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -158,13 +186,17 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_param_1]; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r9, %r4, %r8; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r10, %r3, %r7; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r2, %r6; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r1, %r5; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: mov.b64 {%r7, %r8}, %rd3; +; CHECK-NOF32X2-NEXT: mov.b64 {%r9, %r10}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r11, %r10, %r8; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r12, %r9, %r7; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_v4( @@ -189,12 +221,14 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; 
CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_param_0]; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f40400000; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_0_v4( @@ -225,12 +259,14 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_param_0]; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r5, %r4, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r6, %r3, 0f40400000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r2, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r1, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r3, %r2, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r4, %r1, 0f40400000; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; ; 
CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_1_v4( @@ -261,10 +297,12 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_param_1]; -; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fsub_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fsub_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: sub.rn.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: sub.rn.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -289,7 +327,8 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: neg.f32 %r3, %r2; ; CHECK-NEXT: neg.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -305,10 +344,12 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_param_1]; -; CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmul_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmul_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; 
CHECK-NOF32X2-NEXT: mul.rn.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: mul.rn.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -333,10 +374,12 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_1]; -; CHECK-NEXT: div.rn.f32 %r5, %r2, %r4; -; CHECK-NEXT: div.rn.f32 %r6, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2; +; CHECK-NEXT: div.rn.f32 %r6, %r3, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NEXT: ret; %r = fdiv <2 x float> %a, %b @@ -351,20 +394,22 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_param_1]; -; CHECK-NEXT: div.rn.f32 %r5, %r2, %r4; +; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: div.rn.f32 %r5, %r4, %r2; ; CHECK-NEXT: cvt.rzi.f32.f32 %r6, %r5; ; CHECK-NEXT: neg.f32 %r7, %r6; -; CHECK-NEXT: fma.rn.f32 %r8, %r7, %r4, %r2; -; CHECK-NEXT: testp.infinite.f32 %p1, %r4; -; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p1; -; CHECK-NEXT: div.rn.f32 %r10, %r1, %r3; +; CHECK-NEXT: fma.rn.f32 %r8, %r7, %r2, %r4; +; CHECK-NEXT: testp.infinite.f32 %p1, %r2; +; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; +; CHECK-NEXT: div.rn.f32 %r10, %r3, %r1; ; CHECK-NEXT: cvt.rzi.f32.f32 %r11, %r10; ; CHECK-NEXT: neg.f32 %r12, 
%r11; -; CHECK-NEXT: fma.rn.f32 %r13, %r12, %r3, %r1; -; CHECK-NEXT: testp.infinite.f32 %p2, %r3; -; CHECK-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-NEXT: fma.rn.f32 %r13, %r12, %r1, %r3; +; CHECK-NEXT: testp.infinite.f32 %p2, %r1; +; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; ; CHECK-NEXT: ret; %r = frem <2 x float> %a, %b @@ -378,10 +423,12 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_ftz_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fadd_ftz_param_1]; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fadd_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -406,7 +453,8 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -436,7 +484,8 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: 
// %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_ftz_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40000000; ; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f3F800000; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -466,13 +515,17 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<5>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_ftz_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_ftz_param_1]; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r9, %r4, %r8; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r10, %r3, %r7; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r2, %r6; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r1, %r5; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, %r1; +; CHECK-NOF32X2-NEXT: mov.b64 {%r7, %r8}, %rd3; +; CHECK-NOF32X2-NEXT: mov.b64 {%r9, %r10}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r11, %r10, %r8; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r12, %r9, %r7; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_v4_ftz( @@ -497,12 +550,14 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: 
ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_ftz_param_0]; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f40400000; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r5, 0f3F800000; +; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_0_v4_ftz( @@ -533,12 +588,14 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_ftz_param_0]; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r5, %r4, 0f40800000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r6, %r3, 0f40400000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r2, 0f40000000; -; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r1, 0f3F800000; -; CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; CHECK-NOF32X2-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r3, %r2, 0f40800000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r4, %r1, 0f40400000; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r7, %r6, 0f40000000; +; CHECK-NOF32X2-NEXT: add.rn.ftz.f32 %r8, %r5, 0f3F800000; +; 
CHECK-NOF32X2-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3}; ; CHECK-NOF32X2-NEXT: ret; ; ; CHECK-F32X2-LABEL: test_fadd_imm_1_v4_ftz( @@ -569,10 +626,12 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fsub_ftz_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fsub_ftz_param_1]; -; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fsub_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fsub_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: sub.rn.ftz.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -597,7 +656,8 @@ define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fneg_ftz_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: neg.ftz.f32 %r3, %r2; ; CHECK-NEXT: neg.ftz.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -613,10 +673,12 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<3>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmul_ftz_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmul_ftz_param_1]; -; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r2, %r4; -; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmul_ftz_param_1]; +; 
CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmul_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NOF32X2-NEXT: mul.rn.ftz.f32 %r6, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -641,11 +703,14 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_ftz_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_ftz_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_ftz_param_2]; -; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r2, %r4, %r6; -; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fma_ftz_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fma_ftz_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fma_ftz_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r7, %r6, %r4, %r2; +; CHECK-NOF32X2-NEXT: fma.rn.ftz.f32 %r8, %r5, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -671,10 +736,12 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fdiv_ftz_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_1]; -; CHECK-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; -; CHECK-NEXT: div.rn.ftz.f32 %r6, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fdiv_ftz_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fdiv_ftz_param_0]; +; CHECK-NEXT: mov.b64 {%r1, 
%r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2; +; CHECK-NEXT: div.rn.ftz.f32 %r6, %r3, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NEXT: ret; %r = fdiv <2 x float> %a, %b @@ -689,20 +756,22 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_1]; -; CHECK-NEXT: div.rn.ftz.f32 %r5, %r2, %r4; +; CHECK-NEXT: ld.param.b64 %rd2, [test_frem_ftz_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_frem_ftz_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: div.rn.ftz.f32 %r5, %r4, %r2; ; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r6, %r5; ; CHECK-NEXT: neg.ftz.f32 %r7, %r6; -; CHECK-NEXT: fma.rn.ftz.f32 %r8, %r7, %r4, %r2; -; CHECK-NEXT: testp.infinite.f32 %p1, %r4; -; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p1; -; CHECK-NEXT: div.rn.ftz.f32 %r10, %r1, %r3; +; CHECK-NEXT: fma.rn.ftz.f32 %r8, %r7, %r2, %r4; +; CHECK-NEXT: testp.infinite.f32 %p1, %r2; +; CHECK-NEXT: selp.f32 %r9, %r4, %r8, %p1; +; CHECK-NEXT: div.rn.ftz.f32 %r10, %r3, %r1; ; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %r11, %r10; ; CHECK-NEXT: neg.ftz.f32 %r12, %r11; -; CHECK-NEXT: fma.rn.ftz.f32 %r13, %r12, %r3, %r1; -; CHECK-NEXT: testp.infinite.f32 %p2, %r3; -; CHECK-NEXT: selp.f32 %r14, %r1, %r13, %p2; +; CHECK-NEXT: fma.rn.ftz.f32 %r13, %r12, %r1, %r3; +; CHECK-NEXT: testp.infinite.f32 %p2, %r1; +; CHECK-NEXT: selp.f32 %r14, %r3, %r13, %p2; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r9}; ; CHECK-NEXT: ret; %r = frem <2 x float> %a, %b @@ -877,14 +946,18 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> % ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, 
[test_select_cc_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_2]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_3]; -; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r5; -; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r6; -; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_1]; -; CHECK-NEXT: selp.f32 %r9, %r2, %r8, %p2; -; CHECK-NEXT: selp.f32 %r10, %r1, %r7, %p1; +; CHECK-NEXT: ld.param.b64 %rd4, [test_select_cc_param_3]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_select_cc_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1; +; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-NEXT: selp.f32 %r9, %r8, %r6, %p2; +; CHECK-NEXT: selp.f32 %r10, %r7, %r5, %p1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r10, %r9}; ; CHECK-NEXT: ret; %cc = fcmp une <2 x float> %c, %d @@ -902,10 +975,12 @@ define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2 ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1]; ; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_2]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_3]; -; CHECK-NEXT: setp.neu.f32 %p1, %r1, %r3; -; CHECK-NEXT: setp.neu.f32 %p2, %r2, %r4; +; CHECK-NEXT: ld.param.b64 %rd6, [test_select_cc_f64_f32_param_3]; +; CHECK-NEXT: ld.param.b64 %rd5, [test_select_cc_f64_f32_param_2]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd6; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd5; +; CHECK-NEXT: setp.neu.f32 %p1, %r3, %r1; +; CHECK-NEXT: setp.neu.f32 %p2, %r4, %r2; ; CHECK-NEXT: selp.f64 %rd7, %rd2, %rd4, %p2; ; CHECK-NEXT: selp.f64 
%rd8, %rd1, %rd3, %p1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd8, %rd7}; @@ -925,12 +1000,14 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3]; ; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2]; -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0]; ; CHECK-NEXT: setp.neu.f64 %p1, %rd3, %rd5; ; CHECK-NEXT: setp.neu.f64 %p2, %rd4, %rd6; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_1]; -; CHECK-NEXT: selp.f32 %r5, %r2, %r4, %p2; -; CHECK-NEXT: selp.f32 %r6, %r1, %r3, %p1; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: selp.f32 %r5, %r4, %r2, %p2; +; CHECK-NEXT: selp.f32 %r6, %r3, %r1, %p1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NEXT: ret; %cc = fcmp une <2 x double> %c, %d @@ -947,10 +1024,12 @@ define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_1]; -; CHECK-NEXT: setp.neu.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.neu.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_une_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_une_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.neu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.neu.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -969,10 +1048,12 @@ define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) 
#0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_1]; -; CHECK-NEXT: setp.equ.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.equ.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ueq_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ueq_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.equ.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.equ.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -991,10 +1072,12 @@ define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_1]; -; CHECK-NEXT: setp.gtu.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.gtu.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ugt_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ugt_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.gtu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.gtu.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1013,10 +1096,12 @@ define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_1]; -; CHECK-NEXT: setp.geu.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.geu.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uge_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, 
[test_fcmp_uge_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.geu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.geu.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1035,10 +1120,12 @@ define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_1]; -; CHECK-NEXT: setp.ltu.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.ltu.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ult_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ult_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.ltu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.ltu.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1057,10 +1144,12 @@ define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ule_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_1]; -; CHECK-NEXT: setp.leu.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.leu.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ule_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ule_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.leu.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.leu.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1079,10 +1168,12 @@ define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 
x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uno_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_1]; -; CHECK-NEXT: setp.nan.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.nan.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_uno_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_uno_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.nan.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.nan.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1101,10 +1192,12 @@ define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_1]; -; CHECK-NEXT: setp.ne.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.ne.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_one_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_one_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.ne.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.ne.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1123,10 +1216,12 @@ define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_1]; -; CHECK-NEXT: setp.eq.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.eq.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oeq_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, 
[test_fcmp_oeq_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.eq.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.eq.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1145,10 +1240,12 @@ define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ogt_param_1]; -; CHECK-NEXT: setp.gt.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.gt.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ogt_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ogt_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.gt.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.gt.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1167,10 +1264,12 @@ define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_1]; -; CHECK-NEXT: setp.ge.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.ge.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_oge_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_oge_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.ge.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.ge.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1189,10 +1288,12 @@ define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> 
%b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_olt_param_1]; -; CHECK-NEXT: setp.lt.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.lt.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_olt_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_olt_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.lt.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.lt.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1211,10 +1312,12 @@ define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_1]; -; CHECK-NEXT: setp.le.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.le.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ole_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fcmp_ole_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.le.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.le.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1233,10 +1336,12 @@ define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_1]; -; CHECK-NEXT: setp.num.f32 %p1, %r2, %r4; -; CHECK-NEXT: setp.num.f32 %p2, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_fcmp_ord_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, 
[test_fcmp_ord_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: setp.num.f32 %p1, %r4, %r2; +; CHECK-NEXT: setp.num.f32 %p2, %r3, %r1; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs1; ; CHECK-NEXT: selp.b16 %rs2, -1, 0, %p1; @@ -1253,7 +1358,8 @@ define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i32_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rzi.s32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rzi.s32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1269,7 +1375,8 @@ define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptosi_i64_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rzi.s64.f32 %rd2, %r2; ; CHECK-NEXT: cvt.rzi.s64.f32 %rd3, %r1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; @@ -1285,7 +1392,8 @@ define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi32_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rzi.u32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rzi.u32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1301,7 +1409,8 @@ define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptoui_2xi64_param_0]; +; 
CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rzi.u64.f32 %rd2, %r2; ; CHECK-NEXT: cvt.rzi.u64.f32 %rd3, %r1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; @@ -1380,9 +1489,10 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 { ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: ; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1]; ; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r3, %r1; ; CHECK-NOF32X2-NEXT: cvt.rn.f32.u32 %r4, %r2; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_uitofp_2xi32_fadd_param_1]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r7, %r6, %r4; ; CHECK-NOF32X2-NEXT: add.rn.f32 %r8, %r5, %r3; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; @@ -1431,7 +1541,8 @@ define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fpext_2xdouble_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.f64.f32 %rd2, %r2; ; CHECK-NEXT: cvt.f64.f32 %rd3, %r1; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2}; @@ -1499,7 +1610,8 @@ define <2 x float> @test_sqrt(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_sqrt_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: sqrt.rn.f32 %r3, %r2; ; CHECK-NEXT: sqrt.rn.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1522,7 +1634,8 @@ define <2 x float> @test_sin(<2 x float> %a) #0 #1 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0]; 
+; CHECK-NEXT: ld.param.b64 %rd1, [test_sin_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: sin.approx.f32 %r3, %r2; ; CHECK-NEXT: sin.approx.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1538,7 +1651,8 @@ define <2 x float> @test_cos(<2 x float> %a) #0 #1 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_cos_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cos.approx.f32 %r3, %r2; ; CHECK-NEXT: cos.approx.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1597,11 +1711,14 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fma_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fma_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fma_param_2]; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fma_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fma_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fma_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, %rd3; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r5, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -1627,7 +1744,8 @@ define <2 x float> @test_fabs(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fabs_param_0]; +; 
CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: abs.f32 %r3, %r2; ; CHECK-NEXT: abs.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1643,10 +1761,12 @@ define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_1]; -; CHECK-NEXT: min.f32 %r5, %r2, %r4; -; CHECK-NEXT: min.f32 %r6, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_minnum_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_minnum_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-NEXT: min.f32 %r6, %r3, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NEXT: ret; %r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b) @@ -1660,10 +1780,12 @@ define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_1]; -; CHECK-NEXT: max.f32 %r5, %r2, %r4; -; CHECK-NEXT: max.f32 %r6, %r1, %r3; +; CHECK-NEXT: ld.param.b64 %rd2, [test_maxnum_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_maxnum_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd2; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd1; +; CHECK-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-NEXT: max.f32 %r6, %r3, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; ; CHECK-NEXT: ret; %r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b) @@ -1677,8 +1799,10 @@ define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_0]; -; CHECK-NEXT: ld.param.v2.b32 
{%r3, %r4}, [test_copysign_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_copysign_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; ; CHECK-NEXT: copysign.f32 %r5, %r4, %r2; ; CHECK-NEXT: copysign.f32 %r6, %r3, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r6, %r5}; @@ -1696,18 +1820,19 @@ define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1]; -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0]; -; CHECK-NEXT: abs.f32 %r3, %r2; -; CHECK-NEXT: neg.f32 %r4, %r3; +; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_f64_param_0]; ; CHECK-NEXT: shr.u64 %rd4, %rd3, 63; ; CHECK-NEXT: and.b64 %rd5, %rd4, 1; ; CHECK-NEXT: setp.ne.b64 %p1, %rd5, 0; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: abs.f32 %r3, %r2; +; CHECK-NEXT: neg.f32 %r4, %r3; ; CHECK-NEXT: selp.f32 %r5, %r4, %r3, %p1; -; CHECK-NEXT: abs.f32 %r6, %r1; -; CHECK-NEXT: neg.f32 %r7, %r6; ; CHECK-NEXT: shr.u64 %rd6, %rd2, 63; ; CHECK-NEXT: and.b64 %rd7, %rd6, 1; ; CHECK-NEXT: setp.ne.b64 %p2, %rd7, 0; +; CHECK-NEXT: abs.f32 %r6, %r1; +; CHECK-NEXT: neg.f32 %r7, %r6; ; CHECK-NEXT: selp.f32 %r8, %r7, %r6, %p2; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r5}; ; CHECK-NEXT: ret; @@ -1723,8 +1848,10 @@ define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_copysign_extended_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_copysign_extended_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; ; CHECK-NEXT: copysign.f32 
%r5, %r3, %r1; ; CHECK-NEXT: copysign.f32 %r6, %r4, %r2; ; CHECK-NEXT: cvt.f64.f32 %rd3, %r6; @@ -1743,7 +1870,8 @@ define <2 x float> @test_floor(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_floor_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rmi.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rmi.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1759,7 +1887,8 @@ define <2 x float> @test_ceil(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ceil_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rpi.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rpi.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1775,7 +1904,8 @@ define <2 x float> @test_trunc(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rzi.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rzi.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1791,7 +1921,8 @@ define <2 x float> @test_rint(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_rint_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1807,7 +1938,8 @@ define <2 x float> @test_nearbyint(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; 
CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_nearbyint_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1823,7 +1955,8 @@ define <2 x float> @test_roundeven(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_roundeven_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rni.f32.f32 %r3, %r2; ; CHECK-NEXT: cvt.rni.f32.f32 %r4, %r1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r4, %r3}; @@ -1841,7 +1974,8 @@ define <2 x float> @test_round(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_round_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_round_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: and.b32 %r3, %r2, -2147483648; ; CHECK-NEXT: or.b32 %r4, %r3, 1056964608; ; CHECK-NEXT: add.rn.f32 %r5, %r2, %r4; @@ -1875,11 +2009,14 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c) ; CHECK-NOF32X2-NEXT: .reg .b64 %rd<4>; ; CHECK-NOF32X2-EMPTY: ; CHECK-NOF32X2-NEXT: // %bb.0: -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fmuladd_param_0]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_fmuladd_param_1]; -; CHECK-NOF32X2-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_fmuladd_param_2]; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r2, %r4, %r6; -; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r1, %r3, %r5; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd3, [test_fmuladd_param_2]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd2, [test_fmuladd_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_fmuladd_param_0]; +; CHECK-NOF32X2-NEXT: mov.b64 {%r1, %r2}, 
%rd3; +; CHECK-NOF32X2-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NOF32X2-NEXT: mov.b64 {%r5, %r6}, %rd1; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r7, %r6, %r4, %r2; +; CHECK-NOF32X2-NEXT: fma.rn.f32 %r8, %r5, %r3, %r1; ; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; ; CHECK-NOF32X2-NEXT: ret; ; @@ -1905,7 +2042,8 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_shufflevector_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; ; CHECK-NEXT: ret; %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> @@ -1913,16 +2051,29 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 { } define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 { -; CHECK-LABEL: test_insertelement( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<4>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; -; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [test_insertelement_param_0]; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; -; CHECK-NEXT: ret; +; CHECK-NOF32X2-LABEL: test_insertelement( +; CHECK-NOF32X2: { +; CHECK-NOF32X2-NEXT: .reg .b32 %r<3>; +; CHECK-NOF32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-NOF32X2-EMPTY: +; CHECK-NOF32X2-NEXT: // %bb.0: +; CHECK-NOF32X2-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; +; CHECK-NOF32X2-NEXT: ld.param.b64 %rd1, [test_insertelement_param_0]; +; CHECK-NOF32X2-NEXT: { .reg .b32 tmp; mov.b64 {%r2, tmp}, %rd1; } +; CHECK-NOF32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-NOF32X2-NEXT: ret; +; +; CHECK-F32X2-LABEL: test_insertelement( +; CHECK-F32X2: { +; CHECK-F32X2-NEXT: .reg .b32 %r<3>; +; CHECK-F32X2-NEXT: .reg .b64 %rd<2>; +; CHECK-F32X2-EMPTY: +; CHECK-F32X2-NEXT: // 
%bb.0: +; CHECK-F32X2-NEXT: ld.param.b32 %r1, [test_insertelement_param_1]; +; CHECK-F32X2-NEXT: ld.param.b64 %rd1, [test_insertelement_param_0]; +; CHECK-F32X2-NEXT: mov.b64 {%r2, _}, %rd1; +; CHECK-F32X2-NEXT: st.param.v2.b32 [func_retval0], {%r2, %r1}; +; CHECK-F32X2-NEXT: ret; %i = insertelement <2 x float> %a, float %x, i64 1 ret <2 x float> %i } diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 1a61498b10142..2b7a06c33d948 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -32,31 +32,57 @@ define <2 x i16> @test_ret_const() #0 { } define i16 @test_extract_0(<2 x i16> %a) #0 { -; COMMON-LABEL: test_extract_0( -; COMMON: { -; COMMON-NEXT: .reg .b16 %rs<3>; -; COMMON-NEXT: .reg .b32 %r<3>; -; COMMON-EMPTY: -; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0]; -; COMMON-NEXT: cvt.u32.u16 %r2, %rs1; -; COMMON-NEXT: st.param.b32 [func_retval0], %r2; -; COMMON-NEXT: ret; +; I16x2-LABEL: test_extract_0( +; I16x2: { +; I16x2-NEXT: .reg .b16 %rs<2>; +; I16x2-NEXT: .reg .b32 %r<3>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; +; I16x2-NEXT: mov.b32 {%rs1, _}, %r1; +; I16x2-NEXT: cvt.u32.u16 %r2, %rs1; +; I16x2-NEXT: st.param.b32 [func_retval0], %r2; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_extract_0( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<2>; +; NO-I16x2-NEXT: .reg .b32 %r<3>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; +; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } +; NO-I16x2-NEXT: cvt.u32.u16 %r2, %rs1; +; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r2; +; NO-I16x2-NEXT: ret; %e = extractelement <2 x i16> %a, i32 0 ret i16 %e } define i16 @test_extract_1(<2 x i16> %a) #0 { -; COMMON-LABEL: test_extract_1( -; COMMON: { -; COMMON-NEXT: .reg .b16 %rs<3>; 
-; COMMON-NEXT: .reg .b32 %r<3>; -; COMMON-EMPTY: -; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0]; -; COMMON-NEXT: cvt.u32.u16 %r2, %rs2; -; COMMON-NEXT: st.param.b32 [func_retval0], %r2; -; COMMON-NEXT: ret; +; I16x2-LABEL: test_extract_1( +; I16x2: { +; I16x2-NEXT: .reg .b16 %rs<2>; +; I16x2-NEXT: .reg .b32 %r<3>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; +; I16x2-NEXT: mov.b32 {_, %rs1}, %r1; +; I16x2-NEXT: cvt.u32.u16 %r2, %rs1; +; I16x2-NEXT: st.param.b32 [func_retval0], %r2; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_extract_1( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<2>; +; NO-I16x2-NEXT: .reg .b32 %r<3>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; +; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } +; NO-I16x2-NEXT: cvt.u32.u16 %r2, %rs1; +; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r2; +; NO-I16x2-NEXT: ret; %e = extractelement <2 x i16> %a, i32 1 ret i16 %e } @@ -71,8 +97,9 @@ define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 { ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; ; COMMON-NEXT: setp.eq.b64 %p1, %rd1, 0; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; COMMON-NEXT: cvt.u32.u16 %r2, %rs3; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; @@ -99,10 +126,12 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_param_0]; -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_add_param_1]; -; NO-I16x2-NEXT: add.s16 %rs5, %rs2, %rs4; -; NO-I16x2-NEXT: add.s16 %rs6, %rs1, %rs3; +; 
NO-I16x2-NEXT: ld.param.b32 %r2, [test_add_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: add.s16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: add.s16 %rs6, %rs3, %rs1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %r = add <2 x i16> %a, %b @@ -128,7 +157,8 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<2>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_imm_0_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; ; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -155,7 +185,8 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<2>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_add_imm_1_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; ; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -171,10 +202,12 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_sub_param_0]; -; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_sub_param_1]; -; COMMON-NEXT: sub.s16 %rs5, %rs2, %rs4; -; COMMON-NEXT: sub.s16 %rs6, %rs1, %rs3; +; COMMON-NEXT: ld.param.b32 %r2, [test_sub_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_sub_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; COMMON-NEXT: sub.s16 %rs5, %rs4, %rs2; +; COMMON-NEXT: sub.s16 %rs6, %rs3, %rs1; ; 
COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; %r = sub <2 x i16> %a, %b @@ -199,10 +232,12 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_smax_param_0]; -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_smax_param_1]; -; NO-I16x2-NEXT: max.s16 %rs5, %rs2, %rs4; -; NO-I16x2-NEXT: max.s16 %rs6, %rs1, %rs3; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smax_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smax_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: max.s16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: max.s16 %rs6, %rs3, %rs1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp sgt <2 x i16> %a, %b @@ -228,10 +263,12 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_umax_param_0]; -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_umax_param_1]; -; NO-I16x2-NEXT: max.u16 %rs5, %rs2, %rs4; -; NO-I16x2-NEXT: max.u16 %rs6, %rs1, %rs3; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umax_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umax_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: max.u16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: max.u16 %rs6, %rs3, %rs1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp ugt <2 x i16> %a, %b @@ -257,10 +294,12 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_smin_param_0]; -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, 
[test_smin_param_1]; -; NO-I16x2-NEXT: min.s16 %rs5, %rs2, %rs4; -; NO-I16x2-NEXT: min.s16 %rs6, %rs1, %rs3; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smin_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smin_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: min.s16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: min.s16 %rs6, %rs3, %rs1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp sle <2 x i16> %a, %b @@ -286,10 +325,12 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_umin_param_0]; -; NO-I16x2-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_umin_param_1]; -; NO-I16x2-NEXT: min.u16 %rs5, %rs2, %rs4; -; NO-I16x2-NEXT: min.u16 %rs6, %rs1, %rs3; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umin_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umin_param_0]; +; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; NO-I16x2-NEXT: min.u16 %rs5, %rs4, %rs2; +; NO-I16x2-NEXT: min.u16 %rs6, %rs3, %rs1; ; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; NO-I16x2-NEXT: ret; %cmp = icmp ule <2 x i16> %a, %b @@ -304,10 +345,12 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_mul_param_0]; -; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_mul_param_1]; -; COMMON-NEXT: mul.lo.s16 %rs5, %rs2, %rs4; -; COMMON-NEXT: mul.lo.s16 %rs6, %rs1, %rs3; +; COMMON-NEXT: ld.param.b32 %r2, [test_mul_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_mul_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; COMMON-NEXT: mul.lo.s16 %rs5, %rs4, %rs2; +; COMMON-NEXT: mul.lo.s16 %rs6, %rs3, %rs1; ; COMMON-NEXT: 
st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; %r = mul <2 x i16> %a, %b @@ -686,14 +729,18 @@ define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x ; COMMON-NEXT: .reg .b32 %r<5>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0]; -; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_2]; -; COMMON-NEXT: ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_3]; -; COMMON-NEXT: setp.ne.b16 %p1, %rs3, %rs5; -; COMMON-NEXT: setp.ne.b16 %p2, %rs4, %rs6; -; COMMON-NEXT: ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1]; -; COMMON-NEXT: selp.b16 %rs9, %rs2, %rs8, %p2; -; COMMON-NEXT: selp.b16 %rs10, %rs1, %rs7, %p1; +; COMMON-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; COMMON-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r3; +; COMMON-NEXT: setp.ne.b16 %p1, %rs3, %rs1; +; COMMON-NEXT: setp.ne.b16 %p2, %rs4, %rs2; +; COMMON-NEXT: mov.b32 {%rs5, %rs6}, %r2; +; COMMON-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; COMMON-NEXT: selp.b16 %rs9, %rs8, %rs6, %p2; +; COMMON-NEXT: selp.b16 %rs10, %rs7, %rs5, %p1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs10, %rs9}; ; COMMON-NEXT: ret; %cc = icmp ne <2 x i16> %c, %d @@ -711,10 +758,12 @@ define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i32_i16_param_1]; ; COMMON-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_i32_i16_param_0]; -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_i32_i16_param_2]; -; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_i32_i16_param_3]; -; COMMON-NEXT: setp.ne.b16 %p1, %rs1, %rs3; -; COMMON-NEXT: setp.ne.b16 %p2, %rs2, %rs4; +; COMMON-NEXT: 
ld.param.b32 %r6, [test_select_cc_i32_i16_param_3]; +; COMMON-NEXT: ld.param.b32 %r5, [test_select_cc_i32_i16_param_2]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r6; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r5; +; COMMON-NEXT: setp.ne.b16 %p1, %rs3, %rs1; +; COMMON-NEXT: setp.ne.b16 %p2, %rs4, %rs2; ; COMMON-NEXT: selp.b32 %r7, %r2, %r4, %p2; ; COMMON-NEXT: selp.b32 %r8, %r1, %r3, %p1; ; COMMON-NEXT: st.param.v2.b32 [func_retval0], {%r8, %r7}; @@ -735,12 +784,14 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, ; COMMON-NEXT: // %bb.0: ; COMMON-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_i16_i32_param_3]; ; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i16_i32_param_2]; -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_i16_i32_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_i16_i32_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_i16_i32_param_0]; ; COMMON-NEXT: setp.ne.b32 %p1, %r3, %r5; ; COMMON-NEXT: setp.ne.b32 %p2, %r4, %r6; -; COMMON-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_i16_i32_param_1]; -; COMMON-NEXT: selp.b16 %rs5, %rs2, %rs4, %p2; -; COMMON-NEXT: selp.b16 %rs6, %rs1, %rs3, %p1; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; +; COMMON-NEXT: selp.b16 %rs5, %rs4, %rs2, %p2; +; COMMON-NEXT: selp.b16 %rs6, %rs3, %rs1, %p1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs6, %rs5}; ; COMMON-NEXT: ret; <2 x i32> %c, <2 x i32> %d) #0 { @@ -851,7 +902,8 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_zext_2xi32_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: cvt.u32.u16 %r2, %rs2; ; COMMON-NEXT: cvt.u32.u16 %r3, %rs1; ; COMMON-NEXT: st.param.v2.b32 [func_retval0], {%r3, %r2}; @@ -868,7 +920,8 @@ define <2 x i64> @test_zext_2xi64(<2 
x i16> %a) #0 { ; COMMON-NEXT: .reg .b64 %rd<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_zext_2xi64_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: cvt.u64.u16 %rd1, %rs2; ; COMMON-NEXT: cvt.u64.u16 %rd2, %rs1; ; COMMON-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd1}; @@ -926,7 +979,8 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_shufflevector_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; ; COMMON-NEXT: ret; %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> @@ -934,16 +988,29 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { } define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 { -; COMMON-LABEL: test_insertelement( -; COMMON: { -; COMMON-NEXT: .reg .b16 %rs<4>; -; COMMON-NEXT: .reg .b32 %r<2>; -; COMMON-EMPTY: -; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; -; COMMON-NEXT: ld.param.v2.b16 {%rs2, %rs3}, [test_insertelement_param_0]; -; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; -; COMMON-NEXT: ret; +; I16x2-LABEL: test_insertelement( +; I16x2: { +; I16x2-NEXT: .reg .b16 %rs<3>; +; I16x2-NEXT: .reg .b32 %r<2>; +; I16x2-EMPTY: +; I16x2-NEXT: // %bb.0: +; I16x2-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; +; I16x2-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; I16x2-NEXT: mov.b32 {%rs2, _}, %r1; +; I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; I16x2-NEXT: ret; +; +; NO-I16x2-LABEL: test_insertelement( +; NO-I16x2: { +; NO-I16x2-NEXT: .reg .b16 %rs<3>; +; NO-I16x2-NEXT: .reg .b32 %r<2>; +; NO-I16x2-EMPTY: +; NO-I16x2-NEXT: // %bb.0: +; 
NO-I16x2-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; +; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } +; NO-I16x2-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; NO-I16x2-NEXT: ret; %i = insertelement <2 x i16> %a, i16 %x, i64 1 ret <2 x i16> %i } @@ -955,7 +1022,8 @@ define <2 x i16> @test_fptosi_2xhalf_to_2xi16(<2 x half> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_2xhalf_to_2xi16_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_fptosi_2xhalf_to_2xi16_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; COMMON-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; @@ -971,7 +1039,8 @@ define <2 x i16> @test_fptoui_2xhalf_to_2xi16(<2 x half> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xhalf_to_2xi16_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_fptoui_2xhalf_to_2xi16_param_0]; +; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; COMMON-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; ; COMMON-NEXT: st.param.v2.b16 [func_retval0], {%rs4, %rs3}; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index aba20e6b0f27f..9891e33151f8a 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -1935,16 +1935,18 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; O0-NEXT: .reg .b32 %r<12>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: -; O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; O0-NEXT: cvt.rzi.s16.f16 %rs5, %rs4; -; O0-NEXT: cvt.rzi.s16.f16 %rs6, %rs3; -; O0-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; O0-NEXT: mov.b32 {%rs7, 
%rs8}, %r3; -; O0-NEXT: cvt.u32.u16 %r4, %rs8; -; O0-NEXT: cvt.u32.u16 %r5, %rs7; +; O0-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0]; +; O0-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; O0-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; +; O0-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; +; O0-NEXT: mov.b32 %r3, {%rs4, %rs3}; +; O0-NEXT: mov.b32 {%rs5, %rs6}, %r3; +; O0-NEXT: cvt.u32.u16 %r4, %rs6; +; O0-NEXT: cvt.u32.u16 %r5, %rs5; ; O0-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; O0-NEXT: cvt.rzi.s16.f16 %rs9, %rs2; -; O0-NEXT: cvt.rzi.s16.f16 %rs10, %rs1; +; O0-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; O0-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; +; O0-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; ; O0-NEXT: mov.b32 %r7, {%rs10, %rs9}; ; O0-NEXT: mov.b32 {%rs11, %rs12}, %r7; ; O0-NEXT: cvt.u32.u16 %r8, %rs12; @@ -1989,16 +1991,18 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; O0-NEXT: .reg .b32 %r<12>; ; O0-EMPTY: ; O0-NEXT: // %bb.0: -; O0-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; O0-NEXT: cvt.rzi.u16.f16 %rs5, %rs4; -; O0-NEXT: cvt.rzi.u16.f16 %rs6, %rs3; -; O0-NEXT: mov.b32 %r3, {%rs6, %rs5}; -; O0-NEXT: mov.b32 {%rs7, %rs8}, %r3; -; O0-NEXT: cvt.u32.u16 %r4, %rs8; -; O0-NEXT: cvt.u32.u16 %r5, %rs7; +; O0-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0]; +; O0-NEXT: mov.b32 {%rs1, %rs2}, %r2; +; O0-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; +; O0-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; +; O0-NEXT: mov.b32 %r3, {%rs4, %rs3}; +; O0-NEXT: mov.b32 {%rs5, %rs6}, %r3; +; O0-NEXT: cvt.u32.u16 %r4, %rs6; +; O0-NEXT: cvt.u32.u16 %r5, %rs5; ; O0-NEXT: prmt.b32 %r6, %r5, %r4, 0x3340U; -; O0-NEXT: cvt.rzi.u16.f16 %rs9, %rs2; -; O0-NEXT: cvt.rzi.u16.f16 %rs10, %rs1; +; O0-NEXT: mov.b32 {%rs7, %rs8}, %r1; +; O0-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; +; O0-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; ; O0-NEXT: mov.b32 %r7, {%rs10, %rs9}; ; O0-NEXT: mov.b32 {%rs11, %rs12}, %r7; ; O0-NEXT: cvt.u32.u16 %r8, %rs12; diff --git 
a/llvm/test/CodeGen/NVPTX/pr126337.ll b/llvm/test/CodeGen/NVPTX/pr126337.ll new file mode 100644 index 0000000000000..32e411584b0e5 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/pr126337.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 | %ptxas-verify %} + +; This IR should compile without triggering assertions in LICM +; when the CopyToReg from %0 in the first BB gets eliminated +; but we still use its result in the second BB. +; Technically the problem happens in MIR, but there are multiple +; passes involved, so testing with the IR reproducer is more convenient. +; https://github.com/llvm/llvm-project/pull/126337#issuecomment-3081431594 + +target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define ptx_kernel void @Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel(<2 x float> %0) { +; CHECK-LABEL: Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: // %.preheader15 +; CHECK-NEXT: ld.param.b64 %rd1, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0]; +; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } +; CHECK-NEXT: setp.eq.f32 %p1, %r1, 0f00000000; +; CHECK-NEXT: selp.b16 %rs1, 1, 0, %p1; +; CHECK-NEXT: $L__BB0_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: mov.b64 %rd2, 0; +; CHECK-NEXT: st.b8 [%rd2], %rs1; +; CHECK-NEXT: bra.uni $L__BB0_1; +.preheader15: + br label %1 + +1: ; preds = %1, %.preheader15 + %2 = fcmp oeq <2 x float> %0, zeroinitializer + %3 = extractelement <2 x i1> %2, i64 0 + store i1 %3, ptr null, align 4 + br label %1 +} + diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll 
b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index 87f965c84b6b6..92cb51b17f0c8 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -117,16 +117,20 @@ define float @reduce_fadd_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0]; -; CHECK-NEXT: add.rn.f32 %r9, %r5, 0f00000000; -; CHECK-NEXT: add.rn.f32 %r10, %r9, %r6; -; CHECK-NEXT: add.rn.f32 %r11, %r10, %r7; -; CHECK-NEXT: add.rn.f32 %r12, %r11, %r8; -; CHECK-NEXT: add.rn.f32 %r13, %r12, %r1; -; CHECK-NEXT: add.rn.f32 %r14, %r13, %r2; -; CHECK-NEXT: add.rn.f32 %r15, %r14, %r3; -; CHECK-NEXT: add.rn.f32 %r16, %r15, %r4; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0]; +; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; +; CHECK-NEXT: add.rn.f32 %r9, %r7, 0f00000000; +; CHECK-NEXT: add.rn.f32 %r10, %r9, %r8; +; CHECK-NEXT: add.rn.f32 %r11, %r10, %r5; +; CHECK-NEXT: add.rn.f32 %r12, %r11, %r6; +; CHECK-NEXT: add.rn.f32 %r13, %r12, %r3; +; CHECK-NEXT: add.rn.f32 %r14, %r13, %r4; +; CHECK-NEXT: add.rn.f32 %r15, %r14, %r1; +; CHECK-NEXT: add.rn.f32 %r16, %r15, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r16; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) @@ -140,14 +144,18 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) { ; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, 
[reduce_fadd_float_reassoc_param_0]; -; CHECK-SM80-NEXT: add.rn.f32 %r9, %r7, %r3; -; CHECK-SM80-NEXT: add.rn.f32 %r10, %r5, %r1; -; CHECK-SM80-NEXT: add.rn.f32 %r11, %r8, %r4; -; CHECK-SM80-NEXT: add.rn.f32 %r12, %r6, %r2; +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM80-NEXT: add.rn.f32 %r5, %r3, %r1; +; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM80-NEXT: add.rn.f32 %r10, %r8, %r6; +; CHECK-SM80-NEXT: add.rn.f32 %r11, %r4, %r2; +; CHECK-SM80-NEXT: add.rn.f32 %r12, %r9, %r7; ; CHECK-SM80-NEXT: add.rn.f32 %r13, %r12, %r11; -; CHECK-SM80-NEXT: add.rn.f32 %r14, %r10, %r9; +; CHECK-SM80-NEXT: add.rn.f32 %r14, %r10, %r5; ; CHECK-SM80-NEXT: add.rn.f32 %r15, %r14, %r13; ; CHECK-SM80-NEXT: add.rn.f32 %r16, %r15, 0f00000000; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r16; @@ -321,15 +329,19 @@ define float @reduce_fmul_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0]; -; CHECK-NEXT: mul.rn.f32 %r9, %r5, %r6; -; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r7; -; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r8; -; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r1; -; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r2; -; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r3; -; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r4; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd3; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0]; +; CHECK-NEXT: mov.b64 {%r5, %r6}, %rd2; +; CHECK-NEXT: mov.b64 {%r7, %r8}, %rd1; +; 
CHECK-NEXT: mul.rn.f32 %r9, %r7, %r8; +; CHECK-NEXT: mul.rn.f32 %r10, %r9, %r5; +; CHECK-NEXT: mul.rn.f32 %r11, %r10, %r6; +; CHECK-NEXT: mul.rn.f32 %r12, %r11, %r3; +; CHECK-NEXT: mul.rn.f32 %r13, %r12, %r4; +; CHECK-NEXT: mul.rn.f32 %r14, %r13, %r1; +; CHECK-NEXT: mul.rn.f32 %r15, %r14, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) @@ -343,14 +355,18 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) { ; CHECK-SM80-NEXT: .reg .b64 %rd<5>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0+16]; -; CHECK-SM80-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0]; -; CHECK-SM80-NEXT: mul.rn.f32 %r9, %r7, %r3; -; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r5, %r1; -; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r8, %r4; -; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r6, %r2; +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-SM80-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-SM80-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-SM80-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-SM80-NEXT: mul.rn.f32 %r5, %r3, %r1; +; CHECK-SM80-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-SM80-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-SM80-NEXT: mul.rn.f32 %r10, %r8, %r6; +; CHECK-SM80-NEXT: mul.rn.f32 %r11, %r4, %r2; +; CHECK-SM80-NEXT: mul.rn.f32 %r12, %r9, %r7; ; CHECK-SM80-NEXT: mul.rn.f32 %r13, %r12, %r11; -; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r10, %r9; +; CHECK-SM80-NEXT: mul.rn.f32 %r14, %r10, %r5; ; CHECK-SM80-NEXT: mul.rn.f32 %r15, %r14, %r13; ; CHECK-SM80-NEXT: st.param.b32 [func_retval0], %r15; ; CHECK-SM80-NEXT: ret; @@ -494,13 +510,17 @@ define float @reduce_fmax_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, 
[reduce_fmax_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0]; -; CHECK-NEXT: max.f32 %r9, %r8, %r4; -; CHECK-NEXT: max.f32 %r10, %r6, %r2; -; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r7, %r3; -; CHECK-NEXT: max.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -517,13 +537,17 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0]; -; CHECK-NEXT: max.f32 %r9, %r8, %r4; -; CHECK-NEXT: max.f32 %r10, %r6, %r2; -; CHECK-NEXT: max.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.f32 %r12, %r7, %r3; -; CHECK-NEXT: max.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.f32 %r13, %r8, 
%r6; ; CHECK-NEXT: max.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -628,13 +652,17 @@ define float @reduce_fmin_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0]; -; CHECK-NEXT: min.f32 %r9, %r8, %r4; -; CHECK-NEXT: min.f32 %r10, %r6, %r2; -; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r7, %r3; -; CHECK-NEXT: min.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -651,13 +679,17 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0]; -; CHECK-NEXT: min.f32 %r9, %r8, %r4; -; CHECK-NEXT: min.f32 %r10, %r6, %r2; -; CHECK-NEXT: min.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.f32 %r12, %r7, %r3; -; CHECK-NEXT: min.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 
{%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -762,13 +794,17 @@ define float @reduce_fmaximum_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r8, %r4; -; CHECK-NEXT: max.NaN.f32 %r10, %r6, %r2; -; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r7, %r3; -; CHECK-NEXT: max.NaN.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -785,13 +821,17 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, 
[reduce_fmaximum_float_reassoc_param_0]; -; CHECK-NEXT: max.NaN.f32 %r9, %r8, %r4; -; CHECK-NEXT: max.NaN.f32 %r10, %r6, %r2; -; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: max.NaN.f32 %r12, %r7, %r3; -; CHECK-NEXT: max.NaN.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: max.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: max.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: max.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: max.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: max.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: max.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: max.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -896,13 +936,17 @@ define float @reduce_fminimum_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r8, %r4; -; CHECK-NEXT: min.NaN.f32 %r10, %r6, %r2; -; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r7, %r3; -; CHECK-NEXT: min.NaN.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: 
min.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; @@ -919,13 +963,17 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0]; -; CHECK-NEXT: min.NaN.f32 %r9, %r8, %r4; -; CHECK-NEXT: min.NaN.f32 %r10, %r6, %r2; -; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r9; -; CHECK-NEXT: min.NaN.f32 %r12, %r7, %r3; -; CHECK-NEXT: min.NaN.f32 %r13, %r5, %r1; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_reassoc_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd4; +; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; +; CHECK-NEXT: min.NaN.f32 %r5, %r4, %r2; +; CHECK-NEXT: mov.b64 {%r6, %r7}, %rd3; +; CHECK-NEXT: mov.b64 {%r8, %r9}, %rd1; +; CHECK-NEXT: min.NaN.f32 %r10, %r9, %r7; +; CHECK-NEXT: min.NaN.f32 %r11, %r10, %r5; +; CHECK-NEXT: min.NaN.f32 %r12, %r3, %r1; +; CHECK-NEXT: min.NaN.f32 %r13, %r8, %r6; ; CHECK-NEXT: min.NaN.f32 %r14, %r13, %r12; ; CHECK-NEXT: min.NaN.f32 %r15, %r14, %r11; ; CHECK-NEXT: st.param.b32 [func_retval0], %r15; From b02787d33f24d83f1d5814c578b7b0fce7156382 Mon Sep 17 00:00:00 2001 From: Andres-Salamanca Date: Fri, 18 Jul 2025 16:13:34 -0500 Subject: [PATCH 407/813] [CIR] Fix alignment when lowering set/get bitfield operations (#148999) This PR fixes incorrect alignment when lowering `set` and `getBitField` operations to LLVM IR. The issue occurred because during lowering, the function was being called with an alignment of 0, which caused it to default to the alignment of the packed member. For example, if the bitfield was packed inside a `u64i`, it would use an alignment of 8. 
With this change, the generated code now matches what the classic codegen produces. In the assembly format, I changed to be similar to how it's done in loadOp. If there's a better approach, please feel free to point it out. --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 27 +++++++---- clang/lib/CIR/CodeGen/CIRGenBuilder.h | 22 +++++---- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 13 +++--- .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 7 +-- clang/test/CIR/CodeGen/bitfields.c | 46 +++++++++---------- clang/test/CIR/CodeGen/bitfields.cpp | 22 ++++----- clang/test/CIR/CodeGen/bitfields_be.c | 26 +++++------ 7 files changed, 88 insertions(+), 75 deletions(-) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index d19cd83d78b40..01c5055484185 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -1739,7 +1739,8 @@ def CIR_SetBitfieldOp : CIR_Op<"set_bitfield"> { %2 = cir.load %0 : !cir.ptr>, !cir.ptr %3 = cir.get_member %2[1] {name = "e"} : !cir.ptr -> !cir.ptr - %4 = cir.set_bitfield(#bfi_e, %3 : !cir.ptr, %1 : !s32i) -> !s32i + %4 = cir.set_bitfield align(4) (#bfi_e, %3 : !cir.ptr, %1 : !s32i) + -> !s32i ``` }]; @@ -1747,12 +1748,15 @@ def CIR_SetBitfieldOp : CIR_Op<"set_bitfield"> { Arg:$addr, CIR_AnyType:$src, BitfieldInfoAttr:$bitfield_info, + DefaultValuedOptionalAttr:$alignment, UnitAttr:$is_volatile ); let results = (outs CIR_IntType:$result); - let assemblyFormat = [{ `(`$bitfield_info`,` $addr`:`qualified(type($addr))`,` + let assemblyFormat = [{ + (`align` `(` $alignment^ `)`)? 
+ `(`$bitfield_info`,` $addr`:`qualified(type($addr))`,` $src`:`type($src) `)` attr-dict `->` type($result) }]; let builders = [ @@ -1764,14 +1768,15 @@ def CIR_SetBitfieldOp : CIR_Op<"set_bitfield"> { "unsigned":$size, "unsigned":$offset, "bool":$is_signed, - "bool":$is_volatile + "bool":$is_volatile, + CArg<"unsigned", "0">:$alignment ), [{ BitfieldInfoAttr info = BitfieldInfoAttr::get($_builder.getContext(), name, storage_type, size, offset, is_signed); - build($_builder, $_state, type, addr, src, info, is_volatile); + build($_builder, $_state, type, addr, src, info, alignment, is_volatile); }]> ]; } @@ -1823,20 +1828,23 @@ def CIR_GetBitfieldOp : CIR_Op<"get_bitfield"> { %2 = cir.load %0 : !cir.ptr>, !cir.ptr %3 = cir.get_member %2[1] {name = "e"} : !cir.ptr -> !cir.ptr - %4 = cir.get_bitfield(#bfi_e, %3 : !cir.ptr) -> !s32i + %4 = cir.get_bitfield align(4) (#bfi_e, %3 : !cir.ptr) -> !s32i ``` }]; let arguments = (ins Arg:$addr, BitfieldInfoAttr:$bitfield_info, + DefaultValuedOptionalAttr:$alignment, UnitAttr:$is_volatile ); let results = (outs CIR_IntType:$result); - let assemblyFormat = [{ `(`$bitfield_info `,` $addr attr-dict `:` - qualified(type($addr)) `)` `->` type($result) }]; + let assemblyFormat = [{ + (`align` `(` $alignment^ `)`)? 
+ `(`$bitfield_info `,` $addr attr-dict `:` + qualified(type($addr)) `)` `->` type($result) }]; let builders = [ OpBuilder<(ins "mlir::Type":$type, @@ -1846,14 +1854,15 @@ def CIR_GetBitfieldOp : CIR_Op<"get_bitfield"> { "unsigned":$size, "unsigned":$offset, "bool":$is_signed, - "bool":$is_volatile + "bool":$is_volatile, + CArg<"unsigned", "0">:$alignment ), [{ BitfieldInfoAttr info = BitfieldInfoAttr::get($_builder.getContext(), name, storage_type, size, offset, is_signed); - build($_builder, $_state, type, addr, info, is_volatile); + build($_builder, $_state, type, addr, info, alignment, is_volatile); }]> ]; } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index f855bdad2d7c3..73c9fb924f682 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -408,21 +408,23 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { } mlir::Value createSetBitfield(mlir::Location loc, mlir::Type resultType, - mlir::Value dstAddr, mlir::Type storageType, + Address dstAddr, mlir::Type storageType, mlir::Value src, const CIRGenBitFieldInfo &info, - bool isLvalueVolatile, bool useVolatile) { - return create(loc, resultType, dstAddr, storageType, - src, info.name, info.size, info.offset, - info.isSigned, isLvalueVolatile); + bool isLvalueVolatile) { + return create( + loc, resultType, dstAddr.getPointer(), storageType, src, info.name, + info.size, info.offset, info.isSigned, isLvalueVolatile, + dstAddr.getAlignment().getAsAlign().value()); } mlir::Value createGetBitfield(mlir::Location loc, mlir::Type resultType, - mlir::Value addr, mlir::Type storageType, + Address addr, mlir::Type storageType, const CIRGenBitFieldInfo &info, - bool isLvalueVolatile, bool useVolatile) { - return create(loc, resultType, addr, storageType, - info.name, info.size, info.offset, - info.isSigned, isLvalueVolatile); + bool isLvalueVolatile) { + return create( + loc, resultType, addr.getPointer(), storageType, info.name, 
info.size, + info.offset, info.isSigned, isLvalueVolatile, + addr.getAlignment().getAsAlign().value()); } }; diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 51da48d330f55..d63c18fc5056b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -333,13 +333,12 @@ mlir::Value CIRGenFunction::emitStoreThroughBitfieldLValue(RValue src, Address ptr = dst.getBitFieldAddress(); assert(!cir::MissingFeatures::armComputeVolatileBitfields()); - const bool useVolatile = false; mlir::Value dstAddr = dst.getAddress().getPointer(); - return builder.createSetBitfield(dstAddr.getLoc(), resLTy, dstAddr, + return builder.createSetBitfield(dstAddr.getLoc(), resLTy, ptr, ptr.getElementType(), src.getValue(), info, - dst.isVolatileQualified(), useVolatile); + dst.isVolatileQualified()); } RValue CIRGenFunction::emitLoadOfBitfieldLValue(LValue lv, SourceLocation loc) { @@ -352,8 +351,7 @@ RValue CIRGenFunction::emitLoadOfBitfieldLValue(LValue lv, SourceLocation loc) { assert(!cir::MissingFeatures::armComputeVolatileBitfields()); mlir::Value field = builder.createGetBitfield( - getLoc(loc), resLTy, ptr.getPointer(), ptr.getElementType(), info, - lv.isVolatile(), false); + getLoc(loc), resLTy, ptr, ptr.getElementType(), info, lv.isVolatile()); assert(!cir::MissingFeatures::opLoadEmitScalarRangeCheck() && "NYI"); return RValue::get(field); } @@ -366,7 +364,10 @@ Address CIRGenFunction::getAddrOfBitFieldStorage(LValue base, cir::PointerType fieldPtr = cir::PointerType::get(fieldType); cir::GetMemberOp sea = getBuilder().createGetMember( loc, fieldPtr, base.getPointer(), field->getName(), index); - return Address(sea, CharUnits::One()); + auto rec = cast(base.getAddress().getElementType()); + CharUnits offset = CharUnits::fromQuantity( + rec.getElementOffset(cgm.getDataLayout().layout, index)); + return Address(sea, base.getAlignment().alignmentAtOffset(offset)); } LValue 
CIRGenFunction::emitLValueForBitField(LValue base, diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 840e856ba0cf8..f075be8a9eed5 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -2571,7 +2571,7 @@ mlir::LogicalResult CIRToLLVMSetBitfieldOpLowering::matchAndRewrite( assert(storageSize > size && "Invalid bitfield size."); mlir::Value val = rewriter.create( - op.getLoc(), intType, adaptor.getAddr(), /* alignment */ 0, + op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), op.getIsVolatile()); srcVal = @@ -2588,7 +2588,7 @@ mlir::LogicalResult CIRToLLVMSetBitfieldOpLowering::matchAndRewrite( } rewriter.create(op.getLoc(), srcVal, adaptor.getAddr(), - /* alignment */ 0, op.getIsVolatile()); + op.getAlignment(), op.getIsVolatile()); mlir::Type resultTy = getTypeConverter()->convertType(op.getType()); @@ -2662,7 +2662,8 @@ mlir::LogicalResult CIRToLLVMGetBitfieldOpLowering::matchAndRewrite( computeBitfieldIntType(storageType, context, storageSize); mlir::Value val = rewriter.create( - op.getLoc(), intType, adaptor.getAddr(), 0, op.getIsVolatile()); + op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), + op.getIsVolatile()); val = rewriter.create(op.getLoc(), intType, val); if (info.getIsSigned()) { diff --git a/clang/test/CIR/CodeGen/bitfields.c b/clang/test/CIR/CodeGen/bitfields.c index 896acbfc854a4..a73c076ea81ab 100644 --- a/clang/test/CIR/CodeGen/bitfields.c +++ b/clang/test/CIR/CodeGen/bitfields.c @@ -87,14 +87,14 @@ int load_field(S* s) { // CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] // CIR: [[TMP1:%.*]] = cir.load{{.*}} [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][0] {name = "c"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP3:%.*]] = cir.get_bitfield(#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i +// CIR: [[TMP3:%.*]] = cir.get_bitfield align(4) (#bfi_c, [[TMP2]] 
: !cir.ptr) -> !s32i // LLVM: define dso_local i32 @load_field // LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 // LLVM: [[TMP1:%.*]] = alloca i32, i64 1, align 4 // LLVM: [[TMP2:%.*]] = load ptr, ptr [[TMP0]], align 8 // LLVM: [[TMP3:%.*]] = getelementptr %struct.S, ptr [[TMP2]], i32 0, i32 0 -// LLVM: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8 +// LLVM: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 4 // LLVM: [[TMP5:%.*]] = shl i64 [[TMP4]], 15 // LLVM: [[TMP6:%.*]] = ashr i64 [[TMP5]], 47 // LLVM: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 @@ -115,13 +115,13 @@ unsigned int load_field_unsigned(A* s) { //CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] {alignment = 8 : i64} //CIR: [[TMP1:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr //CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][3] {name = "more_bits"} : !cir.ptr -> !cir.ptr -//CIR: [[TMP3:%.*]] = cir.get_bitfield(#bfi_more_bits, [[TMP2]] : !cir.ptr) -> !u32i +//CIR: [[TMP3:%.*]] = cir.get_bitfield align(1) (#bfi_more_bits, [[TMP2]] : !cir.ptr) -> !u32i //LLVM: define dso_local i32 @load_field_unsigned //LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 //LLVM: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 //LLVM: [[TMP2:%.*]] = getelementptr %struct.A, ptr [[TMP1]], i32 0, i32 3 -//LLVM: [[TMP3:%.*]] = load i16, ptr [[TMP2]], align 2 +//LLVM: [[TMP3:%.*]] = load i16, ptr [[TMP2]], align 1 //LLVM: [[TMP4:%.*]] = lshr i16 [[TMP3]], 3 //LLVM: [[TMP5:%.*]] = and i16 [[TMP4]], 15 //LLVM: [[TMP6:%.*]] = zext i16 [[TMP5]] to i32 @@ -143,15 +143,15 @@ void store_field() { // CIR: [[TMP0:%.*]] = cir.alloca !rec_S, !cir.ptr // CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i // CIR: [[TMP2:%.*]] = cir.get_member [[TMP0]][1] {name = "e"} : !cir.ptr -> !cir.ptr -// CIR: cir.set_bitfield(#bfi_e, [[TMP2]] : !cir.ptr, [[TMP1]] : !s32i) +// CIR: cir.set_bitfield align(4) (#bfi_e, [[TMP2]] : !cir.ptr, [[TMP1]] : !s32i) // LLVM: define dso_local void @store_field() // LLVM: [[TMP0:%.*]] = alloca 
%struct.S, i64 1, align 4 // LLVM: [[TMP1:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 1 -// LLVM: [[TMP2:%.*]] = load i16, ptr [[TMP1]], align 2 +// LLVM: [[TMP2:%.*]] = load i16, ptr [[TMP1]], align 4 // LLVM: [[TMP3:%.*]] = and i16 [[TMP2]], -32768 // LLVM: [[TMP4:%.*]] = or i16 [[TMP3]], 3 -// LLVM: store i16 [[TMP4]], ptr [[TMP1]], align 2 +// LLVM: store i16 [[TMP4]], ptr [[TMP1]], align 4 // OGCG: define dso_local void @store_field() // OGCG: [[TMP0:%.*]] = alloca %struct.S, align 4 @@ -169,24 +169,24 @@ void store_bitfield_to_bitfield() { // CIR: cir.func {{.*@store_bitfield_to_bitfield}} // CIR: [[TMP0:%.*]] = cir.alloca !rec_S, !cir.ptr, ["s"] {alignment = 4 : i64} // CIR: [[TMP1:%.*]] = cir.get_member [[TMP0]][0] {name = "c"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP2:%.*]] = cir.get_bitfield(#bfi_c, [[TMP1]] : !cir.ptr) -> !s32i +// CIR: [[TMP2:%.*]] = cir.get_bitfield align(4) (#bfi_c, [[TMP1]] : !cir.ptr) -> !s32i // CIR: [[TMP3:%.*]] = cir.get_member [[TMP0]][0] {name = "a"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_a, [[TMP3]] : !cir.ptr, [[TMP2]] : !s32i) -> !s32i +// CIR: [[TMP4:%.*]] = cir.set_bitfield align(4) (#bfi_a, [[TMP3]] : !cir.ptr, [[TMP2]] : !s32i) -> !s32i // LLVM: define dso_local void @store_bitfield_to_bitfield() // LLVM: [[TMP0:%.*]] = alloca %struct.S, i64 1, align 4 // LLVM: [[TMP1:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 0 -// LLVM: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// LLVM: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 4 // LLVM: [[TMP3:%.*]] = shl i64 [[TMP2]], 15 // LLVM: [[TMP4:%.*]] = ashr i64 [[TMP3]], 47 // LLVM: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 // LLVM: [[TMP6:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 0 // LLVM: [[TMP7:%.*]] = zext i32 [[TMP5]] to i64 -// LLVM: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 8 +// LLVM: [[TMP8:%.*]] = load i64, ptr [[TMP6]], align 4 // LLVM: [[TMP9:%.*]] = and i64 [[TMP7]], 15 // LLVM: [[TMP10:%.*]] = 
and i64 [[TMP8]], -16 // LLVM: [[TMP11:%.*]] = or i64 [[TMP10]], [[TMP9]] -// LLVM: store i64 [[TMP11]], ptr [[TMP6]], align 8 +// LLVM: store i64 [[TMP11]], ptr [[TMP6]], align 4 // LLVM: [[TMP12:%.*]] = shl i64 [[TMP9]], 60 // LLVM: [[TMP13:%.*]] = ashr i64 [[TMP12]], 60 // LLVM: [[TMP15:%.*]] = trunc i64 [[TMP13]] to i32 @@ -222,16 +222,16 @@ void get_volatile(V* v) { // CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i // CIR: [[TMP2:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP3:%.*]] = cir.get_member [[TMP2]][0] {name = "b"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) {is_volatile} -> !s32i +// CIR: [[TMP4:%.*]] = cir.set_bitfield align(4) (#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) {is_volatile} -> !s32i // LLVM: define dso_local void @get_volatile // LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 // LLVM: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // LLVM: [[TMP2:%.*]] = getelementptr %struct.V, ptr [[TMP1]], i32 0, i32 0 -// LLVM: [[TMP3:%.*]] = load volatile i64, ptr [[TMP2]], align 8 +// LLVM: [[TMP3:%.*]] = load volatile i64, ptr [[TMP2]], align 4 // LLVM: [[TMP4:%.*]] = and i64 [[TMP3]], -1095216660481 // LLVM: [[TMP5:%.*]] = or i64 [[TMP4]], 12884901888 -// LLVM: store volatile i64 [[TMP5]], ptr [[TMP2]], align 8 +// LLVM: store volatile i64 [[TMP5]], ptr [[TMP2]], align 4 // OCGC: define dso_local void @get_volatile // OCGC: [[TMP0:%.*]] = alloca ptr, align 8 @@ -249,16 +249,16 @@ void set_volatile(V* v) { //CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i //CIR: [[TMP2:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr //CIR: [[TMP3:%.*]] = cir.get_member [[TMP2]][0] {name = "b"} : !cir.ptr -> !cir.ptr -//CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) {is_volatile} -> !s32i +//CIR: [[TMP4:%.*]] = cir.set_bitfield align(4) (#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) {is_volatile} -> !s32i // LLVM: define 
dso_local void @set_volatile // LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 // LLVM: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // LLVM: [[TMP2:%.*]] = getelementptr %struct.V, ptr [[TMP1]], i32 0, i32 0 -// LLVM: [[TMP3:%.*]] = load volatile i64, ptr [[TMP2]], align 8 +// LLVM: [[TMP3:%.*]] = load volatile i64, ptr [[TMP2]], align 4 // LLVM: [[TMP4:%.*]] = and i64 [[TMP3]], -1095216660481 // LLVM: [[TMP5:%.*]] = or i64 [[TMP4]], 12884901888 -// LLVM: store volatile i64 [[TMP5]], ptr [[TMP2]], align 8 +// LLVM: store volatile i64 [[TMP5]], ptr [[TMP2]], align 4 // OGCG: define dso_local void @set_volatile // OGCG: [[TMP0:%.*]] = alloca ptr, align 8 @@ -276,24 +276,24 @@ void unOp(S* s) { // CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] {alignment = 8 : i64} // CIR: [[TMP1:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][0] {name = "d"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP3:%.*]] = cir.get_bitfield(#bfi_d, [[TMP2]] : !cir.ptr) -> !s32i +// CIR: [[TMP3:%.*]] = cir.get_bitfield align(4) (#bfi_d, [[TMP2]] : !cir.ptr) -> !s32i // CIR: [[TMP4:%.*]] = cir.unary(inc, [[TMP3]]) nsw : !s32i, !s32i -// CIR: cir.set_bitfield(#bfi_d, [[TMP2]] : !cir.ptr, [[TMP4]] : !s32i) +// CIR: cir.set_bitfield align(4) (#bfi_d, [[TMP2]] : !cir.ptr, [[TMP4]] : !s32i) // LLVM: define {{.*@unOp}} // LLVM: [[TMP0:%.*]] = getelementptr %struct.S, ptr [[LOAD0:%.*]], i32 0, i32 0 -// LLVM: [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 8 +// LLVM: [[TMP1:%.*]] = load i64, ptr [[TMP0]], align 4 // LLVM: [[TMP2:%.*]] = shl i64 [[TMP1]], 13 // LLVM: [[TMP3:%.*]] = ashr i64 [[TMP2]], 62 // LLVM: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 // LLVM: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 // LLVM: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 -// LLVM: [[TMP7:%.*]] = load i64, ptr [[TMP0]], align 8 +// LLVM: [[TMP7:%.*]] = load i64, ptr [[TMP0]], align 4 // LLVM: [[TMP8:%.*]] = and i64 [[TMP6]], 3 // LLVM: [[TMP9:%.*]] = shl i64 
[[TMP8]], 49 // LLVM: [[TMP10:%.*]] = and i64 [[TMP7]], -1688849860263937 // LLVM: [[TMP11:%.*]] = or i64 [[TMP10]], [[TMP9]] -// LLVM: store i64 [[TMP11]], ptr [[TMP0]], align 8 +// LLVM: store i64 [[TMP11]], ptr [[TMP0]], align 4 // LLVM: [[TMP12:%.*]] = shl i64 [[TMP8]], 62 // LLVM: [[TMP13:%.*]] = ashr i64 [[TMP12]], 62 // LLVM: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 diff --git a/clang/test/CIR/CodeGen/bitfields.cpp b/clang/test/CIR/CodeGen/bitfields.cpp index 6715ebf1f48b6..7650e0b83faf6 100644 --- a/clang/test/CIR/CodeGen/bitfields.cpp +++ b/clang/test/CIR/CodeGen/bitfields.cpp @@ -39,14 +39,14 @@ int load_field(S* s) { // CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] // CIR: [[TMP1:%.*]] = cir.load{{.*}} [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][0] {name = "c"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP3:%.*]] = cir.get_bitfield(#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i +// CIR: [[TMP3:%.*]] = cir.get_bitfield align(4) (#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i // LLVM: define dso_local i32 @_Z10load_fieldP1S // LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 // LLVM: [[TMP1:%.*]] = alloca i32, i64 1, align 4 // LLVM: [[TMP2:%.*]] = load ptr, ptr [[TMP0]], align 8 // LLVM: [[TMP3:%.*]] = getelementptr %struct.S, ptr [[TMP2]], i32 0, i32 0 -// LLVM: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 8 +// LLVM: [[TMP4:%.*]] = load i64, ptr [[TMP3]], align 4 // LLVM: [[TMP5:%.*]] = shl i64 [[TMP4]], 15 // LLVM: [[TMP6:%.*]] = ashr i64 [[TMP5]], 47 // LLVM: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 @@ -67,15 +67,15 @@ void store_field() { // CIR: [[TMP0:%.*]] = cir.alloca !rec_S, !cir.ptr // CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i // CIR: [[TMP2:%.*]] = cir.get_member [[TMP0]][0] {name = "a"} : !cir.ptr -> !cir.ptr -// CIR: cir.set_bitfield(#bfi_a, [[TMP2]] : !cir.ptr, [[TMP1]] : !s32i) +// CIR: cir.set_bitfield align(4) (#bfi_a, [[TMP2]] : !cir.ptr, [[TMP1]] : !s32i) // LLVM: define dso_local void 
@_Z11store_fieldv // LLVM: [[TMP0:%.*]] = alloca %struct.S, i64 1, align 4 // LLVM: [[TMP1:%.*]] = getelementptr %struct.S, ptr [[TMP0]], i32 0, i32 0 -// LLVM: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 8 +// LLVM: [[TMP2:%.*]] = load i64, ptr [[TMP1]], align 4 // LLVM: [[TMP3:%.*]] = and i64 [[TMP2]], -16 // LLVM: [[TMP4:%.*]] = or i64 [[TMP3]], 3 -// LLVM: store i64 [[TMP4]], ptr [[TMP1]], align 8 +// LLVM: store i64 [[TMP4]], ptr [[TMP1]], align 4 // OGCG: define dso_local void @_Z11store_fieldv() // OGCG: [[TMP0:%.*]] = alloca %struct.S, align 4 @@ -93,25 +93,25 @@ void store_bitfield_to_bitfield(S* s) { // CIR: [[TMP1:%.*]] = cir.const #cir.int<3> : !s32i // CIR: [[TMP2:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP3:%.*]] = cir.get_member [[TMP2]][0] {name = "b"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP4:%.*]] = cir.set_bitfield(#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) -> !s32i +// CIR: [[TMP4:%.*]] = cir.set_bitfield align(4) (#bfi_b, [[TMP3]] : !cir.ptr, [[TMP1]] : !s32i) -> !s32i // CIR: [[TMP5:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr // CIR: [[TMP6:%.*]] = cir.get_member [[TMP5]][0] {name = "a"} : !cir.ptr -> !cir.ptr -// CIR: [[TMP7:%.*]] = cir.set_bitfield(#bfi_a, [[TMP6]] : !cir.ptr, [[TMP4]] : !s32i) -> !s32i +// CIR: [[TMP7:%.*]] = cir.set_bitfield align(4) (#bfi_a, [[TMP6]] : !cir.ptr, [[TMP4]] : !s32i) -> !s32i // LLVM: define dso_local void @_Z26store_bitfield_to_bitfieldP1S // LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 // LLVM: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8 // LLVM: [[TMP2:%.*]] = getelementptr %struct.S, ptr [[TMP1]], i32 0, i32 0 -// LLVM: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 +// LLVM: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 4 // LLVM: [[TMP4:%.*]] = and i64 [[TMP3]], -2147483633 // LLVM: [[TMP5:%.*]] = or i64 [[TMP4]], 48 -// LLVM: store i64 [[TMP5]], ptr [[TMP2]], align 8 +// LLVM: store i64 [[TMP5]], ptr [[TMP2]], align 4 // LLVM: [[TMP6:%.*]] = load ptr, ptr 
[[TMP0]], align 8 // LLVM: [[TMP7:%.*]] = getelementptr %struct.S, ptr [[TMP6]], i32 0, i32 0 -// LLVM: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 8 +// LLVM: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 4 // LLVM: [[TMP9:%.*]] = and i64 [[TMP8]], -16 // LLVM: [[TMP10:%.*]] = or i64 [[TMP9]], 3 -// LLVM: store i64 [[TMP10]], ptr [[TMP7]], align 8 +// LLVM: store i64 [[TMP10]], ptr [[TMP7]], align 4 // OGCG: define dso_local void @_Z26store_bitfield_to_bitfieldP1S // OGCG: [[TMP0:%.*]] = alloca ptr, align 8 diff --git a/clang/test/CIR/CodeGen/bitfields_be.c b/clang/test/CIR/CodeGen/bitfields_be.c index 6133927b67d21..77741ba74870b 100644 --- a/clang/test/CIR/CodeGen/bitfields_be.c +++ b/clang/test/CIR/CodeGen/bitfields_be.c @@ -25,7 +25,7 @@ int init(S* s) { //CIR: [[TMP0:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["s", init] {alignment = 8 : i64} //CIR: [[TMP1:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr>, !cir.ptr //CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][0] {name = "c"} : !cir.ptr -> !cir.ptr -//CIR: [[TMP3:%.*]] = cir.get_bitfield(#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i +//CIR: [[TMP3:%.*]] = cir.get_bitfield align(4) (#bfi_c, [[TMP2]] : !cir.ptr) -> !s32i //LLVM: define dso_local i32 @init(ptr %0) { //LLVM: [[TMP0:%.*]] = alloca ptr, i64 1, align 8 @@ -57,7 +57,7 @@ void load(S* s) { // CIR: %[[MIN1:.*]] = cir.unary(minus, %[[CONST1]]) nsw : !s32i, !s32i // CIR: %[[VAL0:.*]] = cir.load align(8) %[[PTR0]] : !cir.ptr>, !cir.ptr // CIR: %[[GET0:.*]] = cir.get_member %[[VAL0]][0] {name = "a"} : !cir.ptr -> !cir.ptr -// CIR: %[[SET0:.*]] = cir.set_bitfield(#bfi_a, %[[GET0]] : !cir.ptr, %[[MIN1]] : !s32i) -> !s32i +// CIR: %[[SET0:.*]] = cir.set_bitfield align(4) (#bfi_a, %[[GET0]] : !cir.ptr, %[[MIN1]] : !s32i) -> !s32i // LLVM: define dso_local void @load // LLVM: %[[PTR0:.*]] = load ptr @@ -65,50 +65,50 @@ void load(S* s) { // LLVM: %[[VAL0:.*]] = load i32, ptr %[[GET0]], align 4 // LLVM: %[[AND0:.*]] = and i32 %[[VAL0]], 268435455 // LLVM: %[[OR0:.*]] = or i32 
%[[AND0]], -1073741824 -// LLVM: store i32 %[[OR0]], ptr %[[GET0]] +// LLVM: store i32 %[[OR0]], ptr %[[GET0]], align 4 // OGCG: define dso_local void @load // OGCG: %[[PTR0:.*]] = load ptr -// OGCG: %[[VAL0:.*]] = load i32, ptr %[[PTR0]] +// OGCG: %[[VAL0:.*]] = load i32, ptr %[[PTR0]], align 4 // OGCG: %[[AND0:.*]] = and i32 %[[VAL0]], 268435455 // OGCG: %[[OR0:.*]] = or i32 %[[AND0]], -1073741824 -// OGCG: store i32 %[[OR0]], ptr %[[PTR0]] +// OGCG: store i32 %[[OR0]], ptr %[[PTR0]], align 4 // field 'b' // CIR: %[[CONST2:.*]] = cir.const #cir.int<42> : !s32i // CIR: %[[VAL1:.*]] = cir.load align(8) %[[PTR0]] : !cir.ptr>, !cir.ptr // CIR: %[[GET1:.*]] = cir.get_member %[[VAL1]][0] {name = "b"} : !cir.ptr -> !cir.ptr -// CIR: %[[SET1:.*]] = cir.set_bitfield(#bfi_b, %[[GET1]] : !cir.ptr, %[[CONST2]] : !s32i) -> !s32i +// CIR: %[[SET1:.*]] = cir.set_bitfield align(4) (#bfi_b, %[[GET1]] : !cir.ptr, %[[CONST2]] : !s32i) -> !s32i // LLVM: %[[PTR1:.*]] = load ptr // LLVM: %[[GET1:.*]] = getelementptr %struct.S, ptr %[[PTR1]], i32 0, i32 0 // LLVM: %[[VAL1:.*]] = load i32, ptr %[[GET1]], align 4 // LLVM: %[[AND1:.*]] = and i32 %[[VAL1]], -268304385 // LLVM: %[[OR1:.*]] = or i32 %[[AND1]], 5505024 -// LLVM: store i32 %[[OR1]], ptr %[[GET1]] +// LLVM: store i32 %[[OR1]], ptr %[[GET1]], align 4 // OGCG: %[[PTR1:.*]] = load ptr -// OGCG: %[[VAL1:.*]] = load i32, ptr %[[PTR1]] +// OGCG: %[[VAL1:.*]] = load i32, ptr %[[PTR1]], align 4 // OGCG: %[[AND1:.*]] = and i32 %[[VAL1]], -268304385 // OGCG: %[[OR1:.*]] = or i32 %[[AND1]], 5505024 -// OGCG: store i32 %[[OR1]], ptr %[[PTR1]] +// OGCG: store i32 %[[OR1]], ptr %[[PTR1]], align 4 // field 'c' // CIR: %[[CONST3:.*]] = cir.const #cir.int<12345> : !s32i // CIR: %[[MIN2:.*]] = cir.unary(minus, %[[CONST3]]) nsw : !s32i, !s32i // CIR: %[[VAL2:.*]] = cir.load align(8) %[[PTR0]] : !cir.ptr>, !cir.ptr // CIR: %[[GET2:.*]] = cir.get_member %[[VAL2]][0] {name = "c"} : !cir.ptr -> !cir.ptr -// CIR: %[[SET2:.*]] = 
cir.set_bitfield(#bfi_c, %[[GET2]] : !cir.ptr, %[[MIN2]] : !s32i) -> !s32i +// CIR: %[[SET2:.*]] = cir.set_bitfield align(4) (#bfi_c, %[[GET2]] : !cir.ptr, %[[MIN2]] : !s32i) -> !s32i // LLVM: %[[PTR2:.*]] = load ptr // LLVM: %[[GET2:.*]] = getelementptr %struct.S, ptr %[[PTR2]], i32 0, i32 0 // LLVM: %[[VAL2:.*]] = load i32, ptr %[[GET2]], align 4 // LLVM: %[[AND2:.*]] = and i32 %[[VAL2]], -131072 // LLVM: %[[OR2:.*]] = or i32 %[[AND2]], 118727 -// LLVM: store i32 %[[OR2]], ptr %[[GET2]] +// LLVM: store i32 %[[OR2]], ptr %[[GET2]], align 4 // OGCG: %[[PTR2:.*]] = load ptr -// OGCG: %[[VAL2:.*]] = load i32, ptr %[[PTR2]] +// OGCG: %[[VAL2:.*]] = load i32, ptr %[[PTR2]], align 4 // OGCG: %[[AND2:.*]] = and i32 %[[VAL2]], -131072 // OGCG: %[[OR2:.*]] = or i32 %[[AND2]], 118727 -// OGCG: store i32 %[[OR2]], ptr %[[PTR2]] +// OGCG: store i32 %[[OR2]], ptr %[[PTR2]], align 4 From 965b68e8f26ea51202adfd2ab6429a68e8ce63c3 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Fri, 18 Jul 2025 14:20:13 -0700 Subject: [PATCH 408/813] [NVPTX] Prevent fptrunc of v2f32 from being folded into store (#149571) --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 2 ++ .../test/CodeGen/NVPTX/bf16x2-instructions.ll | 7 ++-- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 9 +++-- llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 35 +++++++++++++++++++ 4 files changed, 48 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 31b236a6126ad..77784be467e44 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -731,6 +731,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setTruncStoreAction(MVT::f32, MVT::bf16, Expand); setTruncStoreAction(MVT::f64, MVT::bf16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); + setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand); // PTX 
does not support load / store predicate registers setOperationAction(ISD::LOAD, MVT::i1, Custom); diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index e2a914d8cfc36..ba5813c869236 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -359,11 +359,12 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b, define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK-LABEL: test_fptrunc_2xfloat( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %rd1; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; %r = fptrunc <2 x float> %a to <2 x bfloat> ret <2 x bfloat> %r diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index 3baefde072be7..a077ca17e4215 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -1499,11 +1499,16 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK-LABEL: test_fptrunc_2xfloat( ; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0]; -; CHECK-NEXT: st.param.b32 [func_retval0], %rd1; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r2; +; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r1; +; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; +; CHECK-NEXT: st.param.b32 [func_retval0], 
%r3; ; CHECK-NEXT: ret; %r = fptrunc <2 x float> %a to <2 x half> ret <2 x half> %r diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index da9e2d8cba139..2109449fa586c 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -2108,6 +2108,41 @@ define <2 x float> @test_uitofp_2xi32_to_2xfloat(<2 x i32> %a) #0 { ret <2 x float> %r } +define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) { +; CHECK-LABEL: test_trunc_to_v2bf16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2bf16_param_0]; +; CHECK-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; +; CHECK-NEXT: st.b32 [%rd2], %r3; +; CHECK-NEXT: ret; + %trunc = fptrunc <2 x float> %a to <2 x bfloat> + store <2 x bfloat> %trunc, ptr %p + ret void +} + +define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) { +; CHECK-LABEL: test_trunc_to_v2f16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2f16_param_0]; +; CHECK-NEXT: cvt.rn.f16x2.f32 %r3, %r2, %r1; +; CHECK-NEXT: st.b32 [%rd2], %r3; +; CHECK-NEXT: ret; + %trunc = fptrunc <2 x float> %a to <2 x half> + store <2 x half> %trunc, ptr %p + ret void +} + + attributes #0 = { nounwind } attributes #1 = { "unsafe-fp-math" = "true" } attributes #2 = { "denormal-fp-math"="preserve-sign" } From 3641448e08961d8b8621fafa01167f96d948ee9e Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 18 Jul 2025 14:15:14 -0700 Subject: [PATCH 409/813] [lldb] Use StopInfoSP instead of StopInfo* (NFC) Don't make assumptions about the lifetime of the underlying object and use the shared_ptr to 
participate in reference counting and extend the lifetime of the object to the end of the lexical scope. --- lldb/source/Target/Thread.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index c68894808eacc..8c3e19725f8cb 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -710,9 +710,8 @@ bool Thread::ShouldResume(StateType resume_state) { const uint32_t process_stop_id = GetProcess()->GetStopID(); if (m_stop_info_stop_id == process_stop_id && (m_stop_info_sp && m_stop_info_sp->IsValid())) { - StopInfo *stop_info = GetPrivateStopInfo().get(); - if (stop_info) - stop_info->WillResume(resume_state); + if (StopInfoSP stop_info_sp = GetPrivateStopInfo()) + stop_info_sp->WillResume(resume_state); } // Tell all the plans that we are about to resume in case they need to clear From 1b8a136a09bfed49ae008a354946804230055153 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Fri, 18 Jul 2025 14:31:21 -0700 Subject: [PATCH 410/813] [Sanitizer] remove array-bounds-pseudofn (#149430) This has been replaced by -fsanitize-annotate-debug-info --- clang/lib/CodeGen/CGDebugInfo.cpp | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 446cf8d9e05c6..d6a5c4c476d5c 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -58,13 +58,6 @@ using namespace clang; using namespace clang::CodeGen; -// TODO: consider deprecating ClArrayBoundsPseudoFn; functionality is subsumed -// by -fsanitize-annotate-debug-info -static llvm::cl::opt ClArrayBoundsPseudoFn( - "array-bounds-pseudofn", llvm::cl::Hidden, llvm::cl::Optional, - llvm::cl::desc("Emit debug info that places array-bounds instrumentation " - "in an inline function called __ubsan_check_array_bounds.")); - static uint32_t getTypeAlignIfRequired(const Type *Ty, const ASTContext 
&Ctx) { auto TI = Ctx.getTypeInfo(Ty); if (TI.isAlignRequired()) @@ -6482,7 +6475,11 @@ llvm::DILocation *CodeGenFunction::SanitizerAnnotateDebugInfo( SanitizerHandler Handler) { llvm::DILocation *CheckDebugLoc = Builder.getCurrentDebugLocation(); auto *DI = getDebugInfo(); - if (!DI) + if (!DI || !CheckDebugLoc) + return CheckDebugLoc; + const auto &AnnotateDebugInfo = + CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo; + if (AnnotateDebugInfo.empty()) return CheckDebugLoc; std::string Label; @@ -6491,14 +6488,8 @@ llvm::DILocation *CodeGenFunction::SanitizerAnnotateDebugInfo( else Label = SanitizerHandlerToCheckLabel(Handler); - for (auto Ord : Ordinals) { - // TODO: deprecate ClArrayBoundsPseudoFn - if (((ClArrayBoundsPseudoFn && Ord == SanitizerKind::SO_ArrayBounds) || - CGM.getCodeGenOpts().SanitizeAnnotateDebugInfo.has(Ord)) && - CheckDebugLoc) { - return DI->CreateSyntheticInlineAt(CheckDebugLoc, Label); - } - } + if (any_of(Ordinals, [&](auto Ord) { return AnnotateDebugInfo.has(Ord); })) + return DI->CreateSyntheticInlineAt(CheckDebugLoc, Label); return CheckDebugLoc; } From 6d8e53d4afe46608f47bcb014387c053829cdcf1 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 18 Jul 2025 14:38:46 -0700 Subject: [PATCH 411/813] [AMDGPU] Support nv memory instructions modifier on gfx1250 (#149582) --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 26 +++++++- llvm/lib/Target/AMDGPU/BUFInstructions.td | 1 + llvm/lib/Target/AMDGPU/FLATInstructions.td | 3 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 3 + llvm/lib/Target/AMDGPU/SIDefines.h | 2 + llvm/lib/Target/AMDGPU/SIInstrFormats.td | 1 + llvm/lib/Target/AMDGPU/SMInstructions.td | 3 +- llvm/test/MC/AMDGPU/gfx1250_asm_smem.s | 14 +++++ .../MC/AMDGPU/gfx1250_asm_vbuffer_mubuf.s | 20 +++++++ llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s | 60 +++++++++++++++++++ .../Disassembler/AMDGPU/gfx1250_dasm_smem.txt | 7 +++ .../AMDGPU/gfx1250_dasm_vbuffer_mubuf.txt | 10 ++++ .../AMDGPU/gfx1250_dasm_vflat.txt | 30 ++++++++++ 13 
files changed, 177 insertions(+), 3 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_smem.s create mode 100644 llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vbuffer_mubuf.txt diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 43d4e8db791b0..de17fccdda902 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -5280,6 +5280,15 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst, unsigned CPol = Inst.getOperand(CPolPos).getImm(); + if (!isGFX1250()) { + if (CPol & CPol::NV) { + SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands); + StringRef CStr(S.getPointer()); + S = SMLoc::getFromPointer(&CStr.data()[CStr.find("nv")]); + Error(S, "nv is not supported on this GPU"); + } + } + if (isGFX12Plus()) return validateTHAndScopeBits(Inst, Operands, CPol); @@ -6916,6 +6925,7 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) { int64_t CPolVal = 0; ParseStatus ResTH = ParseStatus::NoMatch; ParseStatus ResScope = ParseStatus::NoMatch; + ParseStatus ResNV = ParseStatus::NoMatch; for (;;) { if (ResTH.isNoMatch()) { @@ -6940,10 +6950,24 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) { } } + // NV bit exists on GFX12+, but does something starting from GFX1250. + // Allow parsing on all GFX12 and fail on validation for better + // diagnostics. 
+ if (ResNV.isNoMatch()) { + if (trySkipId("nv")) { + ResNV = ParseStatus::Success; + CPolVal |= CPol::NV; + continue; + } else if (trySkipId("no", "nv")) { + ResNV = ParseStatus::Success; + continue; + } + } + break; } - if (ResTH.isNoMatch() && ResScope.isNoMatch()) + if (ResTH.isNoMatch() && ResScope.isNoMatch() && ResNV.isNoMatch()) return ParseStatus::NoMatch; Operands.push_back(AMDGPUOperand::CreateImm(this, CPolVal, StringLoc, diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 0caabe41e9b79..e994aeeb82251 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -2451,6 +2451,7 @@ class VBUFFER_Real op, BUF_Pseudo ps, string real_name> : let Inst{62} = ps.offen; let Inst{63} = ps.idxen; + let Inst{7} = cpol{5}; // nv let Inst{54-53} = cpol{2-1}; // th{2-1} let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0} let Inst{51-50} = cpol{4-3}; // scope diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 1432b5940f3f0..f7f29f17f9d0e 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -183,7 +183,7 @@ class VFLAT_Real op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : bits<7> saddr; bits<8> vdst; - bits<6> cpol; + bits<12> cpol; bits<8> vdata; // vsrc bits<8> vaddr; bits<24> offset; @@ -193,6 +193,7 @@ class VFLAT_Real op, FLAT_Pseudo ps, string opName = ps.Mnemonic> : let Inst{31-26} = 0x3b; let Inst{39-32} = !if(ps.has_vdst, vdst, ?); let Inst{49} = ps.sve; + let Inst{7} = cpol{5}; // nv let Inst{54-53} = cpol{2-1}; // th{2-1} let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0} let Inst{51-50} = cpol{4-3}; // scope diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index ec9248b972ec4..44d2f947ec9c2 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp 
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -160,6 +160,9 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, printTH(MI, TH, Scope, O); printScope(Scope, O); + if (Imm & CPol::NV) + O << " nv"; + return; } diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index a8649970aa825..edc74605ab241 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -398,6 +398,8 @@ enum CPol { SCOPE_DEV = 2 << 3, SCOPE_SYS = 3 << 3, + NV = 1 << 5, // Non-volatile bit + SWZ = 1 << 6, // Swizzle bit ALL = TH | SCOPE, diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index a368bc5d0b1a1..6b419347c01d9 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -317,6 +317,7 @@ def CPolBit { int SLC = 1; int DLC = 2; int SCC = 4; + int NV = 5; } class VOPDstOperand : RegisterOperand ; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 37dcc10086257..d8b52d271a964 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -87,7 +87,7 @@ class SM_Real bits<7> sdst; bits<32> offset; bits<8> soffset; - bits<5> cpol; + bits<12> cpol; } class OffsetMode op, string ps, string opName, OffsetMode offs RegisterClass BaseClass = !cast(ps # offsets.Variant).BaseClass; let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol)); + let Inst{20} = cpol{CPolBit.NV}; // non-volatile let Inst{22-21} = cpol{4-3}; // scope let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported } diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s b/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s new file mode 100644 index 0000000000000..899c4c7aca0ba --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_smem.s @@ -0,0 +1,14 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck 
--check-prefix=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +s_load_b32 s4, s[2:3], 10 nv +// GFX1250: s_load_b32 s4, s[2:3], 0xa nv ; encoding: [0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf8] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}s_load_b32 s4, s[2:3], 10 nv +// GFX12-ERR-NEXT:{{^}} ^ + +s_buffer_load_i8 s5, s[4:7], s0 nv +// GFX1250: s_buffer_load_i8 s5, s[4:7], s0 offset:0x0 nv ; encoding: [0x42,0x01,0x13,0xf4,0x00,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}s_buffer_load_i8 s5, s[4:7], s0 nv +// GFX12-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf.s new file mode 100644 index 0000000000000..1d14bd91a7569 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vbuffer_mubuf.s @@ -0,0 +1,20 @@ +// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s +// RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s + +buffer_load_b32 v5, v1, s[8:11], s3 offen offset:4095 nv +// GFX1250: buffer_load_b32 v5, v1, s[8:11], s3 offen offset:4095 nv ; encoding: [0x83,0x00,0x05,0xc4,0x05,0x10,0x80,0x40,0x01,0xff,0x0f,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}buffer_load_b32 v5, v1, s[8:11], s3 offen offset:4095 nv +// GFX12-ERR-NEXT:{{^}} ^ + +buffer_store_b128 v[2:5], v0, s[12:15], s4 idxen offset:4095 nv +// GFX1250: buffer_store_b128 v[2:5], v0, s[12:15], s4 idxen offset:4095 nv ; encoding: [0x84,0x40,0x07,0xc4,0x02,0x18,0x80,0x80,0x00,0xff,0x0f,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// 
GFX12-ERR-NEXT:{{^}}buffer_store_b128 v[2:5], v0, s[12:15], s4 idxen offset:4095 nv +// GFX12-ERR-NEXT:{{^}} ^ + +buffer_atomic_and_b32 v5, v1, s[8:11], s3 offen offset:4095 nv +// GFX1250: buffer_atomic_and_b32 v5, v1, s[8:11], s3 offen offset:4095 nv ; encoding: [0x83,0x00,0x0f,0xc4,0x05,0x10,0x80,0x40,0x01,0xff,0x0f,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}buffer_atomic_and_b32 v5, v1, s[8:11], s3 offen offset:4095 nv +// GFX12-ERR-NEXT:{{^}} ^ diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s index 737d7b3de4e92..488040e1b5390 100644 --- a/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vflat.s @@ -1,6 +1,66 @@ // RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -show-encoding %s | FileCheck --check-prefix=GFX1250 %s // RUN: not llvm-mc -triple=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: --strict-whitespace %s +global_load_b32 v0, v[2:3], off nv +// GFX1250: global_load_b32 v0, v[2:3], off nv ; encoding: [0xfc,0x00,0x05,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}global_load_b32 v0, v[2:3], off nv +// GFX12-ERR-NEXT:{{^}} ^ + +global_store_b32 v[2:3], v0, off nv +// GFX1250: global_store_b32 v[2:3], v0, off nv ; encoding: [0xfc,0x80,0x06,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}global_store_b32 v[2:3], v0, off nv +// GFX12-ERR-NEXT:{{^}} ^ + +global_atomic_add v[2:3], v2, off nv +// GFX1250: global_atomic_add_u32 v[2:3], v2, off nv ; encoding: [0xfc,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}global_atomic_add v[2:3], v2, off nv +// 
GFX12-ERR-NEXT:{{^}} ^ + +global_load_addtid_b32 v5, s[2:3] nv +// GFX1250: global_load_addtid_b32 v5, s[2:3] nv ; encoding: [0x82,0x00,0x0a,0xee,0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}global_load_addtid_b32 v5, s[2:3] nv +// GFX12-ERR-NEXT:{{^}} ^ + +scratch_load_b32 v0, v2, off nv +// GFX1250: scratch_load_b32 v0, v2, off nv ; encoding: [0xfc,0x00,0x05,0xed,0x00,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}scratch_load_b32 v0, v2, off nv +// GFX12-ERR-NEXT:{{^}} ^ + +scratch_store_b32 v2, v0, off nv +// GFX1250: scratch_store_b32 v2, v0, off nv ; encoding: [0xfc,0x80,0x06,0xed,0x00,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}scratch_store_b32 v2, v0, off nv +// GFX12-ERR-NEXT:{{^}} ^ + +flat_load_b32 v0, v[2:3] nv +// GFX1250: flat_load_b32 v0, v[2:3] nv ; encoding: [0xfc,0x00,0x05,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}flat_load_b32 v0, v[2:3] nv +// GFX12-ERR-NEXT:{{^}} ^ + +flat_store_b32 v[2:3], v0 nv +// GFX1250: flat_store_b32 v[2:3], v0 nv ; encoding: [0xfc,0x80,0x06,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}flat_store_b32 v[2:3], v0 nv +// GFX12-ERR-NEXT:{{^}} ^ + +flat_atomic_add v[2:3], v2 nv +// GFX1250: flat_atomic_add_u32 v[2:3], v2 nv ; encoding: [0xfc,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}flat_atomic_add v[2:3], v2 nv +// GFX12-ERR-NEXT:{{^}} ^ + +scratch_load_b32 v5, v2, off nv +// GFX1250: scratch_load_b32 v5, v2, off nv ; 
encoding: [0xfc,0x00,0x05,0xed,0x05,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +// GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: nv is not supported on this GPU +// GFX12-ERR-NEXT:{{^}}scratch_load_b32 v5, v2, off nv +// GFX12-ERR-NEXT:{{^}} ^ + tensor_save s[0:1] // GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] // GFX12-ERR: :[[@LINE-2]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt new file mode 100644 index 0000000000000..4bd9ab45af6bd --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_smem.txt @@ -0,0 +1,7 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s + +# GFX1250: s_buffer_load_i8 s5, s[4:7], s0 offset:0x0 nv ; encoding: [0x42,0x01,0x13,0xf4,0x00,0x00,0x00,0x00] +0x42,0x01,0x13,0xf4,0x00,0x00,0x00,0x00 + +# GFX1250: s_load_b32 s4, s[2:3], 0xa nv ; encoding: [0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf8] +0x01,0x01,0x10,0xf4,0x0a,0x00,0x00,0xf8 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vbuffer_mubuf.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vbuffer_mubuf.txt new file mode 100644 index 0000000000000..a2f12115bb64b --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vbuffer_mubuf.txt @@ -0,0 +1,10 @@ +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1250 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1250 %s + +# GFX1250: buffer_atomic_and_b32 v5, v1, s[8:11], s3 offen offset:4095 nv ; encoding: [0x83,0x00,0x0f,0xc4,0x05,0x10,0x80,0x40,0x01,0xff,0x0f,0x00] +0x83,0x00,0x0f,0xc4,0x05,0x10,0x80,0x40,0x01,0xff,0x0f,0x00 + +# GFX1250: buffer_load_b32 v5, v1, s[8:11], s3 offen offset:4095 nv ; encoding: [0x83,0x00,0x05,0xc4,0x05,0x10,0x80,0x40,0x01,0xff,0x0f,0x00] +0x83,0x00,0x05,0xc4,0x05,0x10,0x80,0x40,0x01,0xff,0x0f,0x00 + +# GFX1250: buffer_store_b128 
v[2:5], v0, s[12:15], s4 idxen offset:4095 nv ; encoding: [0x84,0x40,0x07,0xc4,0x02,0x18,0x80,0x80,0x00,0xff,0x0f,0x00] +0x84,0x40,0x07,0xc4,0x02,0x18,0x80,0x80,0x00,0xff,0x0f,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt index 55bc3e7a5746c..fcbb58bf7865e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vflat.txt @@ -2826,6 +2826,36 @@ # GFX1250: scratch_store_d16_hi_b8 v1, v2, s3 ; encoding: [0x03,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00] 0x03,0x00,0x09,0xed,0x00,0x00,0x02,0x01,0x01,0x00,0x00,0x00 +# GFX1250: flat_atomic_add_u32 v[2:3], v2 nv ; encoding: [0xfc,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00] +0xfc,0x40,0x0d,0xec,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00 + +# GFX1250: flat_load_b32 v0, v[2:3] nv ; encoding: [0xfc,0x00,0x05,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00] +0xfc,0x00,0x05,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: flat_store_b32 v[2:3], v0 nv ; encoding: [0xfc,0x80,0x06,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00] +0xfc,0x80,0x06,0xec,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_atomic_add_u32 v[2:3], v2, off nv ; encoding: [0xfc,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00] +0xfc,0x40,0x0d,0xee,0x00,0x00,0x00,0x01,0x02,0x00,0x00,0x00 + +# GFX1250: global_load_addtid_b32 v5, s[2:3] nv ; encoding: [0x82,0x00,0x0a,0xee,0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +0x82,0x00,0x0a,0xee,0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + +# GFX1250: global_load_b32 v0, v[2:3], off nv ; encoding: [0xfc,0x00,0x05,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00] +0xfc,0x00,0x05,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: global_store_b32 v[2:3], v0, off nv ; encoding: [0xfc,0x80,0x06,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00] +0xfc,0x80,0x06,0xee,0x00,0x00,0x00,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_b32 v0, v2, 
off nv ; encoding: [0xfc,0x00,0x05,0xed,0x00,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0xfc,0x00,0x05,0xed,0x00,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_store_b32 v2, v0, off nv ; encoding: [0xfc,0x80,0x06,0xed,0x00,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0xfc,0x80,0x06,0xed,0x00,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + +# GFX1250: scratch_load_b32 v5, v2, off nv ; encoding: [0xfc,0x00,0x05,0xed,0x05,0x00,0x02,0x00,0x02,0x00,0x00,0x00] +0xfc,0x00,0x05,0xed,0x05,0x00,0x02,0x00,0x02,0x00,0x00,0x00 + # GFX1250: tensor_save s[0:1] ; encoding: [0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00] 0x00,0x80,0x1b,0xee,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 From 921c6dbecaf49e3ed24b94802f094cd7f61f1873 Mon Sep 17 00:00:00 2001 From: Prabhu Rajasekaran Date: Fri, 18 Jul 2025 14:40:54 -0700 Subject: [PATCH 412/813] [llvm] Introduce callee_type metadata Introduce `callee_type` metadata which will be attached to the indirect call instructions. The `callee_type` metadata will be used to generate `.callgraph` section described in this RFC: https://lists.llvm.org/pipermail/llvm-dev/2021-July/151739.html Reviewers: morehouse, petrhosek, nikic, ilovepi Reviewed By: nikic, ilovepi Pull Request: https://github.com/llvm/llvm-project/pull/87573 --- llvm/docs/CalleeTypeMetadata.rst | 33 ++++ llvm/docs/LangRef.rst | 5 + llvm/docs/Reference.rst | 1 + llvm/include/llvm/IR/FixedMetadataKinds.def | 1 + llvm/include/llvm/IR/Metadata.h | 9 + llvm/lib/IR/Metadata.cpp | 18 ++ llvm/lib/IR/Verifier.cpp | 31 ++++ .../InstCombine/InstCombineCalls.cpp | 7 + llvm/lib/Transforms/Utils/Local.cpp | 6 + llvm/lib/Transforms/Utils/ValueMapper.cpp | 7 + llvm/test/Assembler/callee-type-metadata.ll | 21 +++ .../Inline/drop-callee-type-metadata.ll | 41 +++++ .../InstCombine/drop-callee-type-metadata.ll | 25 +++ .../SimplifyCFG/merge-callee-type-metadata.ll | 167 ++++++++++++++++++ llvm/test/Verifier/callee-type-metadata.ll | 33 ++++ 15 files changed, 405 insertions(+) create mode 100644 
llvm/docs/CalleeTypeMetadata.rst create mode 100644 llvm/test/Assembler/callee-type-metadata.ll create mode 100644 llvm/test/Transforms/Inline/drop-callee-type-metadata.ll create mode 100644 llvm/test/Transforms/InstCombine/drop-callee-type-metadata.ll create mode 100644 llvm/test/Transforms/SimplifyCFG/merge-callee-type-metadata.ll create mode 100644 llvm/test/Verifier/callee-type-metadata.ll diff --git a/llvm/docs/CalleeTypeMetadata.rst b/llvm/docs/CalleeTypeMetadata.rst new file mode 100644 index 0000000000000..45d0657966a8c --- /dev/null +++ b/llvm/docs/CalleeTypeMetadata.rst @@ -0,0 +1,33 @@ +==================== +Callee Type Metadata +==================== + +Introduction +============ +This ``!callee_type`` metadata is introduced to support the generation of a call graph +section in the object file. The ``!callee_type`` metadata is used +to identify the types of the intended callees of indirect call instructions. The ``!callee_type`` metadata is a +list of one or more generalized ``!type`` metadata objects (See :doc:`TypeMetadata`) with each ``!type`` +metadata pointing to a callee's :ref:`type identifier `. +LLVM's `Control Flow Integrity (CFI)`_ also uses the ``!type`` metadata in its implementation. + +.. _Control Flow Integrity (CFI): https://clang.llvm.org/docs/ControlFlowIntegrity.html + +.. _calleetype-type-identifier: + +Type identifier +================ + +The type for an indirect call target is the callee's function signature. +Mapping from a type to an identifier is an ABI detail. +In the current implementation, an identifier of type T is +computed as follows: + + - Obtain the generalized mangled name for “typeinfo name for T”. + - Compute MD5 hash of the name as a string. + - Reinterpret the first 8 bytes of the hash as a little-endian 64-bit integer. + +To avoid mismatched pointer types, generalizations are applied. +Pointers in return and argument types are treated as equivalent as long as the qualifiers for the +type they point to match. 
For example, ``char*``, ``char**``, and ``int*`` are considered equivalent +types. However, ``char*`` and ``const char*`` are considered distinct types. diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 371f356c80b0a..be5f7fbd90b5e 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -8171,6 +8171,11 @@ change in the future. See :doc:`TypeMetadata`. +'``callee_type``' Metadata +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +See :doc:`CalleeTypeMetadata`. + '``associated``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst index cb9576b15d701..35a6f59ecbf35 100644 --- a/llvm/docs/Reference.rst +++ b/llvm/docs/Reference.rst @@ -14,6 +14,7 @@ LLVM and API reference documentation. BlockFrequencyTerminology BranchWeightMetadata Bugpoint + CalleeTypeMetadata CIBestPractices CommandGuide/index ConvergenceAndUniformity diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def index df572e8791e13..90276eae13e4b 100644 --- a/llvm/include/llvm/IR/FixedMetadataKinds.def +++ b/llvm/include/llvm/IR/FixedMetadataKinds.def @@ -53,3 +53,4 @@ LLVM_FIXED_MD_KIND(MD_DIAssignID, "DIAssignID", 38) LLVM_FIXED_MD_KIND(MD_coro_outside_frame, "coro.outside.frame", 39) LLVM_FIXED_MD_KIND(MD_mmra, "mmra", 40) LLVM_FIXED_MD_KIND(MD_noalias_addrspace, "noalias.addrspace", 41) +LLVM_FIXED_MD_KIND(MD_callee_type, "callee_type", 42) diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h index 2de26c0c1f7c7..af252aa24567a 100644 --- a/llvm/include/llvm/IR/Metadata.h +++ b/llvm/include/llvm/IR/Metadata.h @@ -1255,6 +1255,13 @@ class MDNode : public Metadata { bool isReplaceable() const { return isTemporary() || isAlwaysReplaceable(); } bool isAlwaysReplaceable() const { return getMetadataID() == DIAssignIDKind; } + /// Check if this is a valid generalized type metadata node. 
+ bool hasGeneralizedMDString() { + if (getNumOperands() < 2 || !isa(getOperand(1))) + return false; + return cast(getOperand(1))->getString().ends_with(".generalized"); + } + unsigned getNumTemporaryUses() const { assert(isTemporary() && "Only for temporaries"); return Context.getReplaceableUses()->getNumUses(); @@ -1467,6 +1474,8 @@ class MDNode : public Metadata { const Instruction *BInstr); LLVM_ABI static MDNode *getMergedMemProfMetadata(MDNode *A, MDNode *B); LLVM_ABI static MDNode *getMergedCallsiteMetadata(MDNode *A, MDNode *B); + LLVM_ABI static MDNode *getMergedCalleeTypeMetadata(const MDNode *A, + const MDNode *B); }; /// Tuple of metadata. diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index f0448b06e7e82..0dbd07f4865dc 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -1303,6 +1303,24 @@ static void addRange(SmallVectorImpl &EndPoints, EndPoints.push_back(High); } +MDNode *MDNode::getMergedCalleeTypeMetadata(const MDNode *A, const MDNode *B) { + // Drop the callee_type metadata if either of the call instructions do not + // have it. + if (!A || !B) + return nullptr; + SmallVector AB; + SmallPtrSet MergedCallees; + auto AddUniqueCallees = [&AB, &MergedCallees](const MDNode *N) { + for (Metadata *MD : N->operands()) { + if (MergedCallees.insert(MD).second) + AB.push_back(MD); + } + }; + AddUniqueCallees(A); + AddUniqueCallees(B); + return MDNode::get(A->getContext(), AB); +} + MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) { // Given two ranges, we want to compute the union of the ranges. 
This // is slightly complicated by having to combine the intervals and merge diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 8c8ed3c5e47ba..9bd573e773610 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -531,6 +531,7 @@ class Verifier : public InstVisitor, VerifierSupport { void visitCallStackMetadata(MDNode *MD); void visitMemProfMetadata(Instruction &I, MDNode *MD); void visitCallsiteMetadata(Instruction &I, MDNode *MD); + void visitCalleeTypeMetadata(Instruction &I, MDNode *MD); void visitDIAssignIDMetadata(Instruction &I, MDNode *MD); void visitMMRAMetadata(Instruction &I, MDNode *MD); void visitAnnotationMetadata(MDNode *Annotation); @@ -5193,6 +5194,33 @@ void Verifier::visitCallsiteMetadata(Instruction &I, MDNode *MD) { visitCallStackMetadata(MD); } +static inline bool isConstantIntMetadataOperand(const Metadata *MD) { + if (auto *VAL = dyn_cast(MD)) + return isa(VAL->getValue()); + return false; +} + +void Verifier::visitCalleeTypeMetadata(Instruction &I, MDNode *MD) { + Check(isa(I), "!callee_type metadata should only exist on calls", + &I); + for (Metadata *Op : MD->operands()) { + Check(isa(Op), + "The callee_type metadata must be a list of type metadata nodes", Op); + auto *TypeMD = cast(Op); + Check(TypeMD->getNumOperands() == 2, + "Well-formed generalized type metadata must contain exactly two " + "operands", + Op); + Check(isConstantIntMetadataOperand(TypeMD->getOperand(0)) && + mdconst::extract(TypeMD->getOperand(0))->isZero(), + "The first operand of type metadata for functions must be zero", Op); + Check(TypeMD->hasGeneralizedMDString(), + "Only generalized type metadata can be part of the callee_type " + "metadata list", + Op); + } +} + void Verifier::visitAnnotationMetadata(MDNode *Annotation) { Check(isa(Annotation), "annotation must be a tuple"); Check(Annotation->getNumOperands() >= 1, @@ -5470,6 +5498,9 @@ void Verifier::visitInstruction(Instruction &I) { if (MDNode *MD = 
I.getMetadata(LLVMContext::MD_callsite)) visitCallsiteMetadata(I, MD); + if (MDNode *MD = I.getMetadata(LLVMContext::MD_callee_type)) + visitCalleeTypeMetadata(I, MD); + if (MDNode *MD = I.getMetadata(LLVMContext::MD_DIAssignID)) visitDIAssignIDMetadata(I, MD); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 3321435a6fecb..d88bc2c4901c7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4352,6 +4352,13 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy)); } + // Drop unnecessary callee_type metadata from calls that were converted + // into direct calls. + if (Call.getMetadata(LLVMContext::MD_callee_type) && !Call.isIndirectCall()) { + Call.setMetadata(LLVMContext::MD_callee_type, nullptr); + Changed = true; + } + // Drop unnecessary kcfi operand bundles from calls that were converted // into direct calls. 
auto Bundle = Call.getOperandBundle(LLVMContext::OB_kcfi); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index d481ad9dee181..7f0c23bd24efb 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3005,6 +3005,12 @@ static void combineMetadata(Instruction *K, const Instruction *J, case LLVMContext::MD_memprof: case LLVMContext::MD_callsite: break; + case LLVMContext::MD_callee_type: + if (!AAOnly) { + K->setMetadata(LLVMContext::MD_callee_type, + MDNode::getMergedCalleeTypeMetadata(KMD, JMD)); + } + break; case LLVMContext::MD_align: if (!AAOnly && (DoesKMove || !K->hasMetadata(LLVMContext::MD_noundef))) K->setMetadata( diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index 7ba95e299c1b1..8d8a60b6918fe 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -987,6 +987,13 @@ void Mapper::remapInstruction(Instruction *I) { "Referenced value not in value map!"); } + // Drop callee_type metadata from calls that were remapped + // into a direct call from an indirect one. + if (auto *CB = dyn_cast(I)) { + if (CB->getMetadata(LLVMContext::MD_callee_type) && !CB->isIndirectCall()) + CB->setMetadata(LLVMContext::MD_callee_type, nullptr); + } + // Remap phi nodes' incoming blocks. if (PHINode *PN = dyn_cast(I)) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { diff --git a/llvm/test/Assembler/callee-type-metadata.ll b/llvm/test/Assembler/callee-type-metadata.ll new file mode 100644 index 0000000000000..9c3cfbe82fc13 --- /dev/null +++ b/llvm/test/Assembler/callee-type-metadata.ll @@ -0,0 +1,21 @@ +;; Test if the callee_type metadata attached to indirect call sites adhere to the expected format. 
+ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s +define i32 @_Z13call_indirectPFicEc(ptr %func, i8 signext %x) !type !0 { +entry: + %func.addr = alloca ptr, align 8 + %x.addr = alloca i8, align 1 + store ptr %func, ptr %func.addr, align 8 + store i8 %x, ptr %x.addr, align 1 + %fptr = load ptr, ptr %func.addr, align 8 + %x_val = load i8, ptr %x.addr, align 1 + ; CHECK: %call = call i32 %fptr(i8 signext %x_val), !callee_type !1 + %call = call i32 %fptr(i8 signext %x_val), !callee_type !1 + ret i32 %call +} + +declare !type !2 i32 @_Z3barc(i8 signext) + +!0 = !{i64 0, !"_ZTSFiPvcE.generalized"} +!1 = !{!2} +!2 = !{i64 0, !"_ZTSFicE.generalized"} diff --git a/llvm/test/Transforms/Inline/drop-callee-type-metadata.ll b/llvm/test/Transforms/Inline/drop-callee-type-metadata.ll new file mode 100644 index 0000000000000..547588089c5b0 --- /dev/null +++ b/llvm/test/Transforms/Inline/drop-callee-type-metadata.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +;; Test if the callee_type metadata is dropped when it is +;; is mapped to a direct function call from an indirect call during inlining. 
+ +; RUN: opt -passes=inline -S < %s | FileCheck %s + +define i32 @_Z13call_indirectPFicEc(ptr %func, i8 %x) !type !0 { +; CHECK-LABEL: define i32 @_Z13call_indirectPFicEc( +; CHECK-SAME: ptr [[FUNC:%.*]], i8 [[X:%.*]]) !type [[META0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 [[FUNC]](i8 [[X]]), !callee_type [[META1:![0-9]+]] +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = call i32 %func(i8 %x), !callee_type !1 + ret i32 %call +} + +define i32 @_Z3barv() !type !3 { +; CHECK-LABEL: define i32 @_Z3barv( +; CHECK-SAME: ) !type [[META3:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL_I:%.*]] = call i32 @_Z3fooc(i8 97) +; CHECK-NEXT: ret i32 [[CALL_I]] +; +entry: + %call = call i32 @_Z13call_indirectPFicEc(ptr nonnull @_Z3fooc, i8 97) + ret i32 %call +} +declare !type !2 i32 @_Z3fooc(i8 signext) + +!0 = !{i64 0, !"_ZTSFiPvcE.generalized"} +!1 = !{!2} +!2 = !{i64 0, !"_ZTSFicE.generalized"} +!3 = !{i64 0, !"_ZTSFivE.generalized"} +;. +; CHECK: [[META0]] = !{i64 0, !"_ZTSFiPvcE.generalized"} +; CHECK: [[META1]] = !{[[META2:![0-9]+]]} +; CHECK: [[META2]] = !{i64 0, !"_ZTSFicE.generalized"} +; CHECK: [[META3]] = !{i64 0, !"_ZTSFivE.generalized"} +;. diff --git a/llvm/test/Transforms/InstCombine/drop-callee-type-metadata.ll b/llvm/test/Transforms/InstCombine/drop-callee-type-metadata.ll new file mode 100644 index 0000000000000..83215f78be1b0 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/drop-callee-type-metadata.ll @@ -0,0 +1,25 @@ +;; Test if the callee_type metadata is dropped when it is attached +;; to a direct function call during instcombine. 
+ +; RUN: opt -passes=instcombine -S < %s | FileCheck %s + +define i32 @_Z3barv() !type !0 { +; CHECK-LABEL: define i32 @_Z3barv( +; CHECK-SAME: ) !type [[META0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CALL:%.*]] = call i32 @_Z3fooc(i8 97){{$}} +; CHECK-NEXT: ret i32 [[CALL]] +; +entry: + %call = call i32 @_Z3fooc(i8 97), !callee_type !1 + ret i32 %call +} + +declare !type !2 i32 @_Z3fooc(i8 signext) + +!0 = !{i64 0, !"_ZTSFivE.generalized"} +!1 = !{!2} +!2 = !{i64 0, !"_ZTSFicE.generalized"} +;. +; CHECK: [[META0]] = !{i64 0, !"_ZTSFivE.generalized"} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/merge-callee-type-metadata.ll b/llvm/test/Transforms/SimplifyCFG/merge-callee-type-metadata.ll new file mode 100644 index 0000000000000..3e56939b1642f --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/merge-callee-type-metadata.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +;; Test if the callee_type metadata is merged correctly. + +; RUN: opt -passes=simplifycfg -S < %s | FileCheck %s + +;; Test if the callee_type metadata is merged correctly when +;; the instructions carry differring callee_type metadata. 
+define ptr @_Z10test_diffb(i1 zeroext %b) { +; CHECK-LABEL: define ptr @_Z10test_diffb( +; CHECK-SAME: i1 zeroext [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr @_Znwm, ptr [[FN]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call ptr [[FN]](i64 4), !callee_type [[META0:![0-9]+]] +; CHECK-NEXT: ret ptr [[CALL]] +; +entry: + %fn = alloca ptr + store ptr @_Znwm, ptr %fn + br i1 %b, label %if.then, label %if.else + +if.then: ; preds = %entry + %call = call ptr %fn(i64 4), !callee_type !4 + br label %if.end + +if.else: ; preds = %entry + %call1 = call ptr %fn(i64 4), !callee_type !3 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ] + ret ptr %x.0 +} + +;; Test if the callee_type metadata is merged correctly when +;; the instructions carry same callee_type metadata. +define ptr @_Z10test_sameb(i1 zeroext %b) { +; CHECK-LABEL: define ptr @_Z10test_sameb( +; CHECK-SAME: i1 zeroext [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr @_Znwm, ptr [[FN]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call ptr [[FN]](i64 4), !callee_type [[META3:![0-9]+]] +; CHECK-NEXT: ret ptr [[CALL]] +; +entry: + %fn = alloca ptr + store ptr @_Znwm, ptr %fn + br i1 %b, label %if.then, label %if.else + +if.then: ; preds = %entry + %call = call ptr %fn(i64 4), !callee_type !3 + br label %if.end + +if.else: ; preds = %entry + %call1 = call ptr %fn(i64 4), !callee_type !3 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ] + ret ptr %x.0 +} + +;; Test if the callee_type metadata is dropped correctly when +;; only the left instruction has callee_type metadata. 
+define ptr @_Z10test_leftb(i1 zeroext %b) { +; CHECK-LABEL: define ptr @_Z10test_leftb( +; CHECK-SAME: i1 zeroext [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr @_Znwm, ptr [[FN]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call ptr [[FN]](i64 4) +; CHECK-NEXT: ret ptr [[CALL]] +; +entry: + %fn = alloca ptr + store ptr @_Znwm, ptr %fn + br i1 %b, label %if.then, label %if.else + +if.then: ; preds = %entry + %call = call ptr %fn(i64 4), !callee_type !4 + br label %if.end + +if.else: ; preds = %entry + %call1 = call ptr %fn(i64 4) + br label %if.end + +if.end: ; preds = %if.else, %if.then + %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ] + ret ptr %x.0 +} + +;; Test if the callee_type metadata is dropped correctly when +;; only the right instruction has callee_type metadata. +define ptr @_Z10test_rightb(i1 zeroext %b) { +; CHECK-LABEL: define ptr @_Z10test_rightb( +; CHECK-SAME: i1 zeroext [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr @_Znwm, ptr [[FN]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call ptr [[FN]](i64 4) +; CHECK-NEXT: ret ptr [[CALL]] +; +entry: + %fn = alloca ptr + store ptr @_Znwm, ptr %fn + br i1 %b, label %if.then, label %if.else + +if.then: ; preds = %entry + %call = call ptr %fn(i64 4) + br label %if.end + +if.else: ; preds = %entry + %call1 = call ptr %fn(i64 4), !callee_type !3 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ] + ret ptr %x.0 +} + +;; Test if the callee_type metadata is merged correctly when +;; each of the callee_type metadata are lists. 
+define ptr @_Z10test_listb(i1 zeroext %b) { +; CHECK-LABEL: define ptr @_Z10test_listb( +; CHECK-SAME: i1 zeroext [[B:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[FN:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr @_Znwm, ptr [[FN]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call ptr [[FN]](i64 4), !callee_type [[META4:![0-9]+]] +; CHECK-NEXT: ret ptr [[CALL]] +; +entry: + %fn = alloca ptr + store ptr @_Znwm, ptr %fn + br i1 %b, label %if.then, label %if.else + +if.then: ; preds = %entry + %call = call ptr %fn(i64 4), !callee_type !6 + br label %if.end + +if.else: ; preds = %entry + %call1 = call ptr %fn(i64 4), !callee_type !5 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %x.0 = phi ptr [ %call, %if.then ], [ %call1, %if.else ] + ret ptr %x.0 +} + +declare ptr @_Znwm(i64) + +!0 = !{i64 0, !"callee_type0.generalized"} +!1 = !{i64 0, !"callee_type1.generalized"} +!2 = !{i64 0, !"callee_type2.generalized"} +!3 = !{!0} +!4 = !{!2} +!5 = !{!1, !2} +!6 = !{!0, !2} +;. +; CHECK: [[META0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{i64 0, !"callee_type2.generalized"} +; CHECK: [[META2]] = !{i64 0, !"callee_type0.generalized"} +; CHECK: [[META3]] = !{[[META2]]} +; CHECK: [[META4]] = !{[[META2]], [[META1]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = !{i64 0, !"callee_type1.generalized"} +;. diff --git a/llvm/test/Verifier/callee-type-metadata.ll b/llvm/test/Verifier/callee-type-metadata.ll new file mode 100644 index 0000000000000..50cf37b941fe9 --- /dev/null +++ b/llvm/test/Verifier/callee-type-metadata.ll @@ -0,0 +1,33 @@ +;; Test if the callee_type metadata attached to indirect call sites adhere to the expected format. 
+ +; RUN: not llvm-as -disable-output < %s 2>&1 | FileCheck %s +define i32 @_Z13call_indirectPFicEc(ptr %func, i8 signext %x) !type !0 { +entry: + %func.addr = alloca ptr, align 8 + %x.addr = alloca i8, align 1 + store ptr %func, ptr %func.addr, align 8 + store i8 %x, ptr %x.addr, align 1 + %fptr = load ptr, ptr %func.addr, align 8 + %x_val = load i8, ptr %x.addr, align 1 + ; CHECK: The callee_type metadata must be a list of type metadata nodes + %call = call i32 %fptr(i8 signext %x_val), !callee_type !0 + ; CHECK: Well-formed generalized type metadata must contain exactly two operands + %call1 = call i32 %fptr(i8 signext %x_val), !callee_type !2 + ; CHECK: The first operand of type metadata for functions must be zero + %call2 = call i32 %fptr(i8 signext %x_val), !callee_type !4 + ; CHECK: The first operand of type metadata for functions must be zero + %call3 = call i32 %fptr(i8 signext %x_val), !callee_type !6 + ; CHECK: Only generalized type metadata can be part of the callee_type metadata list + %call4 = call i32 %fptr(i8 signext %x_val), !callee_type !8 + ret i32 %call +} + +!0 = !{i64 0, !"_ZTSFiPvcE.generalized"} +!1 = !{!"_ZTSFicE"} +!2 = !{!2} +!3 = !{i64 1, !"_ZTSFicE"} +!4 = !{!3} +!5 = !{!"expected_int", !"_ZTSFicE"} +!6 = !{!5} +!7 = !{i64 0, !"_ZTSFicE"} +!8 = !{!7} From c99c213e725adee22c386d2059118a49f0c57054 Mon Sep 17 00:00:00 2001 From: Diego Caballero Date: Fri, 18 Jul 2025 14:43:03 -0700 Subject: [PATCH 413/813] [mlir][Flang][NFC] Replace use of `vector.insertelement/extractelement` (#143272) This PR is part of the last step to remove `vector.extractelement` and `vector.insertelement` ops (RFC: https://discourse.llvm.org/t/rfc-psa-remove-vector-extractelement-and-vector-insertelement-ops-in-favor-of-vector-extract-and-vector-insert-ops). It replaces `vector.insertelement` and `vector.extractelement` with `vector.insert` and `vector.extract` in Flang. It looks like no lit tests are impacted? 
--- flang/include/flang/Optimizer/Support/InitFIR.h | 4 +++- flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp | 11 ++++++++--- flang/lib/Optimizer/CodeGen/CMakeLists.txt | 1 + flang/lib/Optimizer/CodeGen/CodeGen.cpp | 2 ++ 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h index 8bb4791859bb7..aacba233a2b32 100644 --- a/flang/include/flang/Optimizer/Support/InitFIR.h +++ b/flang/include/flang/Optimizer/Support/InitFIR.h @@ -23,6 +23,7 @@ #include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/Func/Extensions/InlinerExtension.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/OpenACC/Transforms/Passes.h" #include "mlir/Dialect/SCF/Transforms/Passes.h" @@ -41,7 +42,8 @@ namespace fir::support { mlir::cf::ControlFlowDialect, mlir::func::FuncDialect, \ mlir::vector::VectorDialect, mlir::math::MathDialect, \ mlir::complex::ComplexDialect, mlir::DLTIDialect, cuf::CUFDialect, \ - mlir::NVVM::NVVMDialect, mlir::gpu::GPUDialect + mlir::NVVM::NVVMDialect, mlir::gpu::GPUDialect, \ + mlir::index::IndexDialect #define FLANG_CODEGEN_DIALECT_LIST FIRCodeGenDialect, mlir::LLVM::LLVMDialect diff --git a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp index 0094ce892d61b..db12c84496b10 100644 --- a/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/PPCIntrinsicCall.cpp @@ -17,6 +17,7 @@ #include "flang/Evaluate/common.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/MutableBox.h" +#include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" namespace fir { @@ -1685,7 +1686,9 @@ PPCIntrinsicLibrary::genVecExtract(mlir::Type resultType, if (!isNativeVecElemOrderOnLE()) uremOp = convertVectorElementOrder(builder, 
loc, vecTyInfo, uremOp); - return builder.create(loc, varg0, uremOp); + mlir::Value index = builder.createOrFold( + loc, builder.getIndexType(), uremOp); + return builder.create(loc, varg0, index); } // VEC_INSERT @@ -1706,8 +1709,10 @@ PPCIntrinsicLibrary::genVecInsert(mlir::Type resultType, if (!isNativeVecElemOrderOnLE()) uremOp = convertVectorElementOrder(builder, loc, vecTyInfo, uremOp); - auto res{builder.create(loc, argBases[0], - varg1, uremOp)}; + mlir::Value index = builder.createOrFold( + loc, builder.getIndexType(), uremOp); + mlir::Value res = + builder.create(loc, argBases[0], varg1, index); return builder.create(loc, vecTyInfo.toFirVectorType(), res); } diff --git a/flang/lib/Optimizer/CodeGen/CMakeLists.txt b/flang/lib/Optimizer/CodeGen/CMakeLists.txt index 16c7944a885a1..d5ea3c7a8e282 100644 --- a/flang/lib/Optimizer/CodeGen/CMakeLists.txt +++ b/flang/lib/Optimizer/CodeGen/CMakeLists.txt @@ -37,6 +37,7 @@ add_flang_library(FIRCodeGen MLIRComplexToROCDLLibraryCalls MLIRComplexToStandard MLIRGPUDialect + MLIRIndexToLLVM MLIRMathToFuncs MLIRMathToLLVM MLIRMathToLibm diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index d879382555c39..609ba27bc212b 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -37,6 +37,7 @@ #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" +#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Conversion/MathToFuncs/MathToFuncs.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" @@ -4224,6 +4225,7 @@ class FIRToLLVMLowering if (!isAMDGCN) mlir::populateMathToLibmConversionPatterns(pattern); mlir::populateComplexToLLVMConversionPatterns(typeConverter, pattern); + mlir::index::populateIndexToLLVMConversionPatterns(typeConverter, 
pattern); mlir::populateVectorToLLVMConversionPatterns(typeConverter, pattern); // Flang specific overloads for OpenMP operations, to allow for special From 2e67dcfdcd023df2f06e0823eeea23990ce41534 Mon Sep 17 00:00:00 2001 From: Kelvin Li Date: Fri, 18 Jul 2025 17:43:15 -0400 Subject: [PATCH 414/813] [flang] update ppc lit tests after using vector.insert and vector.extract (NFC) (#148775) See https://github.com/llvm/llvm-project/pull/143272 --- .../PowerPC/ppc-vec-extract-elem-order.f90 | 5 +- flang/test/Lower/PowerPC/ppc-vec-extract.f90 | 142 ++++++++++-------- .../PowerPC/ppc-vec-insert-elem-order.f90 | 5 +- flang/test/Lower/PowerPC/ppc-vec-insert.f90 | 142 ++++++++++-------- .../PowerPC/ppc-vec-splat-elem-order.f90 | 2 +- flang/test/Lower/PowerPC/ppc-vec-splat.f90 | 124 +++++++-------- 6 files changed, 229 insertions(+), 191 deletions(-) diff --git a/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90 index 73669c25b339e..d7d14581b4b7f 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-extract-elem-order.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -fno-ppc-native-vector-element-order -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR" %s +! RUN: %flang_fc1 -emit-llvm %s -fno-ppc-native-vector-element-order -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR" %s ! REQUIRES: target=powerpc{{.*}} !CHECK-LABEL: vec_extract_testr4i8 @@ -27,6 +27,7 @@ subroutine vec_extract_testi8i1(arg1, arg2, r) ! LLVMIR: %[[arg2:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[arg2]], 2 ! LLVMIR: %[[sub:.*]] = sub i8 1, %[[urem]] -! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[arg1]], i8 %[[sub]] +! LLVMIR: %[[idx:.*]] = zext i8 %[[sub]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[arg1]], i64 %[[idx]] ! 
LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8 end subroutine vec_extract_testi8i1 diff --git a/flang/test/Lower/PowerPC/ppc-vec-extract.f90 b/flang/test/Lower/PowerPC/ppc-vec-extract.f90 index 0f279347b6b75..32c0dcfd66013 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-extract.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-extract.f90 @@ -1,5 +1,5 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s +! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s +! RUN: %flang_fc1 -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s ! REQUIRES: target=powerpc{{.*}} !------------- @@ -19,8 +19,9 @@ subroutine vec_extract_testf32(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i8 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x float> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x float> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[idx]] ! LLVMIR: store float %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i2) @@ -29,8 +30,9 @@ subroutine vec_extract_testf32(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i16 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x float> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x float> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! 
LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[idx]] ! LLVMIR: store float %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i4) @@ -39,18 +41,19 @@ subroutine vec_extract_testf32(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i32 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x float> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x float> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[idx]] ! LLVMIR: store float %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 4 -! LLVMIR-BE: %[[s:.*]] = sub i64 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 4 +! LLVMIR-BE: %[[idx:.*]] = sub i64 3, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 4 +! LLVMIR: %[[r:.*]] = extractelement <4 x float> %[[x]], i64 %[[idx]] ! LLVMIR: store float %[[r]], ptr %{{[0-9]}}, align 4 end subroutine vec_extract_testf32 @@ -68,8 +71,9 @@ subroutine vec_extract_testf64(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i8 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x double> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x double> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[idx]] ! 
LLVMIR: store double %[[r]], ptr %{{[0-9]}}, align 8 r = vec_extract(x, i2) @@ -78,8 +82,9 @@ subroutine vec_extract_testf64(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i16 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x double> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x double> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[idx]] ! LLVMIR: store double %[[r]], ptr %{{[0-9]}}, align 8 @@ -89,18 +94,19 @@ subroutine vec_extract_testf64(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i32 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x double> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x double> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[idx]] ! LLVMIR: store double %[[r]], ptr %{{[0-9]}}, align 8 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 2 -! LLVMIR-BE: %[[s:.*]] = sub i64 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 2 +! LLVMIR-BE: %[[idx:.*]] = sub i64 1, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 2 +! LLVMIR: %[[r:.*]] = extractelement <2 x double> %[[x]], i64 %[[idx]] ! LLVMIR: store double %[[r]], ptr %{{[0-9]}}, align 8 end subroutine vec_extract_testf64 @@ -118,8 +124,9 @@ subroutine vec_extract_testi8(x, i1, i2, i4, i8) ! 
LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i8 15, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[idx]] ! LLVMIR: store i8 %[[r]], ptr %{{[0-9]}}, align 1 r = vec_extract(x, i2) @@ -128,8 +135,9 @@ subroutine vec_extract_testi8(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i16 15, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[idx]] ! LLVMIR: store i8 %[[r]], ptr %{{[0-9]}}, align 1 r = vec_extract(x, i4) @@ -138,18 +146,19 @@ subroutine vec_extract_testi8(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i32 15, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[idx]] ! LLVMIR: store i8 %[[r]], ptr %{{[0-9]}}, align 1 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 16 -! LLVMIR-BE: %[[s:.*]] = sub i64 15, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[u]] -! 
LLVMIR-BE: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 16 +! LLVMIR-BE: %[[idx:.*]] = sub i64 15, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 16 +! LLVMIR: %[[r:.*]] = extractelement <16 x i8> %[[x]], i64 %[[idx]] ! LLVMIR: store i8 %[[r]], ptr %{{[0-9]}}, align 1 end subroutine vec_extract_testi8 @@ -167,8 +176,9 @@ subroutine vec_extract_testi16(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i8 7, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[idx]] ! LLVMIR: store i16 %[[r]], ptr %{{[0-9]}}, align 2 r = vec_extract(x, i2) @@ -177,8 +187,9 @@ subroutine vec_extract_testi16(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i16 7, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[idx]] ! LLVMIR: store i16 %[[r]], ptr %{{[0-9]}}, align 2 r = vec_extract(x, i4) @@ -187,18 +198,19 @@ subroutine vec_extract_testi16(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i32 7, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! 
LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[idx]] ! LLVMIR: store i16 %[[r]], ptr %{{[0-9]}}, align 2 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 8 -! LLVMIR-BE: %[[s:.*]] = sub i64 7, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 8 +! LLVMIR-BE: %[[idx:.*]] = sub i64 7, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 8 +! LLVMIR: %[[r:.*]] = extractelement <8 x i16> %[[x]], i64 %[[idx]] ! LLVMIR: store i16 %[[r]], ptr %{{[0-9]}}, align 2 end subroutine vec_extract_testi16 @@ -216,8 +228,9 @@ subroutine vec_extract_testi32(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i8 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[idx]] ! LLVMIR: store i32 %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i2) @@ -226,8 +239,9 @@ subroutine vec_extract_testi32(x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i16 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[idx]] ! 
LLVMIR: store i32 %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i4) @@ -236,18 +250,19 @@ subroutine vec_extract_testi32(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i32 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[idx]] ! LLVMIR: store i32 %[[r]], ptr %{{[0-9]}}, align 4 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 4 -! LLVMIR-BE: %[[s:.*]] = sub i64 3, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 4 +! LLVMIR-BE: %[[idx:.*]] = sub i64 3, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 4 +! LLVMIR: %[[r:.*]] = extractelement <4 x i32> %[[x]], i64 %[[idx]] ! LLVMIR: store i32 %[[r]], ptr %{{[0-9]}}, align 4 end subroutine vec_extract_testi32 @@ -265,8 +280,9 @@ subroutine vec_extract_testi64(x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[u:.*]] = urem i8 %[[i1]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i8 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i8 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[idx]] ! LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8 r = vec_extract(x, i2) @@ -275,8 +291,9 @@ subroutine vec_extract_testi64(x, i1, i2, i4, i8) ! 
LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[u:.*]] = urem i16 %[[i2]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i16 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i16 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[idx]] ! LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8 r = vec_extract(x, i4) @@ -285,17 +302,18 @@ subroutine vec_extract_testi64(x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[u:.*]] = urem i32 %[[i4]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i32 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i32 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[u]] to i64 +! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[idx]] ! LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8 r = vec_extract(x, i8) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[u:.*]] = urem i64 %[[i8]], 2 -! LLVMIR-BE: %[[s:.*]] = sub i64 1, %[[u]] -! LLVMIR-LE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[u]] -! LLVMIR-BE: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[s]] +! LLVMIR-BE: %[[u:.*]] = urem i64 %[[i8]], 2 +! LLVMIR-BE: %[[idx:.*]] = sub i64 1, %[[u]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 2 +! LLVMIR: %[[r:.*]] = extractelement <2 x i64> %[[x]], i64 %[[idx]] ! 
LLVMIR: store i64 %[[r]], ptr %{{[0-9]}}, align 8 end subroutine vec_extract_testi64 diff --git a/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90 index f64df46f170ab..b30065d74e46b 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-insert-elem-order.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -emit-llvm %s -fno-ppc-native-vector-element-order -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR" %s +! RUN: %flang_fc1 -emit-llvm %s -fno-ppc-native-vector-element-order -triple ppc64le-unknown-linux -o - | FileCheck --check-prefixes="LLVMIR" %s ! REQUIRES: target=powerpc{{.*}} !CHECK-LABEL: vec_insert_testf32i64 @@ -31,6 +31,7 @@ subroutine vec_insert_testi64i8(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 2 ! LLVMIR: %[[sub:.*]] = sub i8 1, %[[urem]] -! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i8 %[[sub]] +! LLVMIR: %[[idx:.*]] = zext i8 %[[sub]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testi64i8 diff --git a/flang/test/Lower/PowerPC/ppc-vec-insert.f90 b/flang/test/Lower/PowerPC/ppc-vec-insert.f90 index dd57fcc67be08..26bc7fc114cec 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-insert.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-insert.f90 @@ -1,5 +1,5 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s +! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s +! 
RUN: %flang_fc1 -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s ! REQUIRES: target=powerpc{{.*}} ! vec_insert @@ -20,8 +20,9 @@ subroutine vec_insert_testf32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i8 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -31,8 +32,9 @@ subroutine vec_insert_testf32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i16 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -42,8 +44,9 @@ subroutine vec_insert_testf32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i32 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! 
LLVMIR: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -51,10 +54,10 @@ subroutine vec_insert_testf32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load float, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 4 -! LLVMIR-BE: %[[s:.*]] = sub i64 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 4 +! LLVMIR-BE: %[[idx:.*]] = sub i64 3, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 4 +! LLVMIR: %[[r:.*]] = insertelement <4 x float> %[[x]], float %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x float> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testf32 @@ -74,8 +77,9 @@ subroutine vec_insert_testf64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i8 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -85,8 +89,9 @@ subroutine vec_insert_testf64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i16 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i16 %[[urem]] -! 
LLVMIR-BE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -96,8 +101,9 @@ subroutine vec_insert_testf64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i32 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -105,10 +111,10 @@ subroutine vec_insert_testf64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load double, ptr %{{[0-9]}}, align 8 ! LLVMIR: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 2 -! LLVMIR-BE: %[[s:.*]] = sub i64 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 2 +! LLVMIR-BE: %[[idx:.*]] = sub i64 1, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 2 +! LLVMIR: %[[r:.*]] = insertelement <2 x double> %[[x]], double %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x double> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testf64 @@ -128,8 +134,9 @@ subroutine vec_insert_testi8(v, x, i1, i2, i4, i8) ! 
LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i8 15, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[idx]] ! LLVMIR: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -139,8 +146,9 @@ subroutine vec_insert_testi8(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i16 15, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[idx]] ! LLVMIR: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -150,8 +158,9 @@ subroutine vec_insert_testi8(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 16 ! LLVMIR-BE: %[[s:.*]] = sub i32 15, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[idx]] ! LLVMIR: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -159,10 +168,10 @@ subroutine vec_insert_testi8(v, x, i1, i2, i4, i8) ! 
LLVMIR: %[[v:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 16 -! LLVMIR-BE: %[[s:.*]] = sub i64 15, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 16 +! LLVMIR-BE: %[[idx:.*]] = sub i64 15, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 16 +! LLVMIR: %[[r:.*]] = insertelement <16 x i8> %[[x]], i8 %[[v]], i64 %[[idx]] ! LLVMIR: store <16 x i8> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testi8 @@ -182,8 +191,9 @@ subroutine vec_insert_testi16(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i8 7, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[idx]] ! LLVMIR: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -193,8 +203,9 @@ subroutine vec_insert_testi16(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i16 7, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[idx]] ! 
LLVMIR: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -204,8 +215,9 @@ subroutine vec_insert_testi16(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 8 ! LLVMIR-BE: %[[s:.*]] = sub i32 7, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[idx]] ! LLVMIR: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -213,10 +225,10 @@ subroutine vec_insert_testi16(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 8 -! LLVMIR-BE: %[[s:.*]] = sub i64 7, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 8 +! LLVMIR-BE: %[[idx:.*]] = sub i64 7, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 8 +! LLVMIR: %[[r:.*]] = insertelement <8 x i16> %[[x]], i16 %[[v]], i64 %[[idx]] ! LLVMIR: store <8 x i16> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testi16 @@ -236,8 +248,9 @@ subroutine vec_insert_testi32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i8 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! 
LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -247,8 +260,9 @@ subroutine vec_insert_testi32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i16 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -258,8 +272,9 @@ subroutine vec_insert_testi32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 4 ! LLVMIR-BE: %[[s:.*]] = sub i32 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -267,10 +282,10 @@ subroutine vec_insert_testi32(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 4 -! LLVMIR-BE: %[[s:.*]] = sub i64 3, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[urem]] -! 
LLVMIR-BE: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 4 +! LLVMIR-BE: %[[idx:.*]] = sub i64 3, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 4 +! LLVMIR: %[[r:.*]] = insertelement <4 x i32> %[[x]], i32 %[[v]], i64 %[[idx]] ! LLVMIR: store <4 x i32> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testi32 @@ -290,8 +305,9 @@ subroutine vec_insert_testi64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i1:.*]] = load i8, ptr %{{[0-9]}}, align 1 ! LLVMIR: %[[urem:.*]] = urem i8 %[[i1]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i8 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i8 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i8 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i8 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i8 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i2) @@ -301,8 +317,9 @@ subroutine vec_insert_testi64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i2:.*]] = load i16, ptr %{{[0-9]}}, align 2 ! LLVMIR: %[[urem:.*]] = urem i16 %[[i2]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i16 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i16 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i16 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i16 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i16 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i4) @@ -312,8 +329,9 @@ subroutine vec_insert_testi64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[i4:.*]] = load i32, ptr %{{[0-9]}}, align 4 ! LLVMIR: %[[urem:.*]] = urem i32 %[[i4]], 2 ! LLVMIR-BE: %[[s:.*]] = sub i32 1, %[[urem]] -! 
LLVMIR-LE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i32 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i32 %[[s]] +! LLVMIR-BE: %[[idx:.*]] = zext i32 %[[s]] to i64 +! LLVMIR-LE: %[[idx:.*]] = zext i32 %[[urem]] to i64 +! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16 r = vec_insert(v, x, i8) @@ -321,9 +339,9 @@ subroutine vec_insert_testi64(v, x, i1, i2, i4, i8) ! LLVMIR: %[[v:.*]] = load i64, ptr %{{[0-9]}}, align 8 ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 ! LLVMIR: %[[i8:.*]] = load i64, ptr %{{[0-9]}}, align 8 -! LLVMIR: %[[urem:.*]] = urem i64 %[[i8]], 2 -! LLVMIR-BE: %[[s:.*]] = sub i64 1, %[[urem]] -! LLVMIR-LE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[urem]] -! LLVMIR-BE: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[s]] +! LLVMIR-BE: %[[urem:.*]] = urem i64 %[[i8]], 2 +! LLVMIR-BE: %[[idx:.*]] = sub i64 1, %[[urem]] +! LLVMIR-LE: %[[idx:.*]] = urem i64 %[[i8]], 2 +! LLVMIR: %[[r:.*]] = insertelement <2 x i64> %[[x]], i64 %[[v]], i64 %[[idx]] ! LLVMIR: store <2 x i64> %[[r]], ptr %{{[0-9]}}, align 16 end subroutine vec_insert_testi64 diff --git a/flang/test/Lower/PowerPC/ppc-vec-splat-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-splat-elem-order.f90 index 50604e1f720f3..ca8c0c3f6f1d8 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-splat-elem-order.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-splat-elem-order.f90 @@ -19,7 +19,7 @@ subroutine vec_splat_testu8i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i16 15 +! LLVMIR: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! 
LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 diff --git a/flang/test/Lower/PowerPC/ppc-vec-splat.f90 b/flang/test/Lower/PowerPC/ppc-vec-splat.f90 index f3c1f19d5877d..55614c75d1ad8 100644 --- a/flang/test/Lower/PowerPC/ppc-vec-splat.f90 +++ b/flang/test/Lower/PowerPC/ppc-vec-splat.f90 @@ -1,5 +1,5 @@ -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s -! RUN: %flang_fc1 -flang-experimental-hlfir -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s +! RUN: %flang_fc1 -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-LE" %s +! RUN: %flang_fc1 -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefixes="LLVMIR","LLVMIR-BE" %s ! REQUIRES: target=powerpc{{.*}} !---------------- @@ -12,8 +12,8 @@ subroutine vec_splat_testi8i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i8 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -25,8 +25,8 @@ subroutine vec_splat_testi8i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i16 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! 
LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -38,8 +38,8 @@ subroutine vec_splat_testi8i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i32 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -64,8 +64,8 @@ subroutine vec_splat_testi16i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i8 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -77,8 +77,8 @@ subroutine vec_splat_testi16i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i16 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! 
LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -90,8 +90,8 @@ subroutine vec_splat_testi16i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i32 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -116,8 +116,8 @@ subroutine vec_splat_testi32i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i8 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -129,8 +129,8 @@ subroutine vec_splat_testi32i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i16 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! 
LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -142,8 +142,8 @@ subroutine vec_splat_testi32i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i32 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -168,8 +168,8 @@ subroutine vec_splat_testi64i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i8 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 @@ -181,8 +181,8 @@ subroutine vec_splat_testi64i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i16 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! 
LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 @@ -194,8 +194,8 @@ subroutine vec_splat_testi64i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i32 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 @@ -220,8 +220,8 @@ subroutine vec_splat_testf32i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i8 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x float> poison, float %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x float> %[[ins]], <4 x float> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x float> %[[y]], ptr %{{[0-9]}}, align 16 @@ -233,8 +233,8 @@ subroutine vec_splat_testf32i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i16 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x float> poison, float %[[ele]], i32 0 ! 
LLVMIR: %[[y:.*]] = shufflevector <4 x float> %[[ins]], <4 x float> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x float> %[[y]], ptr %{{[0-9]}}, align 16 @@ -246,8 +246,8 @@ subroutine vec_splat_testf32i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <4 x float>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i32 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x float> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x float> poison, float %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x float> %[[ins]], <4 x float> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x float> %[[y]], ptr %{{[0-9]}}, align 16 @@ -272,8 +272,8 @@ subroutine vec_splat_testf64i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i8 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x double> poison, double %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x double> %[[ins]], <2 x double> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x double> %[[y]], ptr %{{[0-9]}}, align 16 @@ -285,8 +285,8 @@ subroutine vec_splat_testf64i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i16 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 1 ! 
LLVMIR: %[[ins:.*]] = insertelement <2 x double> poison, double %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x double> %[[ins]], <2 x double> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x double> %[[y]], ptr %{{[0-9]}}, align 16 @@ -298,8 +298,8 @@ subroutine vec_splat_testf64i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <2 x double>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i32 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x double> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x double> poison, double %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x double> %[[ins]], <2 x double> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x double> %[[y]], ptr %{{[0-9]}}, align 16 @@ -324,8 +324,8 @@ subroutine vec_splat_testu8i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i8 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -337,8 +337,8 @@ subroutine vec_splat_testu8i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i16 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! 
LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -350,8 +350,8 @@ subroutine vec_splat_testu8i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <16 x i8>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i32 15 +! LLVMIR-LE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <16 x i8> %[[x]], i64 15 ! LLVMIR: %[[ins:.*]] = insertelement <16 x i8> poison, i8 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <16 x i8> %[[ins]], <16 x i8> poison, <16 x i32> zeroinitializer ! LLVMIR: store <16 x i8> %[[y]], ptr %{{[0-9]}}, align 16 @@ -376,8 +376,8 @@ subroutine vec_splat_testu16i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i8 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -389,8 +389,8 @@ subroutine vec_splat_testu16i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i16 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! 
LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -402,8 +402,8 @@ subroutine vec_splat_testu16i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <8 x i16>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i32 7 +! LLVMIR-LE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <8 x i16> %[[x]], i64 7 ! LLVMIR: %[[ins:.*]] = insertelement <8 x i16> poison, i16 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <8 x i16> %[[ins]], <8 x i16> poison, <8 x i32> zeroinitializer ! LLVMIR: store <8 x i16> %[[y]], ptr %{{[0-9]}}, align 16 @@ -428,8 +428,8 @@ subroutine vec_splat_testu32i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i8 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -441,8 +441,8 @@ subroutine vec_splat_testu32i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i16 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! 
LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -454,8 +454,8 @@ subroutine vec_splat_testu32i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <4 x i32>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i32 3 +! LLVMIR-LE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <4 x i32> %[[x]], i64 3 ! LLVMIR: %[[ins:.*]] = insertelement <4 x i32> poison, i32 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <4 x i32> %[[ins]], <4 x i32> poison, <4 x i32> zeroinitializer ! LLVMIR: store <4 x i32> %[[y]], ptr %{{[0-9]}}, align 16 @@ -480,8 +480,8 @@ subroutine vec_splat_testu64i8(x) y = vec_splat(x, 0_1) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i8 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i8 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 @@ -493,8 +493,8 @@ subroutine vec_splat_testu64i16(x) y = vec_splat(x, 0_2) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i16 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i16 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! 
LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 @@ -506,8 +506,8 @@ subroutine vec_splat_testu64i32(x) y = vec_splat(x, 0_4) ! LLVMIR: %[[x:.*]] = load <2 x i64>, ptr %{{[0-9]}}, align 16 -! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i32 0 -! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i32 1 +! LLVMIR-LE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 0 +! LLVMIR-BE: %[[ele:.*]] = extractelement <2 x i64> %[[x]], i64 1 ! LLVMIR: %[[ins:.*]] = insertelement <2 x i64> poison, i64 %[[ele]], i32 0 ! LLVMIR: %[[y:.*]] = shufflevector <2 x i64> %[[ins]], <2 x i64> poison, <2 x i32> zeroinitializer ! LLVMIR: store <2 x i64> %[[y]], ptr %{{[0-9]}}, align 16 From 20c5daa032ca5e4c655d1248ef9cbf8a33183aaf Mon Sep 17 00:00:00 2001 From: lntue Date: Fri, 18 Jul 2025 17:54:47 -0400 Subject: [PATCH 415/813] [libc] Fix conflicting symbols when shared/math.h is included. (#149591) --- libc/src/__support/math/exp.h | 19 ++++++++++--------- libc/src/__support/math/exp10.h | 19 ++++++++++--------- libc/src/__support/math/exp10f_utils.h | 6 +++--- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/libc/src/__support/math/exp.h b/libc/src/__support/math/exp.h index a538df1e825dc..ff59ff79e3381 100644 --- a/libc/src/__support/math/exp.h +++ b/libc/src/__support/math/exp.h @@ -40,11 +40,11 @@ static constexpr double LOG2_E = 0x1.71547652b82fep+0; // Error bounds: // Errors when using double precision. -static constexpr double ERR_D = 0x1.8p-63; +static constexpr double EXP_ERR_D = 0x1.8p-63; #ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS // Errors when using double-double precision. 
-static constexpr double ERR_DD = 0x1.0p-99; +static constexpr double EXP_ERR_DD = 0x1.0p-99; #endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS // -2^-12 * log(2) @@ -387,7 +387,8 @@ static double exp(double x) { #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS if (LIBC_UNLIKELY(denorm)) { - return ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D) + return ziv_test_denorm(hi, exp_mid.hi, lo, + EXP_ERR_D) .value(); } else { // to multiply by 2^hi, a fast way is to simply add hi to the exponent @@ -399,12 +400,12 @@ static double exp(double x) { } #else if (LIBC_UNLIKELY(denorm)) { - if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); + if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, EXP_ERR_D); LIBC_LIKELY(r.has_value())) return r.value(); } else { - double upper = exp_mid.hi + (lo + ERR_D); - double lower = exp_mid.hi + (lo - ERR_D); + double upper = exp_mid.hi + (lo + EXP_ERR_D); + double lower = exp_mid.hi + (lo - EXP_ERR_D); if (LIBC_LIKELY(upper == lower)) { // to multiply by 2^hi, a fast way is to simply add hi to the exponent @@ -419,12 +420,12 @@ static double exp(double x) { DoubleDouble r_dd = exp_double_double(x, kd, exp_mid); if (LIBC_UNLIKELY(denorm)) { - if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); + if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, EXP_ERR_DD); LIBC_LIKELY(r.has_value())) return r.value(); } else { - double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); - double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); + double upper_dd = r_dd.hi + (r_dd.lo + EXP_ERR_DD); + double lower_dd = r_dd.hi + (r_dd.lo - EXP_ERR_DD); if (LIBC_LIKELY(upper_dd == lower_dd)) { int64_t exp_hi = static_cast(hi) << FPBits::FRACTION_LEN; diff --git a/libc/src/__support/math/exp10.h b/libc/src/__support/math/exp10.h index 88748523deb3d..fa60e40c43e5d 100644 --- a/libc/src/__support/math/exp10.h +++ b/libc/src/__support/math/exp10.h @@ -54,11 +54,11 @@ static constexpr double MLOG10_2_EXP2_M12_LO = 0x1.da994fd20dba2p-87; // Error bounds: // Errors when using double precision. 
-constexpr double ERR_D = 0x1.8p-63; +constexpr double EXP10_ERR_D = 0x1.8p-63; #ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS // Errors when using double-double precision. -static constexpr double ERR_DD = 0x1.8p-99; +static constexpr double EXP10_ERR_DD = 0x1.8p-99; #endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS // Polynomial approximations with double precision. Generated by Sollya with: @@ -207,17 +207,18 @@ static double exp10_denorm(double x) { double lo = fputil::multiply_add(p, mid_lo, exp_mid.lo); #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - return ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D) + return ziv_test_denorm(hi, exp_mid.hi, lo, + EXP10_ERR_D) .value(); #else - if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, ERR_D); + if (auto r = ziv_test_denorm(hi, exp_mid.hi, lo, EXP10_ERR_D); LIBC_LIKELY(r.has_value())) return r.value(); // Use double-double DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); - if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, ERR_DD); + if (auto r = ziv_test_denorm(hi, r_dd.hi, r_dd.lo, EXP10_ERR_DD); LIBC_LIKELY(r.has_value())) return r.value(); @@ -409,8 +410,8 @@ static constexpr double exp10(double x) { cpp::bit_cast(exp_hi + cpp::bit_cast(exp_mid.hi + lo)); return r; #else - double upper = exp_mid.hi + (lo + ERR_D); - double lower = exp_mid.hi + (lo - ERR_D); + double upper = exp_mid.hi + (lo + EXP10_ERR_D); + double lower = exp_mid.hi + (lo - EXP10_ERR_D); if (LIBC_LIKELY(upper == lower)) { // To multiply by 2^hi, a fast way is to simply add hi to the exponent @@ -476,8 +477,8 @@ static constexpr double exp10(double x) { // Use double-double DoubleDouble r_dd = exp10_double_double(x, kd, exp_mid); - double upper_dd = r_dd.hi + (r_dd.lo + ERR_DD); - double lower_dd = r_dd.hi + (r_dd.lo - ERR_DD); + double upper_dd = r_dd.hi + (r_dd.lo + EXP10_ERR_DD); + double lower_dd = r_dd.hi + (r_dd.lo - EXP10_ERR_DD); if (LIBC_LIKELY(upper_dd == lower_dd)) { // To multiply by 2^hi, a fast way is to simply add hi to the exponent diff --git 
a/libc/src/__support/math/exp10f_utils.h b/libc/src/__support/math/exp10f_utils.h index 0493e1b993e0c..c30def9d62db2 100644 --- a/libc/src/__support/math/exp10f_utils.h +++ b/libc/src/__support/math/exp10f_utils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H -#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_UTILS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_UTILS_H #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" @@ -154,4 +154,4 @@ LIBC_INLINE static constexpr exp_b_reduc_t exp_b_range_reduc(float x) { } // namespace LIBC_NAMESPACE_DECL -#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP_FLOAT_CONSTANTS_H +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP10F_UTILS_H From 28c1433cf3cb3a4a7a4cb0e9488e04b2699c72d6 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Jul 2025 14:58:51 -0700 Subject: [PATCH 416/813] [Vectorize] Fix a warning This patch fixes: llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp:726:13: error: unused variable 'RedPhiRK' [-Werror,-Wunused-variable] --- llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 1a614c3c12119..ba1f9aad6a9c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -726,6 +726,7 @@ bool VPlanTransforms::handleMaxMinNumReductionsWithoutFastMath(VPlan &Plan) { RecurKind RedPhiRK = RedPhiR->getRecurrenceKind(); assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) && "unsupported reduction"); + (void)RedPhiRK; /// Check if the vector loop of \p Plan can early exit and restart /// execution of last vector iteration in the scalar loop. 
This requires all From b9aa06f897efafc2b8c9c1de45bdfe987aadb879 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 18 Jul 2025 14:59:53 -0700 Subject: [PATCH 417/813] [llvm] Improve grammar and punctuation of LLVM Language Reference Manual (#149553) --- llvm/docs/LangRef.rst | 48 +++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index be5f7fbd90b5e..9a32f0cd15dd3 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -410,7 +410,7 @@ added in the future: calling convention: on most platforms, they are not preserved and need to be saved by the caller, but on Windows, xmm6-xmm15 are preserved. - - On AArch64 the callee preserve all general purpose registers, except + - On AArch64 the callee preserves all general purpose registers, except X0-X8 and X16-X18. Not allowed with ``nest``. The idea behind this convention is to support calls to runtime functions @@ -425,10 +425,10 @@ added in the future: on the hot path and definitely executed a lot. Furthermore `preserve_mostcc` doesn't prevent the inliner from inlining the function call. - This calling convention will be used by a future version of the ObjectiveC + This calling convention will be used by a future version of the Objective-C runtime and should therefore still be considered experimental at this time. Although this convention was created to optimize certain runtime calls to - the ObjectiveC runtime, it is not limited to this runtime and might be used + the Objective-C runtime, it is not limited to this runtime and might be used by other runtimes in the future too. The current implementation only supports X86-64, but the intention is to support more architectures in the future. @@ -455,14 +455,14 @@ added in the future: that don't need to call out to any other functions. 
This calling convention, like the `PreserveMost` calling convention, will be - used by a future version of the ObjectiveC runtime and should be considered + used by a future version of the Objective-C runtime and should be considered experimental at this time. "``preserve_nonecc``" - The `PreserveNone` calling convention This calling convention doesn't preserve any general registers. So all general registers are caller saved registers. It also uses all general registers to pass arguments. This attribute doesn't impact non-general purpose registers (e.g. floating point registers, on X86 XMMs/YMMs). - Non-general purpose registers still follow the standard c calling + Non-general purpose registers still follow the standard C calling convention. Currently it is for x86_64 and AArch64 only. "``cxx_fast_tlscc``" - The `CXX_FAST_TLS` calling convention for access functions Clang generates an access function to access C++-style Thread Local Storage @@ -513,7 +513,7 @@ added in the future: - On AArch64 the target address is passed in X15. "``cc ``" - Numbered convention Any calling convention may be specified by number, allowing - target-specific calling conventions to be used. Target specific + target-specific calling conventions to be used. Target-specific calling conventions start at 64. More calling conventions can be added/defined on an as-needed basis, to @@ -559,7 +559,7 @@ DLL Storage Classes ------------------- All Global Variables, Functions and Aliases can have one of the following -DLL storage class: +DLL storage classes: ``dllimport`` "``dllimport``" causes the compiler to reference a function or variable via @@ -569,7 +569,7 @@ DLL storage class: ``dllexport`` On Microsoft Windows targets, "``dllexport``" causes the compiler to provide a global pointer to a pointer in a DLL, so that it can be referenced with the - ``dllimport`` attribute. the pointer name is formed by combining ``__imp_`` + ``dllimport`` attribute. 
The pointer name is formed by combining ``__imp_`` and the function or variable name. On XCOFF targets, ``dllexport`` indicates that the symbol will be made visible to other modules using "exported" visibility and thus placed by the linker in the loader section symbol table. @@ -586,7 +586,7 @@ Thread Local Storage Models --------------------------- A variable may be defined as ``thread_local``, which means that it will -not be shared by threads (each thread will have a separated copy of the +not be shared by threads (each thread will have a separate copy of the variable). Not all targets support thread-local variables. Optionally, a TLS model may be specified: @@ -606,10 +606,10 @@ be used. The target may choose a different TLS model if the specified model is not supported, or if a better choice of model can be made. A model can also be specified in an alias, but then it only governs how -the alias is accessed. It will not have any effect in the aliasee. +the alias is accessed. It will not have any effect on the aliasee. For platforms without linker support of ELF TLS model, the -femulated-tls -flag can be used to generate GCC compatible emulated TLS code. +flag can be used to generate GCC-compatible emulated TLS code. .. _runtime_preemption_model: @@ -750,7 +750,7 @@ is zero. The address space qualifier must precede any other attributes. LLVM allows an explicit section to be specified for globals. If the target supports it, it will emit globals to the section specified. -Additionally, the global can placed in a comdat if the target has the necessary +Additionally, the global can be placed in a comdat if the target has the necessary support. External declarations may have an explicit section specified. Section @@ -1316,7 +1316,7 @@ Currently, only the following parameter attributes are defined: must be cleared off with :ref:`llvm.stackrestore `. - The inalloca attribute requires a type argument. + The ``inalloca`` attribute requires a type argument. 
See :doc:`InAlloca` for more information on how to use this attribute. @@ -1328,7 +1328,7 @@ Currently, only the following parameter attributes are defined: loads and stores to the structure may be assumed by the callee not to trap and to be properly aligned. - The sret type argument specifies the in memory type. + The sret type argument specifies the in-memory type. A function that accepts an ``sret`` argument must return ``void``. A return value may not be ``sret``. @@ -1397,7 +1397,7 @@ Currently, only the following parameter attributes are defined: pointer. This is not a valid attribute for return values. This attribute applies only to the particular copy of the pointer passed in this argument. - The arguments of ``captures`` is a list of captured pointer components, + The arguments of ``captures`` are a list of captured pointer components, which may be ``none``, or a combination of: - ``address``: The integral address of the pointer. @@ -1429,7 +1429,7 @@ Currently, only the following parameter attributes are defined: is null is captured in some other way. ``nofree`` - This indicates that callee does not free the pointer argument. This is not + This indicates that the callee does not free the pointer argument. This is not a valid attribute for return values. .. _nest: @@ -1545,7 +1545,7 @@ Currently, only the following parameter attributes are defined: (matching the supported types for :ref:`fast-math flags `). The test mask has the same format as the second argument to the :ref:`llvm.is.fpclass `, and indicates which classes - of floating-point values are not permitted for the value. For example + of floating-point values are not permitted for the value. For example, a bitmask of 3 indicates the parameter may not be a NaN. If the value is a floating-point class indicated by the @@ -1783,7 +1783,7 @@ string: define void @f() gc "name" { ... 
} -The supported values of *name* includes those :ref:`built in to LLVM +The supported values of *name* include those :ref:`built in to LLVM ` and any provided by loaded plugins. Specifying a GC strategy will cause the compiler to alter its output in order to support the named garbage collection algorithm. Note that LLVM itself does not contain a @@ -2056,9 +2056,9 @@ For example: ``hot`` This attribute indicates that this function is a hot spot of the program execution. The function will be optimized more aggressively and will be - placed into special subsection of the text section to improving locality. + placed into a special subsection of the text section to improve locality. - When profile feedback is enabled, this attribute has the precedence over + When profile feedback is enabled, this attribute takes precedence over the profile information. By marking a function ``hot``, users can work around the cases where the training input does not have good coverage on all the hot functions. @@ -2162,10 +2162,10 @@ For example: and on function declarations and definitions. ``nocallback`` This attribute indicates that the function is only allowed to jump back into - caller's module by a return or an exception, and is not allowed to jump back + the caller's module by a return or an exception, and is not allowed to jump back by invoking a callback function, a direct, possibly transitive, external function call, use of ``longjmp``, or other means. It is a compiler hint that - is used at module level to improve dataflow analysis, dropped during linking, + is used at the module level to improve dataflow analysis, dropped during linking, and has no effect on functions defined in the current module. ``nodivergencesource`` A call to this function is not a source of divergence. In uniformity @@ -2297,7 +2297,7 @@ For example: in address-space 0 is considered to be a valid address for memory loads and stores. 
Any analysis or optimization should not treat dereferencing a pointer to ``null`` as undefined behavior in this function. - Note: Comparing address of a global variable to ``null`` may still + Note: Comparing the address of a global variable to ``null`` may still evaluate to false because of a limitation in querying this attribute inside constant expressions. ``optdebug`` @@ -2370,7 +2370,7 @@ For example: This attribute controls the behavior of stack probes: either the ``"probe-stack"`` attribute, or ABI-required stack probes, if any. It defines the size of the guard region. It ensures that if the function - may use more stack space than the size of the guard region, stack probing + may use more stack space than the size of the guard region, a stack probing sequence will be emitted. It takes one required integer value, which is 4096 by default. From 7b5d8a02d21e5309a4a48eb3d699b75c53144492 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Fri, 18 Jul 2025 15:06:50 -0700 Subject: [PATCH 418/813] Revert "[NFC][profdata] Apply lints and other format fixes" (#149601) Reverts llvm/llvm-project#149433 This broke the hwasan buildbot: https://lab.llvm.org/buildbot/#/builders/55/builds/14455 --- llvm/tools/llvm-profdata/llvm-profdata.cpp | 120 +++++++++++---------- 1 file changed, 62 insertions(+), 58 deletions(-) diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index 96d135b9746ff..207ae2ddd4cf2 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -16,6 +16,7 @@ #include "llvm/Debuginfod/HTTPClient.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Object/Binary.h" +#include "llvm/ProfileData/DataAccessProf.h" #include "llvm/ProfileData/InstrProfCorrelator.h" #include "llvm/ProfileData/InstrProfReader.h" #include "llvm/ProfileData/InstrProfWriter.h" @@ -52,23 +53,23 @@ using ProfCorrelatorKind = InstrProfCorrelator::ProfCorrelatorKind; // 
https://llvm.org/docs/CommandGuide/llvm-profdata.html has documentations // on each subcommand. -static cl::SubCommand ShowSubcommand( +cl::SubCommand ShowSubcommand( "show", "Takes a profile data file and displays the profiles. See detailed " "documentation in " "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-show"); -static cl::SubCommand OrderSubcommand( +cl::SubCommand OrderSubcommand( "order", "Reads temporal profiling traces from a profile and outputs a function " "order that reduces the number of page faults for those traces. See " "detailed documentation in " "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-order"); -static cl::SubCommand OverlapSubcommand( +cl::SubCommand OverlapSubcommand( "overlap", "Computes and displays the overlap between two profiles. See detailed " "documentation in " "https://llvm.org/docs/CommandGuide/llvm-profdata.html#profdata-overlap"); -static cl::SubCommand MergeSubcommand( +cl::SubCommand MergeSubcommand( "merge", "Takes several profiles and merge them together. See detailed " "documentation in " @@ -91,11 +92,12 @@ enum class ShowFormat { Text, Json, Yaml }; } // namespace // Common options. -static cl::opt - OutputFilename("output", cl::value_desc("output"), cl::init("-"), - cl::desc("Output file"), cl::sub(ShowSubcommand), - cl::sub(OrderSubcommand), cl::sub(OverlapSubcommand), - cl::sub(MergeSubcommand)); +cl::opt OutputFilename("output", cl::value_desc("output"), + cl::init("-"), cl::desc("Output file"), + cl::sub(ShowSubcommand), + cl::sub(OrderSubcommand), + cl::sub(OverlapSubcommand), + cl::sub(MergeSubcommand)); // NOTE: cl::alias must not have cl::sub(), since aliased option's cl::sub() // will be used. llvm::cl::alias::done() method asserts this condition. 
static cl::alias OutputFilenameA("o", cl::desc("Alias for --output"), @@ -525,9 +527,9 @@ static void exitWithError(Twine Message, StringRef Whence = "", static void exitWithError(Error E, StringRef Whence = "") { if (E.isA()) { handleAllErrors(std::move(E), [&](const InstrProfError &IPE) { - instrprof_error InstrError = IPE.get(); + instrprof_error instrError = IPE.get(); StringRef Hint = ""; - if (InstrError == instrprof_error::unrecognized_format) { + if (instrError == instrprof_error::unrecognized_format) { // Hint in case user missed specifying the profile type. Hint = "Perhaps you forgot to use the --sample or --memory option?"; } @@ -634,7 +636,7 @@ class SymbolRemapper { return New.empty() ? Name : FunctionId(New); } }; -} // namespace +} struct WeightedFile { std::string Filename; @@ -824,18 +826,18 @@ loadInput(const WeightedFile &Input, SymbolRemapper *Remapper, // Only show hint the first time an error occurs. auto [ErrCode, Msg] = InstrProfError::take(std::move(E)); std::unique_lock ErrGuard{WC->ErrLock}; - bool FirstTime = WC->WriterErrorCodes.insert(ErrCode).second; + bool firstTime = WC->WriterErrorCodes.insert(ErrCode).second; handleMergeWriterError(make_error(ErrCode, Msg), - Input.Filename, FuncName, FirstTime); + Input.Filename, FuncName, firstTime); }); } if (KeepVTableSymbols) { - const InstrProfSymtab &Symtab = Reader->getSymtab(); - const auto &VTableNames = Symtab.getVTableNames(); + const InstrProfSymtab &symtab = Reader->getSymtab(); + const auto &VTableNames = symtab.getVTableNames(); - for (const auto &KV : VTableNames) - WC->Writer.addVTableName(KV.getKey()); + for (const auto &kv : VTableNames) + WC->Writer.addVTableName(kv.getKey()); } if (Reader->hasTemporalProfile()) { @@ -876,8 +878,8 @@ static void mergeWriterContexts(WriterContext *Dst, WriterContext *Src) { Dst->Writer.mergeRecordsFromWriter(std::move(Src->Writer), [&](Error E) { auto [ErrorCode, Msg] = InstrProfError::take(std::move(E)); std::unique_lock 
ErrGuard{Dst->ErrLock}; - bool FirstTime = Dst->WriterErrorCodes.insert(ErrorCode).second; - if (FirstTime) + bool firstTime = Dst->WriterErrorCodes.insert(ErrorCode).second; + if (firstTime) warn(toString(make_error(ErrorCode, Msg))); }); } @@ -887,22 +889,24 @@ getFuncName(const StringMap::value_type &Val) { return Val.first(); } -static std::string getFuncName(const SampleProfileMap::value_type &Val) { +static std::string +getFuncName(const SampleProfileMap::value_type &Val) { return Val.second.getContext().toString(); } -template static void filterFunctions(T &ProfileMap) { - bool HasFilter = !FuncNameFilter.empty(); - bool HasNegativeFilter = !FuncNameNegativeFilter.empty(); - if (!HasFilter && !HasNegativeFilter) +template +static void filterFunctions(T &ProfileMap) { + bool hasFilter = !FuncNameFilter.empty(); + bool hasNegativeFilter = !FuncNameNegativeFilter.empty(); + if (!hasFilter && !hasNegativeFilter) return; // If filter starts with '?' it is MSVC mangled name, not a regex. llvm::Regex ProbablyMSVCMangledName("[?@$_0-9A-Za-z]+"); - if (HasFilter && FuncNameFilter[0] == '?' && + if (hasFilter && FuncNameFilter[0] == '?' && ProbablyMSVCMangledName.match(FuncNameFilter)) FuncNameFilter = llvm::Regex::escape(FuncNameFilter); - if (HasNegativeFilter && FuncNameNegativeFilter[0] == '?' && + if (hasNegativeFilter && FuncNameNegativeFilter[0] == '?' 
&& ProbablyMSVCMangledName.match(FuncNameNegativeFilter)) FuncNameNegativeFilter = llvm::Regex::escape(FuncNameNegativeFilter); @@ -910,9 +914,9 @@ template static void filterFunctions(T &ProfileMap) { llvm::Regex Pattern(FuncNameFilter); llvm::Regex NegativePattern(FuncNameNegativeFilter); std::string Error; - if (HasFilter && !Pattern.isValid(Error)) + if (hasFilter && !Pattern.isValid(Error)) exitWithError(Error); - if (HasNegativeFilter && !NegativePattern.isValid(Error)) + if (hasNegativeFilter && !NegativePattern.isValid(Error)) exitWithError(Error); // Handle MD5 profile, so it is still able to match using the original name. @@ -924,10 +928,10 @@ template static void filterFunctions(T &ProfileMap) { auto Tmp = I++; const auto &FuncName = getFuncName(*Tmp); // Negative filter has higher precedence than positive filter. - if ((HasNegativeFilter && + if ((hasNegativeFilter && (NegativePattern.match(FuncName) || (FunctionSamples::UseMD5 && NegativeMD5Name == FuncName))) || - (HasFilter && !(Pattern.match(FuncName) || + (hasFilter && !(Pattern.match(FuncName) || (FunctionSamples::UseMD5 && MD5Name == FuncName)))) ProfileMap.erase(Tmp); } @@ -1188,7 +1192,7 @@ adjustInstrProfile(std::unique_ptr &WC, StringMap StaticFuncMap; InstrProfSummaryBuilder IPBuilder(ProfileSummaryBuilder::DefaultCutoffs); - auto CheckSampleProfileHasFUnique = [&Reader]() { + auto checkSampleProfileHasFUnique = [&Reader]() { for (const auto &PD : Reader->getProfiles()) { auto &FContext = PD.second.getContext(); if (FContext.toString().find(FunctionSamples::UniqSuffix) != @@ -1199,9 +1203,9 @@ adjustInstrProfile(std::unique_ptr &WC, return false; }; - bool SampleProfileHasFUnique = CheckSampleProfileHasFUnique(); + bool SampleProfileHasFUnique = checkSampleProfileHasFUnique(); - auto BuildStaticFuncMap = [&StaticFuncMap, + auto buildStaticFuncMap = [&StaticFuncMap, SampleProfileHasFUnique](const StringRef Name) { std::string FilePrefixes[] = {".cpp", "cc", ".c", ".hpp", ".h"}; size_t 
PrefixPos = StringRef::npos; @@ -1361,7 +1365,7 @@ adjustInstrProfile(std::unique_ptr &WC, InstrProfRecord *R = &PD.getValue().begin()->second; StringRef FullName = PD.getKey(); InstrProfileMap[FullName] = InstrProfileEntry(R); - BuildStaticFuncMap(FullName); + buildStaticFuncMap(FullName); } for (auto &PD : Reader->getProfiles()) { @@ -1492,8 +1496,8 @@ remapSamples(const sampleprof::FunctionSamples &Samples, BodySample.second.getSamples()); for (const auto &Target : BodySample.second.getCallTargets()) { Result.addCalledTargetSamples(BodySample.first.LineOffset, - MaskedDiscriminator, Remapper(Target.first), - Target.second); + MaskedDiscriminator, + Remapper(Target.first), Target.second); } } for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) { @@ -1754,7 +1758,7 @@ static void parseInputFilenamesFile(MemoryBuffer *Buffer, if (SanitizedEntry.starts_with("#")) continue; // If there's no comma, it's an unweighted profile. - if (!SanitizedEntry.contains(',')) + else if (!SanitizedEntry.contains(',')) addWeightedInput(WFV, {std::string(SanitizedEntry), 1}); else addWeightedInput(WFV, parseWeightedFile(SanitizedEntry)); @@ -2735,11 +2739,10 @@ std::error_code SampleOverlapAggregator::loadProfiles() { return std::error_code(); } -static void overlapSampleProfile(const std::string &BaseFilename, - const std::string &TestFilename, - const OverlapFuncFilters &FuncFilter, - uint64_t SimilarityCutoff, - raw_fd_ostream &OS) { +void overlapSampleProfile(const std::string &BaseFilename, + const std::string &TestFilename, + const OverlapFuncFilters &FuncFilter, + uint64_t SimilarityCutoff, raw_fd_ostream &OS) { using namespace sampleprof; // We use 0.000005 to initialize OverlapAggr.Epsilon because the final metrics @@ -2870,7 +2873,7 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { OS << ":ir\n"; for (const auto &Func : *Reader) { - if (IsIRInstr) { + if (Reader->isIRLevelProfile()) { bool FuncIsCS = 
NamedInstrProfRecord::hasCSFlagInHash(Func.Hash); if (FuncIsCS != ShowCS) continue; @@ -2878,7 +2881,9 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { bool Show = ShowAllFunctions || (!FuncNameFilter.empty() && Func.Name.contains(FuncNameFilter)); - if (Show && TextFormat) { + bool doTextFormatDump = (Show && TextFormat); + + if (doTextFormatDump) { InstrProfSymtab &Symtab = Reader->getSymtab(); InstrProfWriter::writeRecordInText(Func.Name, Func.Hash, Func, Symtab, OS); @@ -2916,9 +2921,9 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { continue; } - for (const auto &Count : Func.Counts) { - FuncMax = std::max(FuncMax, Count); - FuncSum += Count; + for (size_t I = 0, E = Func.Counts.size(); I < E; ++I) { + FuncMax = std::max(FuncMax, Func.Counts[I]); + FuncSum += Func.Counts[I]; } if (FuncMax < ShowValueCutoff) { @@ -2928,8 +2933,7 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { << " Sum = " << FuncSum << ")\n"; } continue; - } - if (OnlyListBelow) + } else if (OnlyListBelow) continue; if (TopNFunctions || ShowHotFuncList) @@ -2996,8 +3000,9 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { if (TextFormat || ShowCovered) return 0; std::unique_ptr PS(Builder.getSummary()); - OS << "Instrumentation level: " << (IsIRInstr ? "IR" : "Front-end"); - if (IsIRInstr) { + bool IsIR = Reader->isIRLevelProfile(); + OS << "Instrumentation level: " << (IsIR ? 
"IR" : "Front-end"); + if (IsIR) { OS << " entry_first = " << Reader->instrEntryBBEnabled(); OS << " instrument_loop_entries = " << Reader->instrLoopEntriesEnabled(); } @@ -3065,10 +3070,10 @@ static int showInstrProfile(ShowFormat SFormat, raw_fd_ostream &OS) { auto &Traces = Reader->getTemporalProfTraces(); OS << "Temporal Profile Traces (samples=" << Traces.size() << " seen=" << Reader->getTemporalProfTraceStreamSize() << "):\n"; - for (auto [Index, Trace] : llvm::enumerate(Traces)) { - OS << " Temporal Profile Trace " << Index << " (weight=" << Trace.Weight - << " count=" << Trace.FunctionNameRefs.size() << "):\n"; - for (auto &NameRef : Trace.FunctionNameRefs) + for (unsigned i = 0; i < Traces.size(); i++) { + OS << " Temporal Profile Trace " << i << " (weight=" << Traces[i].Weight + << " count=" << Traces[i].FunctionNameRefs.size() << "):\n"; + for (auto &NameRef : Traces[i].FunctionNameRefs) OS << " " << Reader->getSymtab().getFuncOrVarName(NameRef) << "\n"; } } @@ -3381,8 +3386,7 @@ static int show_main(StringRef ProgName) { exitWithErrorCode(EC, OutputFilename); if (ShowAllFunctions && !FuncNameFilter.empty()) - WithColor::warning() - << "-function argument ignored: showing all functions\n"; + WithColor::warning() << "-function argument ignored: showing all functions\n"; if (!DebugInfoFilename.empty()) return showDebugInfoCorrelation(DebugInfoFilename, SFormat, OS); From 7c57b559a4c40599b4ec8ac7a638ed151f24fc8c Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Fri, 18 Jul 2025 17:43:46 -0500 Subject: [PATCH 419/813] [lldb][scripts] Fix framework script unifdef test (#149607) Fixes a test that's failing on LLDB GreenDragon due to a mistake in the arguments used when calling the framework-header-fix script. 
--- lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test index a7e82d2f3640c..ba18b4b41d3a0 100644 --- a/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test +++ b/lldb/test/Shell/Scripts/TestFrameworkFixUnifdef.test @@ -1,7 +1,7 @@ # REQUIRES: system-darwin # Create a temp dir for output and run the framework fix script on the truncated version of SBAddress.h in the inputs dir. RUN: mkdir -p %t/Outputs -RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef USWIG +RUN: %python %p/../../../scripts/framework-header-fix.py -f lldb_main -i %p/Inputs/Main/SBAddress.h -o %t/Outputs/SBAddress.h -p /usr/bin/unifdef --unifdef_guards USWIG # Check the output RUN: cat %t/Outputs/SBAddress.h | FileCheck %s From 09bea21d9507e142d2ff6a5068362bdc824fce54 Mon Sep 17 00:00:00 2001 From: lonely eagle <2020382038@qq.com> Date: Sat, 19 Jul 2025 07:11:10 +0800 Subject: [PATCH 420/813] [mlir][memref] Simplify memref.copy canonicalization (#149506) FoldCopyOfCast has both a OpRewritePattern implementation and a folder implementation. This PR removes the OpRewritePattern implementation. --- mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp | 67 +++++------------------- 1 file changed, 14 insertions(+), 53 deletions(-) diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp index d1a9920aa66c5..51c813682ce25 100644 --- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp +++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp @@ -715,51 +715,6 @@ OpFoldResult CastOp::fold(FoldAdaptor adaptor) { //===----------------------------------------------------------------------===// namespace { -/// If the source/target of a CopyOp is a CastOp that does not modify the shape -/// and element type, the cast can be skipped. 
Such CastOps only cast the layout -/// of the type. -struct FoldCopyOfCast : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(CopyOp copyOp, - PatternRewriter &rewriter) const override { - bool modified = false; - - // Check source. - if (auto castOp = copyOp.getSource().getDefiningOp()) { - auto fromType = llvm::dyn_cast(castOp.getSource().getType()); - auto toType = llvm::dyn_cast(castOp.getSource().getType()); - - if (fromType && toType) { - if (fromType.getShape() == toType.getShape() && - fromType.getElementType() == toType.getElementType()) { - rewriter.modifyOpInPlace(copyOp, [&] { - copyOp.getSourceMutable().assign(castOp.getSource()); - }); - modified = true; - } - } - } - - // Check target. - if (auto castOp = copyOp.getTarget().getDefiningOp()) { - auto fromType = llvm::dyn_cast(castOp.getSource().getType()); - auto toType = llvm::dyn_cast(castOp.getSource().getType()); - - if (fromType && toType) { - if (fromType.getShape() == toType.getShape() && - fromType.getElementType() == toType.getElementType()) { - rewriter.modifyOpInPlace(copyOp, [&] { - copyOp.getTargetMutable().assign(castOp.getSource()); - }); - modified = true; - } - } - } - - return success(modified); - } -}; /// Fold memref.copy(%x, %x). struct FoldSelfCopy : public OpRewritePattern { @@ -797,22 +752,28 @@ struct FoldEmptyCopy final : public OpRewritePattern { void CopyOp::getCanonicalizationPatterns(RewritePatternSet &results, MLIRContext *context) { - results.add(context); + results.add(context); } -LogicalResult CopyOp::fold(FoldAdaptor adaptor, - SmallVectorImpl &results) { - /// copy(memrefcast) -> copy - bool folded = false; - Operation *op = *this; +/// If the source/target of a CopyOp is a CastOp that does not modify the shape +/// and element type, the cast can be skipped. Such CastOps only cast the layout +/// of the type. 
+static LogicalResult FoldCopyOfCast(CopyOp op) { for (OpOperand &operand : op->getOpOperands()) { auto castOp = operand.get().getDefiningOp(); if (castOp && memref::CastOp::canFoldIntoConsumerOp(castOp)) { operand.set(castOp.getOperand()); - folded = true; + return success(); } } - return success(folded); + return failure(); +} + +LogicalResult CopyOp::fold(FoldAdaptor adaptor, + SmallVectorImpl &results) { + + /// copy(memrefcast) -> copy + return FoldCopyOfCast(*this); } //===----------------------------------------------------------------------===// From 68fd102598a27e2654c0ced9c122c601795097fe Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 18 Jul 2025 16:15:38 -0700 Subject: [PATCH 421/813] [lldb] Use std::make_shared for StopInfoSP (#149612) Use std::make_shared to create a StopInfoSP, which inherits from shared_from_this. It's both the most efficient and safest way to create these objects: - With make_shared, the object and the control block are allocated together, which is more efficient. - With make_shared, the enable_shared_from_this base class is properly linked to the control block before the constructor finishes, so shared_from_this() will be safe to use (though still not recommended during construction). 
--- lldb/source/Target/StopInfo.cpp | 37 +++++++++++++++++---------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index 19f89b8246926..ddf8c62e969ed 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -851,8 +851,9 @@ class StopInfoWatchpoint : public StopInfo { // We have to step over the watchpoint before we know what to do: StopInfoWatchpointSP me_as_siwp_sp = std::static_pointer_cast(shared_from_this()); - ThreadPlanSP step_over_wp_sp(new ThreadPlanStepOverWatchpoint( - *(thread_sp.get()), me_as_siwp_sp, wp_sp)); + ThreadPlanSP step_over_wp_sp = + std::make_shared(*(thread_sp.get()), + me_as_siwp_sp, wp_sp); // When this plan is done we want to stop, so set this as a Controlling // plan. step_over_wp_sp->SetIsControllingPlan(true); @@ -1475,13 +1476,13 @@ StopInfoSP StopInfo::CreateStopReasonWithBreakpointSiteID(Thread &thread, break_id_t break_id) { thread.SetThreadHitBreakpointSite(); - return StopInfoSP(new StopInfoBreakpoint(thread, break_id)); + return std::make_shared(thread, break_id); } StopInfoSP StopInfo::CreateStopReasonWithBreakpointSiteID(Thread &thread, break_id_t break_id, bool should_stop) { - return StopInfoSP(new StopInfoBreakpoint(thread, break_id, should_stop)); + return std::make_shared(thread, break_id, should_stop); } // LWP_TODO: We'll need a CreateStopReasonWithWatchpointResourceID akin @@ -1489,67 +1490,67 @@ StopInfoSP StopInfo::CreateStopReasonWithBreakpointSiteID(Thread &thread, StopInfoSP StopInfo::CreateStopReasonWithWatchpointID(Thread &thread, break_id_t watch_id, bool silently_continue) { - return StopInfoSP( - new StopInfoWatchpoint(thread, watch_id, silently_continue)); + return std::make_shared(thread, watch_id, + silently_continue); } StopInfoSP StopInfo::CreateStopReasonWithSignal(Thread &thread, int signo, const char *description, std::optional code) { 
thread.GetProcess()->GetUnixSignals()->IncrementSignalHitCount(signo); - return StopInfoSP(new StopInfoUnixSignal(thread, signo, description, code)); + return std::make_shared(thread, signo, description, code); } StopInfoSP StopInfo::CreateStopReasonWithInterrupt(Thread &thread, int signo, const char *description) { - return StopInfoSP(new StopInfoInterrupt(thread, signo, description)); + return std::make_shared(thread, signo, description); } StopInfoSP StopInfo::CreateStopReasonToTrace(Thread &thread) { - return StopInfoSP(new StopInfoTrace(thread)); + return std::make_shared(thread); } StopInfoSP StopInfo::CreateStopReasonWithPlan( ThreadPlanSP &plan_sp, ValueObjectSP return_valobj_sp, ExpressionVariableSP expression_variable_sp) { - return StopInfoSP(new StopInfoThreadPlan(plan_sp, return_valobj_sp, - expression_variable_sp)); + return std::make_shared(plan_sp, return_valobj_sp, + expression_variable_sp); } StopInfoSP StopInfo::CreateStopReasonWithException(Thread &thread, const char *description) { - return StopInfoSP(new StopInfoException(thread, description)); + return std::make_shared(thread, description); } StopInfoSP StopInfo::CreateStopReasonProcessorTrace(Thread &thread, const char *description) { - return StopInfoSP(new StopInfoProcessorTrace(thread, description)); + return std::make_shared(thread, description); } StopInfoSP StopInfo::CreateStopReasonHistoryBoundary(Thread &thread, const char *description) { - return StopInfoSP(new StopInfoHistoryBoundary(thread, description)); + return std::make_shared(thread, description); } StopInfoSP StopInfo::CreateStopReasonWithExec(Thread &thread) { - return StopInfoSP(new StopInfoExec(thread)); + return std::make_shared(thread); } StopInfoSP StopInfo::CreateStopReasonFork(Thread &thread, lldb::pid_t child_pid, lldb::tid_t child_tid) { - return StopInfoSP(new StopInfoFork(thread, child_pid, child_tid)); + return std::make_shared(thread, child_pid, child_tid); } StopInfoSP StopInfo::CreateStopReasonVFork(Thread 
&thread, lldb::pid_t child_pid, lldb::tid_t child_tid) { - return StopInfoSP(new StopInfoVFork(thread, child_pid, child_tid)); + return std::make_shared(thread, child_pid, child_tid); } StopInfoSP StopInfo::CreateStopReasonVForkDone(Thread &thread) { - return StopInfoSP(new StopInfoVForkDone(thread)); + return std::make_shared(thread); } ValueObjectSP StopInfo::GetReturnValueObject(StopInfoSP &stop_info_sp) { From fef42382882fdd094add0f404d872aa8f4282ea9 Mon Sep 17 00:00:00 2001 From: Colin De Vlieghere Date: Fri, 18 Jul 2025 16:53:11 -0700 Subject: [PATCH 422/813] [MLIR][SCF] Add dedicated Python bindings for ForallOp (#149416) This patch specializes the Python bindings for ForallOp and InParallelOp, similar to the existing one for ForOp. These bindings create the regions and blocks properly and expose some additional helpers. --- mlir/python/mlir/dialects/scf.py | 119 ++++++++++++++++++++++++++++++- mlir/test/python/dialects/scf.py | 20 ++++++ 2 files changed, 138 insertions(+), 1 deletion(-) diff --git a/mlir/python/mlir/dialects/scf.py b/mlir/python/mlir/dialects/scf.py index 2d0047b76c702..678ceeebac204 100644 --- a/mlir/python/mlir/dialects/scf.py +++ b/mlir/python/mlir/dialects/scf.py @@ -17,7 +17,7 @@ except ImportError as e: raise RuntimeError("Error loading imports from extension module") from e -from typing import Optional, Sequence, Union +from typing import List, Optional, Sequence, Tuple, Union @_ods_cext.register_operation(_Dialect, replace=True) @@ -71,6 +71,123 @@ def inner_iter_args(self): return self.body.arguments[1:] +def _dispatch_index_op_fold_results( + ofrs: Sequence[Union[Operation, OpView, Value, int]], +) -> Tuple[List[Value], List[int]]: + """`mlir::dispatchIndexOpFoldResults`""" + dynamic_vals = [] + static_vals = [] + for ofr in ofrs: + if isinstance(ofr, (Operation, OpView, Value)): + val = _get_op_result_or_value(ofr) + dynamic_vals.append(val) + static_vals.append(ShapedType.get_dynamic_size()) + else: + static_vals.append(ofr) + 
return dynamic_vals, static_vals + + +@_ods_cext.register_operation(_Dialect, replace=True) +class ForallOp(ForallOp): + """Specialization for the SCF forall op class.""" + + def __init__( + self, + lower_bounds: Sequence[Union[Operation, OpView, Value, int]], + upper_bounds: Sequence[Union[Operation, OpView, Value, int]], + steps: Sequence[Union[Value, int]], + shared_outs: Optional[Union[Operation, OpView, Sequence[Value]]] = None, + *, + mapping=None, + loc=None, + ip=None, + ): + """Creates an SCF `forall` operation. + + - `lower_bounds` are the values to use as lower bounds of the loop. + - `upper_bounds` are the values to use as upper bounds of the loop. + - `steps` are the values to use as loop steps. + - `shared_outs` is a list of additional loop-carried arguments or an operation + producing them as results. + """ + assert ( + len(lower_bounds) == len(upper_bounds) == len(steps) + ), "Mismatch in length of lower bounds, upper bounds, and steps" + if shared_outs is None: + shared_outs = [] + shared_outs = _get_op_results_or_values(shared_outs) + + dynamic_lbs, static_lbs = _dispatch_index_op_fold_results(lower_bounds) + dynamic_ubs, static_ubs = _dispatch_index_op_fold_results(upper_bounds) + dynamic_steps, static_steps = _dispatch_index_op_fold_results(steps) + + results = [arg.type for arg in shared_outs] + super().__init__( + results, + dynamic_lbs, + dynamic_ubs, + dynamic_steps, + static_lbs, + static_ubs, + static_steps, + shared_outs, + mapping=mapping, + loc=loc, + ip=ip, + ) + rank = len(static_lbs) + iv_types = [IndexType.get()] * rank + self.regions[0].blocks.append(*iv_types, *results) + + @property + def body(self) -> Block: + """Returns the body (block) of the loop.""" + return self.regions[0].blocks[0] + + @property + def rank(self) -> int: + """Returns the number of induction variables the loop has.""" + return len(self.staticLowerBound) + + @property + def induction_variables(self) -> BlockArgumentList: + """Returns the induction variables 
usable within the loop.""" + return self.body.arguments[: self.rank] + + @property + def inner_iter_args(self) -> BlockArgumentList: + """Returns the loop-carried arguments usable within the loop. + + To obtain the loop-carried operands, use `iter_args`. + """ + return self.body.arguments[self.rank :] + + def terminator(self) -> InParallelOp: + """ + Returns the loop terminator if it exists. + Otherwise, creates a new one. + """ + ops = self.body.operations + with InsertionPoint(self.body): + if not ops: + return InParallelOp() + last = ops[len(ops) - 1] + return last if isinstance(last, InParallelOp) else InParallelOp() + + +@_ods_cext.register_operation(_Dialect, replace=True) +class InParallelOp(InParallelOp): + """Specialization of the SCF forall.in_parallel op class.""" + + def __init__(self, loc=None, ip=None): + super().__init__(loc=loc, ip=ip) + self.region.blocks.append() + + @property + def block(self) -> Block: + return self.region.blocks[0] + + @_ods_cext.register_operation(_Dialect, replace=True) class IfOp(IfOp): """Specialization for the SCF if op class.""" diff --git a/mlir/test/python/dialects/scf.py b/mlir/test/python/dialects/scf.py index de61f4613868f..62d11d5e189c8 100644 --- a/mlir/test/python/dialects/scf.py +++ b/mlir/test/python/dialects/scf.py @@ -18,6 +18,26 @@ def constructAndPrintInModule(f): return f +# CHECK-LABEL: TEST: testSimpleForall +# CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (4, 8) shared_outs(%[[BOUND_ARG:.*]] = %{{.*}}) -> (tensor<4x8xf32>) +# CHECK: arith.addi %[[IV0]], %[[IV1]] +# CHECK: scf.forall.in_parallel +@constructAndPrintInModule +def testSimpleForall(): + f32 = F32Type.get() + tensor_type = RankedTensorType.get([4, 8], f32) + + @func.FuncOp.from_py_func(tensor_type) + def forall_loop(tensor): + loop = scf.ForallOp([0, 0], [4, 8], [1, 1], [tensor]) + with InsertionPoint(loop.body): + i, j = loop.induction_variables + arith.addi(i, j) + loop.terminator() + # The verifier will check that the regions have been 
created properly. + assert loop.verify() + + # CHECK-LABEL: TEST: testSimpleLoop @constructAndPrintInModule def testSimpleLoop(): From a5d6fa68e399dee9eb56f2671670085b26c06b4a Mon Sep 17 00:00:00 2001 From: Jens Reidel Date: Sat, 19 Jul 2025 02:01:44 +0200 Subject: [PATCH 423/813] [compiler-rt][Mips] Fix stat size check on mips64 musl (#143301) The sizes of the struct stat on MIPS64 differ in musl vs glibc. See https://godbolt.org/z/qf9bcq8Y8 for the proof. Prior to this change, compilation for MIPS64 musl would fail. Signed-off-by: Jens Reidel --- .../lib/sanitizer_common/sanitizer_platform_limits_posix.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h index a2b6c37d5450c..0d1273821d655 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h @@ -102,6 +102,8 @@ const unsigned struct_kernel_stat_sz = SANITIZER_ANDROID ? FIRST_32_SECOND_64(104, 128) # if defined(_ABIN32) && _MIPS_SIM == _ABIN32 : FIRST_32_SECOND_64(176, 216); +# elif SANITIZER_MUSL + : FIRST_32_SECOND_64(160, 208); # else : FIRST_32_SECOND_64(160, 216); # endif From 2f38ced51b7c560dcb6d01180efe5ab22bbe004f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 19 Jul 2025 09:15:48 +0900 Subject: [PATCH 424/813] StringMap: Remove redundant member init in constructor (#149491) These are already zeroinitialized in the field definitions. 
--- llvm/lib/Support/StringMap.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Support/StringMap.cpp b/llvm/lib/Support/StringMap.cpp index 432e1fc343f1f..3432dc15ceef2 100644 --- a/llvm/lib/Support/StringMap.cpp +++ b/llvm/lib/Support/StringMap.cpp @@ -45,23 +45,15 @@ static inline unsigned *getHashTable(StringMapEntryBase **TheTable, uint32_t StringMapImpl::hash(StringRef Key) { return xxh3_64bits(Key); } -StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) { - ItemSize = itemSize; - +StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) + : ItemSize(itemSize) { // If a size is specified, initialize the table with that many buckets. if (InitSize) { // The table will grow when the number of entries reach 3/4 of the number of // buckets. To guarantee that "InitSize" number of entries can be inserted // in the table without growing, we allocate just what is needed here. init(getMinBucketToReserveForEntries(InitSize)); - return; } - - // Otherwise, initialize it with zero buckets to avoid the allocation. - TheTable = nullptr; - NumBuckets = 0; - NumItems = 0; - NumTombstones = 0; } void StringMapImpl::init(unsigned InitSize) { From b3c9ed151f18fbbfe027cf93dd7957f36bcbaccf Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Fri, 18 Jul 2025 18:49:49 -0700 Subject: [PATCH 425/813] [NVPTX][test] regenerate some tests broken by successive changes (#149611) #149393 and #149571 landed in quick succession requiring some tests to be regenerated to account for their interactions. 
--- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 3 ++- llvm/test/CodeGen/NVPTX/f32x2-instructions.ll | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index a077ca17e4215..8918fbd8c6f3b 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -1504,7 +1504,8 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %r2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %r1; ; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll index 2109449fa586c..30afd690452eb 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll @@ -2116,7 +2116,8 @@ define void @test_trunc_to_v2bf16(<2 x float> %a, ptr %p) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2bf16_param_1]; -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2bf16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2bf16_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rn.bf16x2.f32 %r3, %r2, %r1; ; CHECK-NEXT: st.b32 [%rd2], %r3; ; CHECK-NEXT: ret; @@ -2133,7 +2134,8 @@ define void @test_trunc_to_v2f16(<2 x float> %a, ptr %p) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd2, [test_trunc_to_v2f16_param_1]; -; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_to_v2f16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_trunc_to_v2f16_param_0]; +; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: cvt.rn.f16x2.f32 %r3, %r2, 
%r1; ; CHECK-NEXT: st.b32 [%rd2], %r3; ; CHECK-NEXT: ret; From cfddb401db111c53f0a345c2a590974487a96bb9 Mon Sep 17 00:00:00 2001 From: Muhammad Bassiouni <60100307+bassiounix@users.noreply.github.com> Date: Sat, 19 Jul 2025 05:21:34 +0300 Subject: [PATCH 426/813] [libc][math] Refactor acos implementation to header-only in src/__support/math folder. (#148409) Part of #147386 in preparation for: https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450 --- libc/shared/math.h | 1 + libc/shared/math/acos.h | 23 ++ libc/src/__support/math/CMakeLists.txt | 33 ++ libc/src/__support/math/acos.h | 285 ++++++++++++++++++ .../generic => __support/math}/asin_utils.h | 34 +-- libc/src/math/generic/CMakeLists.txt | 28 +- libc/src/math/generic/acos.cpp | 266 +--------------- libc/src/math/generic/asin.cpp | 2 +- .../llvm-project-overlay/libc/BUILD.bazel | 39 +++ 9 files changed, 403 insertions(+), 308 deletions(-) create mode 100644 libc/shared/math/acos.h create mode 100644 libc/src/__support/math/acos.h rename libc/src/{math/generic => __support/math}/asin_utils.h (96%) diff --git a/libc/shared/math.h b/libc/shared/math.h index 26f69d6fa43ea..8dcfaf0352339 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -11,6 +11,7 @@ #include "libc_common.h" +#include "math/acos.h" #include "math/exp.h" #include "math/exp10.h" #include "math/exp10f.h" diff --git a/libc/shared/math/acos.h b/libc/shared/math/acos.h new file mode 100644 index 0000000000000..73c6b512e16f4 --- /dev/null +++ b/libc/shared/math/acos.h @@ -0,0 +1,23 @@ +//===-- Shared acos function ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_ACOS_H +#define LLVM_LIBC_SHARED_MATH_ACOS_H + +#include "shared/libc_common.h" +#include "src/__support/math/acos.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::acos; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SHARED_MATH_ACOS_H diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 77a47c65489dd..4a29c2975d523 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -1,3 +1,36 @@ +add_header_library( + acos + HDRS + acos.h + DEPENDS + .asin_utils + libc.src.__support.math.asin_utils + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.sqrt + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.types + libc.src.__support.macros.properties.cpu_features +) + +add_header_library( + asin_utils + HDRS + asin_utils.h + DEPENDS + libc.src.__support.integer_literals + libc.src.__support.FPUtil.double_double + libc.src.__support.FPUtil.dyadic_float + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.macros.optimization +) + add_header_library( exp_float_constants HDRS diff --git a/libc/src/__support/math/acos.h b/libc/src/__support/math/acos.h new file mode 100644 index 0000000000000..a7287f11aa302 --- /dev/null +++ b/libc/src/__support/math/acos.h @@ -0,0 +1,285 @@ +//===-- Implementation header for acos --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ACOS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_ACOS_H + +#include "asin_utils.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/double_double.h" +#include "src/__support/FPUtil/dyadic_float.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY +#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +using DoubleDouble = fputil::DoubleDouble; +using Float128 = fputil::DyadicFloat<128>; + +static constexpr double acos(double x) { + using FPBits = fputil::FPBits; + + FPBits xbits(x); + int x_exp = xbits.get_biased_exponent(); + + // |x| < 0.5. + if (x_exp < FPBits::EXP_BIAS - 1) { + // |x| < 2^-55. + if (LIBC_UNLIKELY(x_exp < FPBits::EXP_BIAS - 55)) { + // When |x| < 2^-55, acos(x) = pi/2 +#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) + return PI_OVER_TWO.hi; +#else + // Force the evaluation and prevent constant propagation so that it + // is rounded correctly for FE_UPWARD rounding mode. 
+ return (xbits.abs().get_val() + 0x1.0p-160) + PI_OVER_TWO.hi; +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + } + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + // acos(x) = pi/2 - asin(x) + // = pi/2 - x * P(x^2) + double p = asin_eval(x * x); + return PI_OVER_TWO.hi + fputil::multiply_add(-x, p, PI_OVER_TWO.lo); +#else + unsigned idx = 0; + DoubleDouble x_sq = fputil::exact_mult(x, x); + double err = xbits.abs().get_val() * 0x1.0p-51; + // Polynomial approximation: + // p ~ asin(x)/x + DoubleDouble p = asin_eval(x_sq, idx, err); + // asin(x) ~ x * p + DoubleDouble r0 = fputil::exact_mult(x, p.hi); + // acos(x) = pi/2 - asin(x) + // ~ pi/2 - x * p + // = pi/2 - x * (p.hi + p.lo) + double r_hi = fputil::multiply_add(-x, p.hi, PI_OVER_TWO.hi); + // Use Dekker's 2SUM algorithm to compute the lower part. + double r_lo = ((PI_OVER_TWO.hi - r_hi) - r0.hi) - r0.lo; + r_lo = fputil::multiply_add(-x, p.lo, r_lo + PI_OVER_TWO.lo); + + // Ziv's accuracy test. + + double r_upper = r_hi + (r_lo + err); + double r_lower = r_hi + (r_lo - err); + + if (LIBC_LIKELY(r_upper == r_lower)) + return r_upper; + + // Ziv's accuracy test failed, perform 128-bit calculation. + + // Recalculate mod 1/64. + idx = static_cast(fputil::nearest_integer(x_sq.hi * 0x1.0p6)); + + // Get x^2 - idx/64 exactly. When FMA is available, double-double + // multiplication will be correct for all rounding modes. Otherwise we use + // Float128 directly. + Float128 x_f128(x); + +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE + // u = x^2 - idx/64 + Float128 u_hi( + fputil::multiply_add(static_cast(idx), -0x1.0p-6, x_sq.hi)); + Float128 u = fputil::quick_add(u_hi, Float128(x_sq.lo)); +#else + Float128 x_sq_f128 = fputil::quick_mul(x_f128, x_f128); + Float128 u = fputil::quick_add( + x_sq_f128, Float128(static_cast(idx) * (-0x1.0p-6))); +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE + + Float128 p_f128 = asin_eval(u, idx); + // Flip the sign of x_f128 to perform subtraction. 
+ x_f128.sign = x_f128.sign.negate(); + Float128 r = + fputil::quick_add(PI_OVER_TWO_F128, fputil::quick_mul(x_f128, p_f128)); + + return static_cast(r); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS + } + // |x| >= 0.5 + + double x_abs = xbits.abs().get_val(); + + // Maintaining the sign: + constexpr double SIGN[2] = {1.0, -1.0}; + double x_sign = SIGN[xbits.is_neg()]; + // |x| >= 1 + if (LIBC_UNLIKELY(x_exp >= FPBits::EXP_BIAS)) { + // x = +-1, asin(x) = +- pi/2 + if (x_abs == 1.0) { + // x = 1, acos(x) = 0, + // x = -1, acos(x) = pi + return x == 1.0 ? 0.0 : fputil::multiply_add(-x_sign, PI.hi, PI.lo); + } + // |x| > 1, return NaN. + if (xbits.is_quiet_nan()) + return x; + + // Set domain error for non-NaN input. + if (!xbits.is_nan()) + fputil::set_errno_if_required(EDOM); + + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + // When |x| >= 0.5, we perform range reduction as follow: + // + // When 0.5 <= x < 1, let: + // y = acos(x) + // We will use the double angle formula: + // cos(2y) = 1 - 2 sin^2(y) + // and the complement angle identity: + // x = cos(y) = 1 - 2 sin^2 (y/2) + // So: + // sin(y/2) = sqrt( (1 - x)/2 ) + // And hence: + // y/2 = asin( sqrt( (1 - x)/2 ) ) + // Equivalently: + // acos(x) = y = 2 * asin( sqrt( (1 - x)/2 ) ) + // Let u = (1 - x)/2, then: + // acos(x) = 2 * asin( sqrt(u) ) + // Moreover, since 0.5 <= x < 1: + // 0 < u <= 1/4, and 0 < sqrt(u) <= 0.5, + // And hence we can reuse the same polynomial approximation of asin(x) when + // |x| <= 0.5: + // acos(x) ~ 2 * sqrt(u) * P(u). + // + // When -1 < x <= -0.5, we reduce to the previous case using the formula: + // acos(x) = pi - acos(-x) + // = pi - 2 * asin ( sqrt( (1 + x)/2 ) ) + // ~ pi - 2 * sqrt(u) * P(u), + // where u = (1 - |x|)/2. + + // u = (1 - |x|)/2 + double u = fputil::multiply_add(x_abs, -0.5, 0.5); + // v_hi + v_lo ~ sqrt(u). 
+ // Let: + // h = u - v_hi^2 = (sqrt(u) - v_hi) * (sqrt(u) + v_hi) + // Then: + // sqrt(u) = v_hi + h / (sqrt(u) + v_hi) + // ~ v_hi + h / (2 * v_hi) + // So we can use: + // v_lo = h / (2 * v_hi). + double v_hi = fputil::sqrt(u); + +#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + constexpr DoubleDouble CONST_TERM[2] = {{0.0, 0.0}, PI}; + DoubleDouble const_term = CONST_TERM[xbits.is_neg()]; + + double p = asin_eval(u); + double scale = x_sign * 2.0 * v_hi; + double r = const_term.hi + fputil::multiply_add(scale, p, const_term.lo); + return r; +#else + +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE + double h = fputil::multiply_add(v_hi, -v_hi, u); +#else + DoubleDouble v_hi_sq = fputil::exact_mult(v_hi, v_hi); + double h = (u - v_hi_sq.hi) - v_hi_sq.lo; +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE + + // Scale v_lo and v_hi by 2 from the formula: + // vh = v_hi * 2 + // vl = 2*v_lo = h / v_hi. + double vh = v_hi * 2.0; + double vl = h / v_hi; + + // Polynomial approximation: + // p ~ asin(sqrt(u))/sqrt(u) + unsigned idx = 0; + double err = vh * 0x1.0p-51; + + DoubleDouble p = asin_eval(DoubleDouble{0.0, u}, idx, err); + + // Perform computations in double-double arithmetic: + // asin(x) = pi/2 - (v_hi + v_lo) * (ASIN_COEFFS[idx][0] + p) + DoubleDouble r0 = fputil::quick_mult(DoubleDouble{vl, vh}, p); + + double r_hi = 0, r_lo = 0; + if (xbits.is_pos()) { + r_hi = r0.hi; + r_lo = r0.lo; + } else { + DoubleDouble r = fputil::exact_add(PI.hi, -r0.hi); + r_hi = r.hi; + r_lo = (PI.lo - r0.lo) + r.lo; + } + + // Ziv's accuracy test. + + double r_upper = r_hi + (r_lo + err); + double r_lower = r_hi + (r_lo - err); + + if (LIBC_LIKELY(r_upper == r_lower)) + return r_upper; + + // Ziv's accuracy test failed, we redo the computations in Float128. + // Recalculate mod 1/64. 
+ idx = static_cast(fputil::nearest_integer(u * 0x1.0p6)); + + // After the first step of Newton-Raphson approximating v = sqrt(u), we have + // that: + // sqrt(u) = v_hi + h / (sqrt(u) + v_hi) + // v_lo = h / (2 * v_hi) + // With error: + // sqrt(u) - (v_hi + v_lo) = h * ( 1/(sqrt(u) + v_hi) - 1/(2*v_hi) ) + // = -h^2 / (2*v * (sqrt(u) + v)^2). + // Since: + // (sqrt(u) + v_hi)^2 ~ (2sqrt(u))^2 = 4u, + // we can add another correction term to (v_hi + v_lo) that is: + // v_ll = -h^2 / (2*v_hi * 4u) + // = -v_lo * (h / 4u) + // = -vl * (h / 8u), + // making the errors: + // sqrt(u) - (v_hi + v_lo + v_ll) = O(h^3) + // well beyond 128-bit precision needed. + + // Get the rounding error of vl = 2 * v_lo ~ h / vh + // Get full product of vh * vl +#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE + double vl_lo = fputil::multiply_add(-v_hi, vl, h) / v_hi; +#else + DoubleDouble vh_vl = fputil::exact_mult(v_hi, vl); + double vl_lo = ((h - vh_vl.hi) - vh_vl.lo) / v_hi; +#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE + // vll = 2*v_ll = -vl * (h / (4u)). + double t = h * (-0.25) / u; + double vll = fputil::multiply_add(vl, t, vl_lo); + // m_v = -(v_hi + v_lo + v_ll). + Float128 m_v = fputil::quick_add( + Float128(vh), fputil::quick_add(Float128(vl), Float128(vll))); + m_v.sign = xbits.sign(); + + // Perform computations in Float128: + // acos(x) = (v_hi + v_lo + vll) * P(u) , when 0.5 <= x < 1, + // = pi - (v_hi + v_lo + vll) * P(u) , when -1 < x <= -0.5. 
+ Float128 y_f128(fputil::multiply_add(static_cast(idx), -0x1.0p-6, u)); + + Float128 p_f128 = asin_eval(y_f128, idx); + Float128 r_f128 = fputil::quick_mul(m_v, p_f128); + + if (xbits.is_neg()) + r_f128 = fputil::quick_add(PI_F128, r_f128); + + return static_cast(r_f128); +#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ACOS_H diff --git a/libc/src/math/generic/asin_utils.h b/libc/src/__support/math/asin_utils.h similarity index 96% rename from libc/src/math/generic/asin_utils.h rename to libc/src/__support/math/asin_utils.h index 44913d573de2c..3146444afc51f 100644 --- a/libc/src/math/generic/asin_utils.h +++ b/libc/src/__support/math/asin_utils.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_LIBC_SRC_MATH_GENERIC_ASIN_UTILS_H -#define LLVM_LIBC_SRC_MATH_GENERIC_ASIN_UTILS_H +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ASIN_UTILS_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_ASIN_UTILS_H #include "src/__support/FPUtil/PolyEval.h" #include "src/__support/FPUtil/double_double.h" @@ -16,7 +16,6 @@ #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/integer_literals.h" #include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" namespace LIBC_NAMESPACE_DECL { @@ -25,10 +24,10 @@ namespace { using DoubleDouble = fputil::DoubleDouble; using Float128 = fputil::DyadicFloat<128>; -constexpr DoubleDouble PI = {0x1.1a62633145c07p-53, 0x1.921fb54442d18p1}; +static constexpr DoubleDouble PI = {0x1.1a62633145c07p-53, 0x1.921fb54442d18p1}; -constexpr DoubleDouble PI_OVER_TWO = {0x1.1a62633145c07p-54, - 0x1.921fb54442d18p0}; +static constexpr DoubleDouble PI_OVER_TWO = {0x1.1a62633145c07p-54, + 0x1.921fb54442d18p0}; #ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS @@ -39,14 +38,14 @@ constexpr DoubleDouble PI_OVER_TWO = {0x1.1a62633145c07p-54, // > dirtyinfnorm(asin(x)/x - P, [0, 
0.5]); // 0x1.1a71ef0a0f26a9fb7ed7e41dee788b13d1770db3dp-52 -constexpr double ASIN_COEFFS[12] = { +static constexpr double ASIN_COEFFS[12] = { 0x1.0000000000000p0, 0x1.5555555556dcfp-3, 0x1.3333333082e11p-4, 0x1.6db6dd14099edp-5, 0x1.f1c69b35bf81fp-6, 0x1.6e97194225a67p-6, 0x1.1babddb82ce12p-6, 0x1.d55bd078600d6p-7, 0x1.33328959e63d6p-7, 0x1.2b5993bda1d9bp-6, -0x1.806aff270bf25p-7, 0x1.02614e5ed3936p-5, }; -LIBC_INLINE double asin_eval(double u) { +LIBC_INLINE static constexpr double asin_eval(double u) { double u2 = u * u; double c0 = fputil::multiply_add(u, ASIN_COEFFS[1], ASIN_COEFFS[0]); double c1 = fputil::multiply_add(u, ASIN_COEFFS[3], ASIN_COEFFS[2]); @@ -124,7 +123,7 @@ LIBC_INLINE double asin_eval(double u) { // > dirtyinfnorm(asin(x)/x - P, [-1/64, 1/64]); // 0x1.999075402cafp-83 -constexpr double ASIN_COEFFS[9][12] = { +static constexpr double ASIN_COEFFS[9][12] = { {1.0, 0.0, 0x1.5555555555555p-3, 0x1.5555555555555p-57, 0x1.3333333333333p-4, 0x1.6db6db6db6db7p-5, 0x1.f1c71c71c71c7p-6, 0x1.6e8ba2e8ba2e9p-6, 0x1.1c4ec4ec4ec4fp-6, 0x1.c99999999999ap-7, @@ -164,8 +163,8 @@ constexpr double ASIN_COEFFS[9][12] = { }; // We calculate the lower part of the approximation P(u). -LIBC_INLINE DoubleDouble asin_eval(const DoubleDouble &u, unsigned &idx, - double &err) { +LIBC_INLINE static DoubleDouble asin_eval(const DoubleDouble &u, unsigned &idx, + double &err) { using fputil::multiply_add; // k = round(u * 32). double k = fputil::nearest_integer(u.hi * 0x1.0p5); @@ -239,7 +238,7 @@ LIBC_INLINE DoubleDouble asin_eval(const DoubleDouble &u, unsigned &idx, // + (676039 x^24)/104857600 + (1300075 x^26)/226492416 + // + (5014575 x^28)/973078528 + (9694845 x^30)/2080374784. 
-constexpr Float128 ASIN_COEFFS_F128[17][16] = { +static constexpr Float128 ASIN_COEFFS_F128[17][16] = { { {Sign::POS, -127, 0x80000000'00000000'00000000'00000000_u128}, {Sign::POS, -130, 0xaaaaaaaa'aaaaaaaa'aaaaaaaa'aaaaaaab_u128}, @@ -548,13 +547,14 @@ constexpr Float128 ASIN_COEFFS_F128[17][16] = { }, }; -constexpr Float128 PI_OVER_TWO_F128 = { +static constexpr Float128 PI_OVER_TWO_F128 = { Sign::POS, -127, 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128}; -constexpr Float128 PI_F128 = {Sign::POS, -126, - 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128}; +static constexpr Float128 PI_F128 = { + Sign::POS, -126, 0xc90fdaa2'2168c234'c4c6628b'80dc1cd1_u128}; -LIBC_INLINE Float128 asin_eval(const Float128 &u, unsigned idx) { +LIBC_INLINE static constexpr Float128 asin_eval(const Float128 &u, + unsigned idx) { return fputil::polyeval(u, ASIN_COEFFS_F128[idx][0], ASIN_COEFFS_F128[idx][1], ASIN_COEFFS_F128[idx][2], ASIN_COEFFS_F128[idx][3], ASIN_COEFFS_F128[idx][4], ASIN_COEFFS_F128[idx][5], @@ -571,4 +571,4 @@ LIBC_INLINE Float128 asin_eval(const Float128 &u, unsigned idx) { } // namespace LIBC_NAMESPACE_DECL -#endif // LLVM_LIBC_SRC_MATH_GENERIC_ASIN_UTILS_H +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ASIN_UTILS_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index fb253a4502700..7e6a32b7cdf16 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -4016,20 +4016,6 @@ add_entrypoint_object( libc.src.__support.macros.properties.types ) -add_header_library( - asin_utils - HDRS - atan_utils.h - DEPENDS - libc.src.__support.integer_literals - libc.src.__support.FPUtil.double_double - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.nearest_integer - libc.src.__support.FPUtil.polyeval - libc.src.__support.macros.optimization -) - add_entrypoint_object( asin SRCS @@ -4037,7 +4023,7 @@ add_entrypoint_object( HDRS ../asin.h DEPENDS - .asin_utils + 
libc.src.__support.math.asin_utils libc.src.__support.FPUtil.double_double libc.src.__support.FPUtil.dyadic_float libc.src.__support.FPUtil.fenv_impl @@ -4092,17 +4078,7 @@ add_entrypoint_object( HDRS ../acos.h DEPENDS - .asin_utils - libc.src.__support.FPUtil.double_double - libc.src.__support.FPUtil.dyadic_float - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.sqrt - libc.src.__support.macros.optimization - libc.src.__support.macros.properties.types - libc.src.__support.macros.properties.cpu_features + libc.src.__support.math.acos ) add_entrypoint_object( diff --git a/libc/src/math/generic/acos.cpp b/libc/src/math/generic/acos.cpp index c14721faef3ce..3a5964290cdd3 100644 --- a/libc/src/math/generic/acos.cpp +++ b/libc/src/math/generic/acos.cpp @@ -7,272 +7,10 @@ //===----------------------------------------------------------------------===// #include "src/math/acos.h" -#include "asin_utils.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/double_double.h" -#include "src/__support/FPUtil/dyadic_float.h" -#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/sqrt.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY -#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/__support/math/acos.h" namespace LIBC_NAMESPACE_DECL { -using DoubleDouble = fputil::DoubleDouble; -using Float128 = fputil::DyadicFloat<128>; - -LLVM_LIBC_FUNCTION(double, acos, (double x)) { - using FPBits = fputil::FPBits; - - FPBits xbits(x); - int x_exp = xbits.get_biased_exponent(); - - // |x| < 0.5. - if (x_exp < FPBits::EXP_BIAS - 1) { - // |x| < 2^-55. 
- if (LIBC_UNLIKELY(x_exp < FPBits::EXP_BIAS - 55)) { - // When |x| < 2^-55, acos(x) = pi/2 -#if defined(LIBC_MATH_HAS_SKIP_ACCURATE_PASS) - return PI_OVER_TWO.hi; -#else - // Force the evaluation and prevent constant propagation so that it - // is rounded correctly for FE_UPWARD rounding mode. - return (xbits.abs().get_val() + 0x1.0p-160) + PI_OVER_TWO.hi; -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - } - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - // acos(x) = pi/2 - asin(x) - // = pi/2 - x * P(x^2) - double p = asin_eval(x * x); - return PI_OVER_TWO.hi + fputil::multiply_add(-x, p, PI_OVER_TWO.lo); -#else - unsigned idx; - DoubleDouble x_sq = fputil::exact_mult(x, x); - double err = xbits.abs().get_val() * 0x1.0p-51; - // Polynomial approximation: - // p ~ asin(x)/x - DoubleDouble p = asin_eval(x_sq, idx, err); - // asin(x) ~ x * p - DoubleDouble r0 = fputil::exact_mult(x, p.hi); - // acos(x) = pi/2 - asin(x) - // ~ pi/2 - x * p - // = pi/2 - x * (p.hi + p.lo) - double r_hi = fputil::multiply_add(-x, p.hi, PI_OVER_TWO.hi); - // Use Dekker's 2SUM algorithm to compute the lower part. - double r_lo = ((PI_OVER_TWO.hi - r_hi) - r0.hi) - r0.lo; - r_lo = fputil::multiply_add(-x, p.lo, r_lo + PI_OVER_TWO.lo); - - // Ziv's accuracy test. - - double r_upper = r_hi + (r_lo + err); - double r_lower = r_hi + (r_lo - err); - - if (LIBC_LIKELY(r_upper == r_lower)) - return r_upper; - - // Ziv's accuracy test failed, perform 128-bit calculation. - - // Recalculate mod 1/64. - idx = static_cast(fputil::nearest_integer(x_sq.hi * 0x1.0p6)); - - // Get x^2 - idx/64 exactly. When FMA is available, double-double - // multiplication will be correct for all rounding modes. Otherwise we use - // Float128 directly. 
- Float128 x_f128(x); - -#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE - // u = x^2 - idx/64 - Float128 u_hi( - fputil::multiply_add(static_cast(idx), -0x1.0p-6, x_sq.hi)); - Float128 u = fputil::quick_add(u_hi, Float128(x_sq.lo)); -#else - Float128 x_sq_f128 = fputil::quick_mul(x_f128, x_f128); - Float128 u = fputil::quick_add( - x_sq_f128, Float128(static_cast(idx) * (-0x1.0p-6))); -#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE - - Float128 p_f128 = asin_eval(u, idx); - // Flip the sign of x_f128 to perform subtraction. - x_f128.sign = x_f128.sign.negate(); - Float128 r = - fputil::quick_add(PI_OVER_TWO_F128, fputil::quick_mul(x_f128, p_f128)); - - return static_cast(r); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS - } - // |x| >= 0.5 - - double x_abs = xbits.abs().get_val(); - - // Maintaining the sign: - constexpr double SIGN[2] = {1.0, -1.0}; - double x_sign = SIGN[xbits.is_neg()]; - // |x| >= 1 - if (LIBC_UNLIKELY(x_exp >= FPBits::EXP_BIAS)) { - // x = +-1, asin(x) = +- pi/2 - if (x_abs == 1.0) { - // x = 1, acos(x) = 0, - // x = -1, acos(x) = pi - return x == 1.0 ? 0.0 : fputil::multiply_add(-x_sign, PI.hi, PI.lo); - } - // |x| > 1, return NaN. - if (xbits.is_quiet_nan()) - return x; - - // Set domain error for non-NaN input. 
- if (!xbits.is_nan()) - fputil::set_errno_if_required(EDOM); - - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - // When |x| >= 0.5, we perform range reduction as follow: - // - // When 0.5 <= x < 1, let: - // y = acos(x) - // We will use the double angle formula: - // cos(2y) = 1 - 2 sin^2(y) - // and the complement angle identity: - // x = cos(y) = 1 - 2 sin^2 (y/2) - // So: - // sin(y/2) = sqrt( (1 - x)/2 ) - // And hence: - // y/2 = asin( sqrt( (1 - x)/2 ) ) - // Equivalently: - // acos(x) = y = 2 * asin( sqrt( (1 - x)/2 ) ) - // Let u = (1 - x)/2, then: - // acos(x) = 2 * asin( sqrt(u) ) - // Moreover, since 0.5 <= x < 1: - // 0 < u <= 1/4, and 0 < sqrt(u) <= 0.5, - // And hence we can reuse the same polynomial approximation of asin(x) when - // |x| <= 0.5: - // acos(x) ~ 2 * sqrt(u) * P(u). - // - // When -1 < x <= -0.5, we reduce to the previous case using the formula: - // acos(x) = pi - acos(-x) - // = pi - 2 * asin ( sqrt( (1 + x)/2 ) ) - // ~ pi - 2 * sqrt(u) * P(u), - // where u = (1 - |x|)/2. - - // u = (1 - |x|)/2 - double u = fputil::multiply_add(x_abs, -0.5, 0.5); - // v_hi + v_lo ~ sqrt(u). - // Let: - // h = u - v_hi^2 = (sqrt(u) - v_hi) * (sqrt(u) + v_hi) - // Then: - // sqrt(u) = v_hi + h / (sqrt(u) + v_hi) - // ~ v_hi + h / (2 * v_hi) - // So we can use: - // v_lo = h / (2 * v_hi). 
- double v_hi = fputil::sqrt(u); - -#ifdef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - constexpr DoubleDouble CONST_TERM[2] = {{0.0, 0.0}, PI}; - DoubleDouble const_term = CONST_TERM[xbits.is_neg()]; - - double p = asin_eval(u); - double scale = x_sign * 2.0 * v_hi; - double r = const_term.hi + fputil::multiply_add(scale, p, const_term.lo); - return r; -#else - -#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE - double h = fputil::multiply_add(v_hi, -v_hi, u); -#else - DoubleDouble v_hi_sq = fputil::exact_mult(v_hi, v_hi); - double h = (u - v_hi_sq.hi) - v_hi_sq.lo; -#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE - - // Scale v_lo and v_hi by 2 from the formula: - // vh = v_hi * 2 - // vl = 2*v_lo = h / v_hi. - double vh = v_hi * 2.0; - double vl = h / v_hi; - - // Polynomial approximation: - // p ~ asin(sqrt(u))/sqrt(u) - unsigned idx; - double err = vh * 0x1.0p-51; - - DoubleDouble p = asin_eval(DoubleDouble{0.0, u}, idx, err); - - // Perform computations in double-double arithmetic: - // asin(x) = pi/2 - (v_hi + v_lo) * (ASIN_COEFFS[idx][0] + p) - DoubleDouble r0 = fputil::quick_mult(DoubleDouble{vl, vh}, p); - - double r_hi, r_lo; - if (xbits.is_pos()) { - r_hi = r0.hi; - r_lo = r0.lo; - } else { - DoubleDouble r = fputil::exact_add(PI.hi, -r0.hi); - r_hi = r.hi; - r_lo = (PI.lo - r0.lo) + r.lo; - } - - // Ziv's accuracy test. - - double r_upper = r_hi + (r_lo + err); - double r_lower = r_hi + (r_lo - err); - - if (LIBC_LIKELY(r_upper == r_lower)) - return r_upper; - - // Ziv's accuracy test failed, we redo the computations in Float128. - // Recalculate mod 1/64. - idx = static_cast(fputil::nearest_integer(u * 0x1.0p6)); - - // After the first step of Newton-Raphson approximating v = sqrt(u), we have - // that: - // sqrt(u) = v_hi + h / (sqrt(u) + v_hi) - // v_lo = h / (2 * v_hi) - // With error: - // sqrt(u) - (v_hi + v_lo) = h * ( 1/(sqrt(u) + v_hi) - 1/(2*v_hi) ) - // = -h^2 / (2*v * (sqrt(u) + v)^2). 
- // Since: - // (sqrt(u) + v_hi)^2 ~ (2sqrt(u))^2 = 4u, - // we can add another correction term to (v_hi + v_lo) that is: - // v_ll = -h^2 / (2*v_hi * 4u) - // = -v_lo * (h / 4u) - // = -vl * (h / 8u), - // making the errors: - // sqrt(u) - (v_hi + v_lo + v_ll) = O(h^3) - // well beyond 128-bit precision needed. - - // Get the rounding error of vl = 2 * v_lo ~ h / vh - // Get full product of vh * vl -#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE - double vl_lo = fputil::multiply_add(-v_hi, vl, h) / v_hi; -#else - DoubleDouble vh_vl = fputil::exact_mult(v_hi, vl); - double vl_lo = ((h - vh_vl.hi) - vh_vl.lo) / v_hi; -#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE - // vll = 2*v_ll = -vl * (h / (4u)). - double t = h * (-0.25) / u; - double vll = fputil::multiply_add(vl, t, vl_lo); - // m_v = -(v_hi + v_lo + v_ll). - Float128 m_v = fputil::quick_add( - Float128(vh), fputil::quick_add(Float128(vl), Float128(vll))); - m_v.sign = xbits.sign(); - - // Perform computations in Float128: - // acos(x) = (v_hi + v_lo + vll) * P(u) , when 0.5 <= x < 1, - // = pi - (v_hi + v_lo + vll) * P(u) , when -1 < x <= -0.5. 
- Float128 y_f128(fputil::multiply_add(static_cast(idx), -0x1.0p-6, u)); - - Float128 p_f128 = asin_eval(y_f128, idx); - Float128 r_f128 = fputil::quick_mul(m_v, p_f128); - - if (xbits.is_neg()) - r_f128 = fputil::quick_add(PI_F128, r_f128); - - return static_cast(r_f128); -#endif // LIBC_MATH_HAS_SKIP_ACCURATE_PASS -} +LLVM_LIBC_FUNCTION(double, acos, (double x)) { return math::acos(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/asin.cpp b/libc/src/math/generic/asin.cpp index ad77683d1f880..c033597334345 100644 --- a/libc/src/math/generic/asin.cpp +++ b/libc/src/math/generic/asin.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/math/asin.h" -#include "asin_utils.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" @@ -18,6 +17,7 @@ #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA +#include "src/__support/math/asin_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index f0b45a99aae40..1d9989debdcdb 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2077,6 +2077,38 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_math_acos", + hdrs = ["src/__support/math/acos.h"], + deps = [ + ":__support_math_asin_utils", + ":__support_fputil_double_double", + ":__support_fputil_dyadic_float", + ":__support_fputil_fenv_impl", + ":__support_fputil_fp_bits", + ":__support_fputil_multiply_add", + ":__support_fputil_polyeval", + ":__support_fputil_sqrt", + ":__support_macros_optimization", + ":__support_macros_properties_types", + ":__support_macros_properties_cpu_features", + ], 
+) + +libc_support_library( + name = "__support_math_asin_utils", + hdrs = ["src/__support/math/asin_utils.h"], + deps = [ + ":__support_integer_literals", + ":__support_fputil_double_double", + ":__support_fputil_dyadic_float", + ":__support_fputil_multiply_add", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_macros_optimization", + ], +) + libc_support_library( name = "__support_math_exp_float_constants", hdrs = ["src/__support/math/exp_float_constants.h"], @@ -2554,6 +2586,13 @@ libc_function( ################################ math targets ################################## +libc_math_function( + name = "acos", + additional_deps = [ + ":__support_math_acos", + ], +) + libc_math_function( name = "acosf", additional_deps = [ From e1ac57c1a560b1d9891f93081e2f9c862c4a8d77 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Fri, 18 Jul 2025 22:21:09 -0500 Subject: [PATCH 427/813] [mlir][test] Add missing `REQUIRES: asserts` for --debug-only flag (#149634) Debug flags are not provided in fully optimized builds. 
Test added in #149378 / #146228 --- mlir/test/IR/test-pattern-logging-listener.mlir | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/test/IR/test-pattern-logging-listener.mlir b/mlir/test/IR/test-pattern-logging-listener.mlir index e8f0d61a75960..c521110a87aa3 100644 --- a/mlir/test/IR/test-pattern-logging-listener.mlir +++ b/mlir/test/IR/test-pattern-logging-listener.mlir @@ -1,3 +1,4 @@ +// REQUIRES: asserts // RUN: mlir-opt %s --test-walk-pattern-rewrite-driver \ // RUN: --allow-unregistered-dialect --debug-only=pattern-logging-listener 2>&1 | FileCheck %s From e57315e6ca8f05154e205136bb940b8cb14028d3 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Fri, 18 Jul 2025 21:11:37 -0700 Subject: [PATCH 428/813] [MemProf] Fix discarding of noncold contexts after inlining (#149599) When we rebuild the call site tries after inlining of an allocation with MD_memprof metadata, we don't want to reapply the discarding of small non-cold contexts (under -memprof-callsite-cold-threshold=) because we have either no context size info (without -memprof-report-hinted-sizes or another option that causes us to keep that as metadata), and even with that information in the metadata, we have imperfect information at that point as we have already discarded some contexts during matching. The first case was even worse because we didn't guard our check by whether the number of cold bytes was 0, leading to very aggressive pruning during post-inline metadata rebuilding without the context size information. 
--- .../include/llvm/Analysis/MemoryProfileInfo.h | 6 + llvm/lib/Analysis/MemoryProfileInfo.cpp | 41 ++- .../test/Transforms/Inline/memprof_inline2.ll | 3 + .../test/Transforms/Inline/memprof_inline3.ll | 296 ++++++++++++++++++ 4 files changed, 333 insertions(+), 13 deletions(-) create mode 100644 llvm/test/Transforms/Inline/memprof_inline3.ll diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h index b042a717e4e49..571caf95f275d 100644 --- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h +++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h @@ -102,6 +102,12 @@ class CallStackTrie { // The maximum size of a cold allocation context, from the profile summary. uint64_t MaxColdSize; + // Tracks whether we have built the Trie from existing MD_memprof metadata. We + // apply different heuristics for determining whether to discard non-cold + // contexts when rebuilding as we have lost information available during the + // original profile match. + bool BuiltFromExistingMetadata = false; + void deleteTrieNode(CallStackTrieNode *Node) { if (!Node) return; diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index c08024a38ffc2..b3c8a7d4563b7 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -157,6 +157,8 @@ void CallStackTrie::addCallStack( } void CallStackTrie::addCallStack(MDNode *MIB) { + // Note that we are building this from existing MD_memprof metadata. 
+ BuiltFromExistingMetadata = true; MDNode *StackMD = getMIBStackNode(MIB); assert(StackMD); std::vector CallStack; @@ -187,8 +189,9 @@ void CallStackTrie::addCallStack(MDNode *MIB) { static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, AllocationType AllocType, ArrayRef ContextSizeInfo, - const uint64_t MaxColdSize, uint64_t &TotalBytes, - uint64_t &ColdBytes) { + const uint64_t MaxColdSize, + bool BuiltFromExistingMetadata, + uint64_t &TotalBytes, uint64_t &ColdBytes) { SmallVector MIBPayload( {buildCallstackMetadata(MIBCallStack, Ctx)}); MIBPayload.push_back( @@ -197,8 +200,9 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef MIBCallStack, if (ContextSizeInfo.empty()) { // The profile matcher should have provided context size info if there was a // MinCallsiteColdBytePercent < 100. Here we check >=100 to gracefully - // handle a user-provided percent larger than 100. - assert(MinCallsiteColdBytePercent >= 100); + // handle a user-provided percent larger than 100. However, we may not have + // this information if we built the Trie from existing MD_memprof metadata. + assert(BuiltFromExistingMetadata || MinCallsiteColdBytePercent >= 100); return MDNode::get(Ctx, MIBPayload); } @@ -252,9 +256,19 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) { static void saveFilteredNewMIBNodes(std::vector &NewMIBNodes, std::vector &SavedMIBNodes, unsigned CallerContextLength, - uint64_t TotalBytes, uint64_t ColdBytes) { + uint64_t TotalBytes, uint64_t ColdBytes, + bool BuiltFromExistingMetadata) { const bool MostlyCold = - MinCallsiteColdBytePercent < 100 && + // If we have built the Trie from existing MD_memprof metadata, we may or + // may not have context size information (in which case ColdBytes and + // TotalBytes are 0, which is not also guarded against below). 
Even if we + // do have some context size information from the the metadata, we have + // already gone through a round of discarding of small non-cold contexts + // during matching, and it would be overly aggressive to do it again, and + // we also want to maintain the same behavior with and without reporting + // of hinted bytes enabled. + !BuiltFromExistingMetadata && MinCallsiteColdBytePercent < 100 && + ColdBytes > 0 && ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes; // In the simplest case, with pruning disabled, keep all the new MIB nodes. @@ -386,9 +400,9 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, if (hasSingleAllocType(Node->AllocTypes)) { std::vector ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); - MIBNodes.push_back( - createMIBNode(Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, - ContextSizeInfo, MaxColdSize, TotalBytes, ColdBytes)); + MIBNodes.push_back(createMIBNode( + Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo, + MaxColdSize, BuiltFromExistingMetadata, TotalBytes, ColdBytes)); return true; } @@ -416,7 +430,8 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, // Pass in the stack length of the MIB nodes added for the immediate caller, // which is the current stack length plus 1. 
saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1, - CallerTotalBytes, CallerColdBytes); + CallerTotalBytes, CallerColdBytes, + BuiltFromExistingMetadata); TotalBytes += CallerTotalBytes; ColdBytes += CallerColdBytes; @@ -441,9 +456,9 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, return false; std::vector ContextSizeInfo; collectContextSizeInfo(Node, ContextSizeInfo); - MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold, - ContextSizeInfo, MaxColdSize, TotalBytes, - ColdBytes)); + MIBNodes.push_back(createMIBNode( + Ctx, MIBCallStack, AllocationType::NotCold, ContextSizeInfo, MaxColdSize, + BuiltFromExistingMetadata, TotalBytes, ColdBytes)); return true; } diff --git a/llvm/test/Transforms/Inline/memprof_inline2.ll b/llvm/test/Transforms/Inline/memprof_inline2.ll index 21448f142ed07..d2e3927602b81 100644 --- a/llvm/test/Transforms/Inline/memprof_inline2.ll +++ b/llvm/test/Transforms/Inline/memprof_inline2.ll @@ -38,6 +38,9 @@ ;; } ; RUN: opt -passes=inline %s -S | FileCheck %s +;; We should not perform additional discarding of non-cold contexts when +;; rebuilding the tries after inlining, even with a very low threshold. +; RUN: opt -passes=inline -memprof-callsite-cold-threshold=1 %s -S | FileCheck %s ; ModuleID = 'memprof_inline2.cc' source_filename = "memprof_inline2.cc" diff --git a/llvm/test/Transforms/Inline/memprof_inline3.ll b/llvm/test/Transforms/Inline/memprof_inline3.ll new file mode 100644 index 0000000000000..e802f2b150da7 --- /dev/null +++ b/llvm/test/Transforms/Inline/memprof_inline3.ll @@ -0,0 +1,296 @@ +;; This test is the same code as memprof_inline2.ll, except that it has +;; manually synthesized context size information. This test ensures that we +;; don't attempt to apply -memprof-callsite-cold-threshold again when +;; rebuilding the metadata after inlining. 
+; +; RUN: opt -passes=inline %s -S | FileCheck %s +;; We should not perform additional discarding of non-cold contexts when +;; rebuilding the tries after inlining, even with a very low threshold. +; RUN: opt -passes=inline -memprof-callsite-cold-threshold=0 %s -S | FileCheck %s + +; ModuleID = 'memprof_inline2.cc' +source_filename = "memprof_inline2.cc" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: mustprogress uwtable +; CHECK-LABEL: define dso_local noundef ptr @_Z3foov +define dso_local noundef ptr @_Z3foov() #0 !dbg !39 { +entry: + ;; We should keep the original memprof metadata intact. + ; CHECK: call {{.*}} @_Znam{{.*}} !memprof ![[ORIGMEMPROF:[0-9]+]] + %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #7, !dbg !42, !memprof !43, !callsite !52 + ret ptr %call, !dbg !53 +} + +; Function Attrs: nobuiltin allocsize(0) +declare noundef nonnull ptr @_Znam(i64 noundef) #1 + +;; Mark noinline so we don't inline into calls from bar and baz. We should end +;; up with a memprof metadata on the call to foo below. +; Function Attrs: mustprogress noinline uwtable +; CHECK-LABEL: define dso_local noundef ptr @_Z4foo2v +define dso_local noundef ptr @_Z4foo2v() #2 !dbg !54 { +entry: + ;; We should have memprof metadata for the call stacks from bar and baz, + ;; and the callsite metadata should be the concatentation of the id from the + ;; inlined call to new and the original callsite. 
+ ; CHECK: call {{.*}} @_Znam{{.*}} !memprof ![[NEWMEMPROF:[0-9]+]], !callsite ![[NEWCALLSITE:[0-9]+]] + %call = call noundef ptr @_Z3foov(), !dbg !55, !callsite !56 + ret ptr %call, !dbg !57 +} + +; Function Attrs: mustprogress uwtable +define dso_local noundef ptr @_Z3barv() #0 !dbg !58 { +entry: + %call = call noundef ptr @_Z4foo2v(), !dbg !59, !callsite !60 + ret ptr %call, !dbg !61 +} + +; Function Attrs: mustprogress uwtable +define dso_local noundef ptr @_Z3bazv() #0 !dbg !62 { +entry: + %call = call noundef ptr @_Z4foo2v(), !dbg !63, !callsite !64 + ret ptr %call, !dbg !65 +} + +;; Make sure we don't propagate any memprof/callsite metadata +; Function Attrs: mustprogress uwtable +; CHECK-LABEL: define dso_local noundef ptr @notprofiled +define dso_local noundef ptr @notprofiled() #0 !dbg !66 { +entry: + ;; When foo is inlined, both the memprof and callsite metadata should be + ;; stripped from the inlined call to new, as there is no callsite metadata on + ;; the call. + ; CHECK: call {{.*}} @_Znam + ; CHECK-NOT: !memprof + ; CHECK-NOT: !callsite + %call = call noundef ptr @_Z3foov(), !dbg !67 + ;; When baz is inlined, the callsite metadata should be stripped from the + ;; inlined call to foo2, as there is no callsite metadata on the call. 
+ ; CHECK: call {{.*}} @_Z4foo2v + ; CHECK-NOT: !callsite + %call2 = call noundef ptr @_Z3bazv() + ; CHECK-NEXT: ret + ret ptr %call, !dbg !68 +} + +; Function Attrs: mustprogress noinline norecurse optnone uwtable +define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #3 !dbg !69 { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca ptr, align 8 + %c = alloca ptr, align 8 + %d = alloca ptr, align 8 + %e = alloca ptr, align 8 + %f = alloca ptr, align 8 + store i32 0, ptr %retval, align 4 + store i32 %argc, ptr %argc.addr, align 4 + store ptr %argv, ptr %argv.addr, align 8 + ;; The below 4 callsites are all annotated as noinline + %call = call noundef ptr @_Z3foov() #8, !dbg !70, !callsite !71 + store ptr %call, ptr %c, align 8, !dbg !72 + %call1 = call noundef ptr @_Z3foov() #8, !dbg !73, !callsite !74 + store ptr %call1, ptr %d, align 8, !dbg !75 + %call2 = call noundef ptr @_Z3barv() #8, !dbg !76, !callsite !77 + store ptr %call2, ptr %e, align 8, !dbg !78 + %call3 = call noundef ptr @_Z3bazv() #8, !dbg !79, !callsite !80 + store ptr %call3, ptr %f, align 8, !dbg !81 + %0 = load ptr, ptr %c, align 8, !dbg !82 + call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false), !dbg !83 + %1 = load ptr, ptr %d, align 8, !dbg !84 + call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false), !dbg !85 + %2 = load ptr, ptr %e, align 8, !dbg !86 + call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false), !dbg !87 + %3 = load ptr, ptr %f, align 8, !dbg !88 + call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false), !dbg !89 + %4 = load ptr, ptr %c, align 8, !dbg !90 + %isnull = icmp eq ptr %4, null, !dbg !91 + br i1 %isnull, label %delete.end, label %delete.notnull, !dbg !91 + +delete.notnull: ; preds = %entry + call void @_ZdaPv(ptr noundef %4) #9, !dbg !92 + br label %delete.end, !dbg !92 + +delete.end: ; preds = %delete.notnull, %entry + %call4 = call i32 
@sleep(i32 noundef 200), !dbg !94 + %5 = load ptr, ptr %d, align 8, !dbg !95 + %isnull5 = icmp eq ptr %5, null, !dbg !96 + br i1 %isnull5, label %delete.end7, label %delete.notnull6, !dbg !96 + +delete.notnull6: ; preds = %delete.end + call void @_ZdaPv(ptr noundef %5) #9, !dbg !97 + br label %delete.end7, !dbg !97 + +delete.end7: ; preds = %delete.notnull6, %delete.end + %6 = load ptr, ptr %e, align 8, !dbg !98 + %isnull8 = icmp eq ptr %6, null, !dbg !99 + br i1 %isnull8, label %delete.end10, label %delete.notnull9, !dbg !99 + +delete.notnull9: ; preds = %delete.end7 + call void @_ZdaPv(ptr noundef %6) #9, !dbg !100 + br label %delete.end10, !dbg !100 + +delete.end10: ; preds = %delete.notnull9, %delete.end7 + %7 = load ptr, ptr %f, align 8, !dbg !101 + %isnull11 = icmp eq ptr %7, null, !dbg !102 + br i1 %isnull11, label %delete.end13, label %delete.notnull12, !dbg !102 + +delete.notnull12: ; preds = %delete.end10 + call void @_ZdaPv(ptr noundef %7) #9, !dbg !103 + br label %delete.end13, !dbg !103 + +delete.end13: ; preds = %delete.notnull12, %delete.end10 + ret i32 0, !dbg !104 +} + +; Function Attrs: argmemonly nofree nounwind willreturn writeonly +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #4 + +; Function Attrs: nobuiltin nounwind +declare void @_ZdaPv(ptr noundef) #5 + +declare i32 @sleep(i32 noundef) #6 + +attributes #0 = { mustprogress uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #2 = { mustprogress noinline uwtable "disable-tail-calls"="true" "frame-pointer"="all" 
"min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #3 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #4 = { argmemonly nofree nounwind willreturn writeonly } +attributes #5 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #6 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #7 = { builtin allocsize(0) } +attributes #8 = { noinline } +attributes #9 = { builtin nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8, !9} +!llvm.ident = !{!38} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git e09c924f98ec157adeaa74819b0aec9a07a1b552)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "memprof_inline.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "8711f6fd269e6cb5611fef48bc906eab") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{i32 7, 
!"frame-pointer", i32 2} +!9 = !{i32 1, !"ProfileSummary", !10} +!10 = !{!11, !12, !13, !14, !15, !16, !17, !18, !19, !20} +!11 = !{!"ProfileFormat", !"InstrProf"} +!12 = !{!"TotalCount", i64 0} +!13 = !{!"MaxCount", i64 0} +!14 = !{!"MaxInternalCount", i64 0} +!15 = !{!"MaxFunctionCount", i64 0} +!16 = !{!"NumCounts", i64 0} +!17 = !{!"NumFunctions", i64 0} +!18 = !{!"IsPartialProfile", i64 0} +!19 = !{!"PartialProfileRatio", double 0.000000e+00} +!20 = !{!"DetailedSummary", !21} +!21 = !{!22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37} +!22 = !{i32 10000, i64 0, i32 0} +!23 = !{i32 100000, i64 0, i32 0} +!24 = !{i32 200000, i64 0, i32 0} +!25 = !{i32 300000, i64 0, i32 0} +!26 = !{i32 400000, i64 0, i32 0} +!27 = !{i32 500000, i64 0, i32 0} +!28 = !{i32 600000, i64 0, i32 0} +!29 = !{i32 700000, i64 0, i32 0} +!30 = !{i32 800000, i64 0, i32 0} +!31 = !{i32 900000, i64 0, i32 0} +!32 = !{i32 950000, i64 0, i32 0} +!33 = !{i32 990000, i64 0, i32 0} +!34 = !{i32 999000, i64 0, i32 0} +!35 = !{i32 999900, i64 0, i32 0} +!36 = !{i32 999990, i64 0, i32 0} +!37 = !{i32 999999, i64 0, i32 0} +!38 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git e09c924f98ec157adeaa74819b0aec9a07a1b552)"} +!39 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !40, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!40 = !DISubroutineType(types: !41) +!41 = !{} +!42 = !DILocation(line: 5, column: 10, scope: !39) +;; The first 2 are from the direct calls to foo from main. Those stay on the +;; callsite in foo, which isn't inlined into main due to the callsites in main +;; being annotated as noinline. +;; The second 2 are from the calls from foo2, which inlines its callsite to foo +;; but is not itself inlined into its callers. Therefore they get moved to a +;; new memprof metadata within foo2. 
+!43 = !{!44, !46, !48, !50} +!44 = !{!45, !"cold", !105} +!105 = !{i64 123, i64 5000} +!45 = !{i64 -2458008693472584243, i64 7394638144382192936} +!46 = !{!47, !"notcold", !106} +!47 = !{i64 -2458008693472584243, i64 -8908997186479157179} +!106 = !{i64 345, i64 1} +!48 = !{!49, !"notcold", !107} +!49 = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -4805294506621015872} +!107 = !{i64 678, i64 1} +!50 = !{!51, !"cold", !108} +!51 = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -972865200055133905} +!108 = !{i64 234, i64 5000} +; CHECK: ![[ORIGMEMPROF]] = !{![[ORIGMIB1:[0-9]+]], ![[ORIGMIB2:[0-9]+]], ![[ORIGMIB3:[0-9]+]], ![[ORIGMIB4:[0-9]+]]} +; CHECK: ![[ORIGMIB1]] = !{![[ORIGMIBSTACK1:[0-9]+]], !"cold" +; CHECK: ![[ORIGMIBSTACK1]] = !{i64 -2458008693472584243, i64 7394638144382192936} +; CHECK: ![[ORIGMIB2]] = !{![[ORIGMIBSTACK2:[0-9]+]], !"notcold" +; CHECK: ![[ORIGMIBSTACK2]] = !{i64 -2458008693472584243, i64 -8908997186479157179} +; CHECK: ![[ORIGMIB3]] = !{![[ORIGMIBSTACK3:[0-9]+]], !"notcold" +; CHECK: ![[ORIGMIBSTACK3]] = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -4805294506621015872} +; CHECK: ![[ORIGMIB4]] = !{![[ORIGMIBSTACK4:[0-9]+]], !"cold" +; CHECK: ![[ORIGMIBSTACK4]] = !{i64 -2458008693472584243, i64 -8079659623765193173, i64 -972865200055133905} +; CHECK: ![[NEWMEMPROF]] = !{![[ORIGMIB3:[0-9]+]], ![[ORIGMIB4:[0-9]+]]} +; CHECK: ![[NEWCALLSITE]] = !{i64 -2458008693472584243, i64 -8079659623765193173} +!52 = !{i64 -2458008693472584243} +!53 = !DILocation(line: 5, column: 3, scope: !39) +!54 = distinct !DISubprogram(name: "foo2", linkageName: "_Z4foo2v", scope: !1, file: !1, line: 7, type: !40, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!55 = !DILocation(line: 8, column: 10, scope: !54) +!56 = !{i64 -8079659623765193173} +!57 = !DILocation(line: 8, column: 3, scope: !54) +!58 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, 
line: 10, type: !40, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!59 = !DILocation(line: 11, column: 10, scope: !58) +!60 = !{i64 -972865200055133905} +!61 = !DILocation(line: 11, column: 3, scope: !58) +!62 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 13, type: !40, scopeLine: 13, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!63 = !DILocation(line: 14, column: 10, scope: !62) +!64 = !{i64 -4805294506621015872} +!65 = !DILocation(line: 14, column: 3, scope: !62) +!66 = distinct !DISubprogram(name: "notprofiled", linkageName: "notprofiled", scope: !1, file: !1, line: 400, type: !40, scopeLine: 400, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!67 = !DILocation(line: 401, column: 10, scope: !66) +!68 = !DILocation(line: 401, column: 3, scope: !66) +!69 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 16, type: !40, scopeLine: 16, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !41) +!70 = !DILocation(line: 17, column: 13, scope: !69) +!71 = !{i64 -8908997186479157179} +!72 = !DILocation(line: 17, column: 9, scope: !69) +!73 = !DILocation(line: 18, column: 13, scope: !69) +!74 = !{i64 7394638144382192936} +!75 = !DILocation(line: 18, column: 9, scope: !69) +!76 = !DILocation(line: 19, column: 13, scope: !69) +!77 = !{i64 -5510257407004945023} +!78 = !DILocation(line: 19, column: 9, scope: !69) +!79 = !DILocation(line: 20, column: 13, scope: !69) +!80 = !{i64 8771588133652501463} +!81 = !DILocation(line: 20, column: 9, scope: !69) +!82 = !DILocation(line: 21, column: 10, scope: !69) +!83 = !DILocation(line: 21, column: 3, scope: !69) +!84 = !DILocation(line: 22, column: 10, scope: !69) +!85 = !DILocation(line: 22, column: 3, scope: !69) +!86 = !DILocation(line: 23, column: 10, scope: !69) +!87 = !DILocation(line: 23, column: 3, scope: !69) +!88 
= !DILocation(line: 24, column: 10, scope: !69) +!89 = !DILocation(line: 24, column: 3, scope: !69) +!90 = !DILocation(line: 25, column: 12, scope: !69) +!91 = !DILocation(line: 25, column: 3, scope: !69) +!92 = !DILocation(line: 25, column: 3, scope: !93) +!93 = !DILexicalBlockFile(scope: !69, file: !1, discriminator: 2) +!94 = !DILocation(line: 26, column: 3, scope: !69) +!95 = !DILocation(line: 27, column: 12, scope: !69) +!96 = !DILocation(line: 27, column: 3, scope: !69) +!97 = !DILocation(line: 27, column: 3, scope: !93) +!98 = !DILocation(line: 28, column: 12, scope: !69) +!99 = !DILocation(line: 28, column: 3, scope: !69) +!100 = !DILocation(line: 28, column: 3, scope: !93) +!101 = !DILocation(line: 29, column: 12, scope: !69) +!102 = !DILocation(line: 29, column: 3, scope: !69) +!103 = !DILocation(line: 29, column: 3, scope: !93) +!104 = !DILocation(line: 30, column: 3, scope: !69) From a5481e7d5af07161b5d135100cb8754ae614accf Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Fri, 18 Jul 2025 21:53:53 -0700 Subject: [PATCH 429/813] [NFCI] [HWASan] add test for custom section global (#149625) --- llvm/test/Instrumentation/HWAddressSanitizer/globals.ll | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/globals.ll b/llvm/test/Instrumentation/HWAddressSanitizer/globals.ll index 4e22f5fec0067..f5ae1c0f80497 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/globals.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/globals.ll @@ -4,7 +4,7 @@ ; CHECK29: @four = global ; CHECK: @specialcaselisted = global i16 2, no_sanitize_hwaddress - +; CHECK: @insection = global i16 2, section "custom" ; CHECK: @__start_hwasan_globals = external hidden constant [0 x i8] ; CHECK: @__stop_hwasan_globals = external hidden constant [0 x i8] @@ -37,3 +37,4 @@ source_filename = "foo" @sixteen = global [16 x i8] zeroinitializer @huge = global [16777232 x i8] zeroinitializer @specialcaselisted = global i16 2, 
no_sanitize_hwaddress +@insection = global i16 2, section "custom" From ef49ed4829bc1b111e31a08d70b0ccae66427ebf Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Sat, 19 Jul 2025 07:18:51 +0200 Subject: [PATCH 430/813] =?UTF-8?q?[clang][bytecode]=20Use=20bytecode=20in?= =?UTF-8?q?terpreter=20in=20isPotentialConstantExprU=E2=80=A6=20(#149462)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …nevaluated Fake a function call to the given function and evaluate the given expression as if it was part of that function call. Fixes #149383 --- clang/lib/AST/ByteCode/Compiler.cpp | 5 +++++ clang/lib/AST/ByteCode/Context.cpp | 13 +++++++++++++ clang/lib/AST/ByteCode/Context.h | 4 +++- clang/lib/AST/ByteCode/EvalEmitter.cpp | 13 +++++++++++++ clang/lib/AST/ByteCode/EvalEmitter.h | 3 +++ clang/lib/AST/ByteCode/Interp.cpp | 9 ++++++++- clang/lib/AST/ByteCode/Interp.h | 2 +- clang/lib/AST/ExprConstant.cpp | 5 +++++ clang/test/AST/ByteCode/builtin-constant-p.cpp | 8 ++++++++ clang/test/Sema/diagnose_if.c | 1 + clang/test/SemaCXX/diagnose_if-ext.cpp | 1 + clang/test/SemaCXX/diagnose_if.cpp | 1 + 12 files changed, 62 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index ea473730350b6..65ad7caf8913b 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -6670,6 +6670,11 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { } // Function parameters. 
if (const auto *PVD = dyn_cast(D)) { + if (Ctx.getLangOpts().CPlusPlus && !Ctx.getLangOpts().CPlusPlus11 && + !D->getType()->isIntegralOrEnumerationType()) { + return this->emitInvalidDeclRef(cast(E), + /*InitializerFailed=*/false, E); + } if (auto It = this->Params.find(PVD); It != this->Params.end()) { if (IsReference || !It->second.IsPtr) return this->emitGetParam(classifyPrim(E), It->second.Offset, E); diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index a629ff9569428..ead6e4af5d403 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -52,6 +52,19 @@ bool Context::isPotentialConstantExpr(State &Parent, const FunctionDecl *FD) { return Func->isValid(); } +void Context::isPotentialConstantExprUnevaluated(State &Parent, const Expr *E, + const FunctionDecl *FD) { + assert(Stk.empty()); + ++EvalID; + size_t StackSizeBefore = Stk.size(); + Compiler C(*this, *P, Parent, Stk); + + if (!C.interpretCall(FD, E)) { + C.cleanup(); + Stk.clearTo(StackSizeBefore); + } +} + bool Context::evaluateAsRValue(State &Parent, const Expr *E, APValue &Result) { ++EvalID; bool Recursing = !Stk.empty(); diff --git a/clang/lib/AST/ByteCode/Context.h b/clang/lib/AST/ByteCode/Context.h index 5898ab5e54599..acf750421f8af 100644 --- a/clang/lib/AST/ByteCode/Context.h +++ b/clang/lib/AST/ByteCode/Context.h @@ -47,7 +47,9 @@ class Context final { ~Context(); /// Checks if a function is a potential constant expression. - bool isPotentialConstantExpr(State &Parent, const FunctionDecl *FnDecl); + bool isPotentialConstantExpr(State &Parent, const FunctionDecl *FD); + void isPotentialConstantExprUnevaluated(State &Parent, const Expr *E, + const FunctionDecl *FD); /// Evaluates a toplevel expression as an rvalue. 
bool evaluateAsRValue(State &Parent, const Expr *E, APValue &Result); diff --git a/clang/lib/AST/ByteCode/EvalEmitter.cpp b/clang/lib/AST/ByteCode/EvalEmitter.cpp index 5498065657e0a..6e511bc7d2fab 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.cpp +++ b/clang/lib/AST/ByteCode/EvalEmitter.cpp @@ -90,6 +90,19 @@ EvaluationResult EvalEmitter::interpretAsPointer(const Expr *E, return std::move(this->EvalResult); } +bool EvalEmitter::interpretCall(const FunctionDecl *FD, const Expr *E) { + // Add parameters to the parameter map. The values in the ParamOffset don't + // matter in this case as reading from them can't ever work. + for (const ParmVarDecl *PD : FD->parameters()) { + this->Params.insert({PD, {0, false}}); + } + + if (!this->visit(E)) + return false; + PrimType T = Ctx.classify(E).value_or(PT_Ptr); + return this->emitPop(T, E); +} + void EvalEmitter::emitLabel(LabelTy Label) { CurrentLabel = Label; } EvalEmitter::LabelTy EvalEmitter::getLabel() { return NextLabel++; } diff --git a/clang/lib/AST/ByteCode/EvalEmitter.h b/clang/lib/AST/ByteCode/EvalEmitter.h index 7303adba22af7..2fe7da608c739 100644 --- a/clang/lib/AST/ByteCode/EvalEmitter.h +++ b/clang/lib/AST/ByteCode/EvalEmitter.h @@ -40,6 +40,9 @@ class EvalEmitter : public SourceMapper { EvaluationResult interpretDecl(const VarDecl *VD, bool CheckFullyInitialized); /// Interpret the given Expr to a Pointer. EvaluationResult interpretAsPointer(const Expr *E, PtrCallback PtrCB); + /// Interpret the given expression as if it was in the body of the given + /// function, i.e. the parameters of the function are available for use. + bool interpretCall(const FunctionDecl *FD, const Expr *E); /// Clean up all resources. 
void cleanup(); diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index df5e3be83d741..5463aecf23087 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -142,8 +142,12 @@ static bool diagnoseUnknownDecl(InterpState &S, CodePtr OpPC, return false; if (isa(D)) { - if (D->getType()->isReferenceType()) + if (D->getType()->isReferenceType()) { + if (S.inConstantContext() && S.getLangOpts().CPlusPlus && + !S.getLangOpts().CPlusPlus11) + diagnoseNonConstVariable(S, OpPC, D); return false; + } const SourceInfo &Loc = S.Current->getSource(OpPC); if (S.getLangOpts().CPlusPlus11) { @@ -661,6 +665,9 @@ bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, if (Ptr.isInitialized()) return true; + if (Ptr.isExtern() && S.checkingPotentialConstantExpression()) + return false; + if (const auto *VD = Ptr.getDeclDesc()->asVarDecl(); VD && (VD->isConstexpr() || VD->hasGlobalStorage())) { diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index ce0ebdd8321b7..aac519d7c74fd 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1308,7 +1308,7 @@ bool Dup(InterpState &S, CodePtr OpPC) { template ::T> bool Pop(InterpState &S, CodePtr OpPC) { - S.Stk.pop(); + S.Stk.discard(); return true; } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 8797eaddd0e18..cfc4729be4184 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -18018,6 +18018,11 @@ bool Expr::isPotentialConstantExprUnevaluated(Expr *E, Info.InConstantContext = true; Info.CheckingPotentialConstantExpression = true; + if (Info.EnableNewConstInterp) { + Info.Ctx.getInterpContext().isPotentialConstantExprUnevaluated(Info, E, FD); + return Diags.empty(); + } + // Fabricate a call stack frame to give the arguments a plausible cover story. 
CallStackFrame Frame(Info, SourceLocation(), FD, /*This=*/nullptr, /*CallExpr=*/nullptr, CallRef()); diff --git a/clang/test/AST/ByteCode/builtin-constant-p.cpp b/clang/test/AST/ByteCode/builtin-constant-p.cpp index 9f5521590833d..315a907949c34 100644 --- a/clang/test/AST/ByteCode/builtin-constant-p.cpp +++ b/clang/test/AST/ByteCode/builtin-constant-p.cpp @@ -140,3 +140,11 @@ void test17(void) { F("string literal" + 1); // both-warning {{adding}} \ // both-note {{use array indexing}} } + +/// FIXME +static void foo(int i) __attribute__((__diagnose_if__(!__builtin_constant_p(i), "not constant", "error"))) // expected-note {{from}} +{ +} +static void bar(int i) { + foo(15); // expected-error {{not constant}} +} diff --git a/clang/test/Sema/diagnose_if.c b/clang/test/Sema/diagnose_if.c index e9b8497d5ca4e..a4cf43e9c869f 100644 --- a/clang/test/Sema/diagnose_if.c +++ b/clang/test/Sema/diagnose_if.c @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -verify -fno-builtin +// RUN: %clang_cc1 %s -verify -fno-builtin -fexperimental-new-constant-interpreter #define _diagnose_if(...) 
__attribute__((diagnose_if(__VA_ARGS__))) diff --git a/clang/test/SemaCXX/diagnose_if-ext.cpp b/clang/test/SemaCXX/diagnose_if-ext.cpp index d5625b501322e..e0f73976eea3a 100644 --- a/clang/test/SemaCXX/diagnose_if-ext.cpp +++ b/clang/test/SemaCXX/diagnose_if-ext.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -Wpedantic -fsyntax-only %s -verify +// RUN: %clang_cc1 -Wpedantic -fsyntax-only %s -verify -fexperimental-new-constant-interpreter void foo() __attribute__((diagnose_if(1, "", "error"))); // expected-warning{{'diagnose_if' is a clang extension}} void foo(int a) __attribute__((diagnose_if(a, "", "error"))); // expected-warning{{'diagnose_if' is a clang extension}} diff --git a/clang/test/SemaCXX/diagnose_if.cpp b/clang/test/SemaCXX/diagnose_if.cpp index 21897c5184b73..1b9e660c4e224 100644 --- a/clang/test/SemaCXX/diagnose_if.cpp +++ b/clang/test/SemaCXX/diagnose_if.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -verify -fno-builtin -std=c++14 +// RUN: %clang_cc1 %s -verify -fno-builtin -std=c++14 -fexperimental-new-constant-interpreter #define _diagnose_if(...) __attribute__((diagnose_if(__VA_ARGS__))) From c875bb8eef6c60e7cd5814fdbab149abb86efa30 Mon Sep 17 00:00:00 2001 From: Connector Switch Date: Sat, 19 Jul 2025 13:41:07 +0800 Subject: [PATCH 431/813] [flang] Minor revision of the getting started guide. (#149632) - C++17 is now the default, no need to specify explicitly. https://github.com/llvm/llvm-project/blob/b3c9ed151f18fbbfe027cf93dd7957f36bcbaccf/flang/CMakeLists.txt#L13 - The current recommended way to set up OpenMP is as a runtime, not as a project. 
https://github.com/llvm/llvm-project/blob/b3c9ed151f18fbbfe027cf93dd7957f36bcbaccf/llvm/CMakeLists.txt#L209-L214 --- flang/docs/GettingStarted.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/flang/docs/GettingStarted.md b/flang/docs/GettingStarted.md index 0b3b551ffbfba..2ea8093b607cf 100644 --- a/flang/docs/GettingStarted.md +++ b/flang/docs/GettingStarted.md @@ -74,15 +74,14 @@ cmake \ -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX=$INSTALLDIR \ - -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_CXX_LINK_FLAGS="-Wl,-rpath,$LD_LIBRARY_PATH" \ -DFLANG_ENABLE_WERROR=ON \ -DLLVM_ENABLE_ASSERTIONS=ON \ -DLLVM_TARGETS_TO_BUILD=host \ -DLLVM_LIT_ARGS=-v \ - -DLLVM_ENABLE_PROJECTS="clang;mlir;flang;openmp" \ - -DLLVM_ENABLE_RUNTIMES="compiler-rt;flang-rt" \ + -DLLVM_ENABLE_PROJECTS="clang;mlir;flang" \ + -DLLVM_ENABLE_RUNTIMES="compiler-rt;flang-rt;openmp" \ ../llvm-project/llvm ninja @@ -141,7 +140,6 @@ cd build cmake \ -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_CXX_STANDARD=17 \ -DCMAKE_CXX_LINK_FLAGS="-Wl,-rpath,$LD_LIBRARY_PATH" \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DFLANG_ENABLE_WERROR=ON \ From 9bf7d04c4386daf1ef0acf95782a59855c98474a Mon Sep 17 00:00:00 2001 From: Vassil Vassilev Date: Sat, 19 Jul 2025 09:25:27 +0300 Subject: [PATCH 432/813] [clang-repl] Lay the basic infrastructure for pretty printing of types (#148701) The idea is to store a type-value pair in clang::Value which is updated by the interpreter runtime. The class copies builtin types and boxes non-builtin types to provide some lifetime control. The patch enables default printers for C and C++ using a very minimalistic approach. We handle enums, arrays and user types. 
Once we land this we can focus on enabling user-defined pretty-printers which take control over printing of types The work started as part of https://reviews.llvm.org/D146809, then we created a giant in https://github.com/llvm/llvm-project/pull/84769 --- clang/include/clang/AST/ASTContext.h | 2 + clang/include/clang/Interpreter/Interpreter.h | 39 +- clang/include/clang/Interpreter/Value.h | 10 +- clang/lib/Interpreter/CMakeLists.txt | 1 + clang/lib/Interpreter/Interpreter.cpp | 43 +- clang/lib/Interpreter/InterpreterUtils.cpp | 8 +- clang/lib/Interpreter/InterpreterUtils.h | 2 +- .../Interpreter/InterpreterValuePrinter.cpp | 395 ++++++++++++++++-- clang/lib/Interpreter/Value.cpp | 47 ++- clang/lib/Parse/ParseStmt.cpp | 3 +- clang/test/Interpreter/pretty-print.c | 83 +++- clang/test/Interpreter/pretty-print.cpp | 70 ++++ .../unittests/Interpreter/InterpreterTest.cpp | 20 + 13 files changed, 640 insertions(+), 83 deletions(-) create mode 100644 clang/test/Interpreter/pretty-print.cpp diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 66ec3395571ea..27360a15b3a5c 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1192,6 +1192,8 @@ class ASTContext : public RefCountedBase { bool isInSameModule(const Module *M1, const Module *M2) const; TranslationUnitDecl *getTranslationUnitDecl() const { + assert(TUDecl->getMostRecentDecl() == TUDecl && + "The active TU is not current one!"); return TUDecl->getMostRecentDecl(); } void addTranslationUnitDecl() { diff --git a/clang/include/clang/Interpreter/Interpreter.h b/clang/include/clang/Interpreter/Interpreter.h index 78dff1165dcf5..83d2962cbf3ba 100644 --- a/clang/include/clang/Interpreter/Interpreter.h +++ b/clang/include/clang/Interpreter/Interpreter.h @@ -175,31 +175,42 @@ class Interpreter { llvm::Expected getSymbolAddressFromLinkerName(llvm::StringRef LinkerName) const; - const llvm::SmallVectorImpl &getValuePrintingInfo() const { - 
return ValuePrintingInfo; - } - - Expr *SynthesizeExpr(Expr *E); + std::unique_ptr GenModule(IncrementalAction *Action = nullptr); + PartialTranslationUnit &RegisterPTU(TranslationUnitDecl *TU, + std::unique_ptr M = {}, + IncrementalAction *Action = nullptr); private: size_t getEffectivePTUSize() const; void markUserCodeStart(); llvm::Expected ExtractValueFromExpr(Expr *E); - llvm::Expected CompileDtorCall(CXXRecordDecl *CXXRD); - - CodeGenerator *getCodeGen(IncrementalAction *Action = nullptr) const; - std::unique_ptr GenModule(IncrementalAction *Action = nullptr); - PartialTranslationUnit &RegisterPTU(TranslationUnitDecl *TU, - std::unique_ptr M = {}, - IncrementalAction *Action = nullptr); // A cache for the compiled destructors used to for de-allocation of managed // clang::Values. - llvm::DenseMap Dtors; + mutable llvm::DenseMap Dtors; - llvm::SmallVector ValuePrintingInfo; + std::array ValuePrintingInfo = {0}; std::unique_ptr JITBuilder; + + /// @} + /// @name Value and pretty printing support + /// @{ + + std::string ValueDataToString(const Value &V) const; + std::string ValueTypeToString(const Value &V) const; + + llvm::Expected convertExprToValue(Expr *E); + + // When we deallocate clang::Value we need to run the destructor of the type. + // This function forces emission of the needed dtor. 
+ llvm::Expected + CompileDtorCall(CXXRecordDecl *CXXRD) const; + + /// @} + /// @name Code generation + /// @{ + CodeGenerator *getCodeGen(IncrementalAction *Action = nullptr) const; }; } // namespace clang diff --git a/clang/include/clang/Interpreter/Value.h b/clang/include/clang/Interpreter/Value.h index a93c0841915fc..b91301e6096eb 100644 --- a/clang/include/clang/Interpreter/Value.h +++ b/clang/include/clang/Interpreter/Value.h @@ -35,6 +35,7 @@ #include "llvm/Config/llvm-config.h" // for LLVM_BUILD_LLVM_DYLIB, LLVM_BUILD_SHARED_LIBS #include "llvm/Support/Compiler.h" +#include #include // NOTE: Since the REPL itself could also include this runtime, extreme caution @@ -97,6 +98,7 @@ class REPL_EXTERNAL_VISIBILITY Value { REPL_BUILTIN_TYPES #undef X void *m_Ptr; + unsigned char m_RawBits[sizeof(long double) * 8]; // widest type }; public: @@ -111,7 +113,7 @@ class REPL_EXTERNAL_VISIBILITY Value { }; Value() = default; - Value(Interpreter *In, void *Ty); + Value(const Interpreter *In, void *Ty); Value(const Value &RHS); Value(Value &&RHS) noexcept; Value &operator=(const Value &RHS); @@ -124,9 +126,7 @@ class REPL_EXTERNAL_VISIBILITY Value { void dump() const; void clear(); - ASTContext &getASTContext(); const ASTContext &getASTContext() const; - Interpreter &getInterpreter(); const Interpreter &getInterpreter() const; QualType getType() const; @@ -140,6 +140,7 @@ class REPL_EXTERNAL_VISIBILITY Value { void *getPtr() const; void setPtr(void *Ptr) { Data.m_Ptr = Ptr; } + void setRawBits(void *Ptr, unsigned NBits = sizeof(Storage)); #define X(type, name) \ void set##name(type Val) { Data.m_##name = Val; } \ @@ -193,7 +194,7 @@ class REPL_EXTERNAL_VISIBILITY Value { } }; - Interpreter *Interp = nullptr; + const Interpreter *Interp = nullptr; void *OpaqueType = nullptr; Storage Data; Kind ValueKind = K_Unspecified; @@ -205,6 +206,5 @@ template <> inline void *Value::as() const { return Data.m_Ptr; return (void *)as(); } - } // namespace clang #endif diff --git 
a/clang/lib/Interpreter/CMakeLists.txt b/clang/lib/Interpreter/CMakeLists.txt index 38cf139fa86a6..70de4a2aaa541 100644 --- a/clang/lib/Interpreter/CMakeLists.txt +++ b/clang/lib/Interpreter/CMakeLists.txt @@ -29,6 +29,7 @@ add_clang_library(clangInterpreter InterpreterUtils.cpp RemoteJITUtils.cpp Value.cpp + InterpreterValuePrinter.cpp ${WASM_SRC} PARTIAL_SOURCES_INTENDED diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index ed3bae59a144c..db6a2bb914f43 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -264,7 +264,7 @@ class InProcessPrintingASTConsumer final : public MultiplexConsumer { if (auto *TLSD = llvm::dyn_cast(D)) if (TLSD && TLSD->isSemiMissing()) { auto ExprOrErr = - Interp.ExtractValueFromExpr(cast(TLSD->getStmt())); + Interp.convertExprToValue(cast(TLSD->getStmt())); if (llvm::Error E = ExprOrErr.takeError()) { llvm::logAllUnhandledErrors(std::move(E), llvm::errs(), "Value printing failed: "); @@ -440,11 +440,10 @@ const char *const Runtimes = R"( #define __CLANG_REPL__ 1 #ifdef __cplusplus #define EXTERN_C extern "C" - void *__clang_Interpreter_SetValueWithAlloc(void*, void*, void*); struct __clang_Interpreter_NewTag{} __ci_newtag; void* operator new(__SIZE_TYPE__, void* __p, __clang_Interpreter_NewTag) noexcept; template - void __clang_Interpreter_SetValueCopyArr(T* Src, void* Placement, unsigned long Size) { + void __clang_Interpreter_SetValueCopyArr(const T* Src, void* Placement, unsigned long Size) { for (auto Idx = 0; Idx < Size; ++Idx) new ((void*)(((T*)Placement) + Idx), __ci_newtag) T(Src[Idx]); } @@ -454,8 +453,12 @@ const char *const Runtimes = R"( } #else #define EXTERN_C extern + EXTERN_C void *memcpy(void *restrict dst, const void *restrict src, __SIZE_TYPE__ n); + EXTERN_C inline void __clang_Interpreter_SetValueCopyArr(const void* Src, void* Placement, unsigned long Size) { + memcpy(Placement, Src, Size); + } #endif // __cplusplus - + EXTERN_C 
void *__clang_Interpreter_SetValueWithAlloc(void*, void*, void*); EXTERN_C void __clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void *OpaqueType, ...); )"; @@ -470,12 +473,12 @@ Interpreter::create(std::unique_ptr CI, // Add runtime code and set a marker to hide it from user code. Undo will not // go through that. - auto PTU = Interp->Parse(Runtimes); - if (!PTU) - return PTU.takeError(); + Err = Interp->ParseAndExecute(Runtimes); + if (Err) + return std::move(Err); + Interp->markUserCodeStart(); - Interp->ValuePrintingInfo.resize(4); return std::move(Interp); } @@ -524,12 +527,11 @@ Interpreter::createWithCUDA(std::unique_ptr CI, return std::move(Interp); } +CompilerInstance *Interpreter::getCompilerInstance() { return CI.get(); } const CompilerInstance *Interpreter::getCompilerInstance() const { - return CI.get(); + return const_cast(this)->getCompilerInstance(); } -CompilerInstance *Interpreter::getCompilerInstance() { return CI.get(); } - llvm::Expected Interpreter::getExecutionEngine() { if (!IncrExecutor) { if (auto Err = CreateExecutor()) @@ -610,7 +612,14 @@ Interpreter::Parse(llvm::StringRef Code) { if (!TuOrErr) return TuOrErr.takeError(); - return RegisterPTU(*TuOrErr); + PTUs.emplace_back(PartialTranslationUnit()); + PartialTranslationUnit &LastPTU = PTUs.back(); + LastPTU.TUPart = *TuOrErr; + + if (std::unique_ptr M = GenModule()) + LastPTU.TheModule = std::move(M); + + return LastPTU; } static llvm::Expected @@ -808,10 +817,10 @@ Interpreter::GenModule(IncrementalAction *Action) { // sure it always stays empty. 
assert(((!CachedInCodeGenModule || !getCompilerInstance()->getPreprocessorOpts().Includes.empty()) || - (CachedInCodeGenModule->empty() && - CachedInCodeGenModule->global_empty() && - CachedInCodeGenModule->alias_empty() && - CachedInCodeGenModule->ifunc_empty())) && + ((CachedInCodeGenModule->empty() && + CachedInCodeGenModule->global_empty() && + CachedInCodeGenModule->alias_empty() && + CachedInCodeGenModule->ifunc_empty()))) && "CodeGen wrote to a readonly module"); std::unique_ptr M(CG->ReleaseModule()); CG->StartModule("incr_module_" + std::to_string(ID++), M->getContext()); @@ -828,4 +837,4 @@ CodeGenerator *Interpreter::getCodeGen(IncrementalAction *Action) const { return nullptr; return static_cast(WrappedAct)->getCodeGenerator(); } -} // namespace clang +} // end namespace clang diff --git a/clang/lib/Interpreter/InterpreterUtils.cpp b/clang/lib/Interpreter/InterpreterUtils.cpp index 45f6322b8461e..a19f96c80b94f 100644 --- a/clang/lib/Interpreter/InterpreterUtils.cpp +++ b/clang/lib/Interpreter/InterpreterUtils.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "InterpreterUtils.h" +#include "clang/AST/QualTypeNames.h" namespace clang { @@ -81,7 +82,7 @@ NamedDecl *LookupNamed(Sema &S, llvm::StringRef Name, else { const DeclContext *PrimaryWithin = nullptr; if (const auto *TD = dyn_cast(Within)) - PrimaryWithin = llvm::dyn_cast_or_null(TD->getDefinition()); + PrimaryWithin = dyn_cast_if_present(TD->getDefinition()); else PrimaryWithin = Within->getPrimaryContext(); @@ -97,15 +98,16 @@ NamedDecl *LookupNamed(Sema &S, llvm::StringRef Name, R.resolveKind(); if (R.isSingleResult()) - return llvm::dyn_cast(R.getFoundDecl()); + return dyn_cast(R.getFoundDecl()); return nullptr; } std::string GetFullTypeName(ASTContext &Ctx, QualType QT) { + QualType FQT = TypeName::getFullyQualifiedType(QT, Ctx); PrintingPolicy Policy(Ctx.getPrintingPolicy()); Policy.SuppressScope = false; Policy.AnonymousTagLocations 
= false; - return QT.getAsString(Policy); + return FQT.getAsString(Policy); } } // namespace clang diff --git a/clang/lib/Interpreter/InterpreterUtils.h b/clang/lib/Interpreter/InterpreterUtils.h index c7b405b486d93..fbf9814b0d4a7 100644 --- a/clang/lib/Interpreter/InterpreterUtils.h +++ b/clang/lib/Interpreter/InterpreterUtils.h @@ -45,7 +45,7 @@ NamespaceDecl *LookupNamespace(Sema &S, llvm::StringRef Name, const DeclContext *Within = nullptr); NamedDecl *LookupNamed(Sema &S, llvm::StringRef Name, - const DeclContext *Within); + const DeclContext *Within = nullptr); std::string GetFullTypeName(ASTContext &Ctx, QualType QT); } // namespace clang diff --git a/clang/lib/Interpreter/InterpreterValuePrinter.cpp b/clang/lib/Interpreter/InterpreterValuePrinter.cpp index 3e7e32b2e8557..34ffd62da85e4 100644 --- a/clang/lib/Interpreter/InterpreterValuePrinter.cpp +++ b/clang/lib/Interpreter/InterpreterValuePrinter.cpp @@ -18,6 +18,7 @@ #include "clang/Frontend/CompilerInstance.h" #include "clang/Interpreter/Interpreter.h" #include "clang/Interpreter/Value.h" +#include "clang/Lex/Preprocessor.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/Sema.h" @@ -25,13 +26,335 @@ #include "llvm/Support/raw_ostream.h" #include - +#include #include +#include +#include + +#define DEBUG_TYPE "interp-value" + +using namespace clang; + +static std::string DeclTypeToString(const QualType &QT, NamedDecl *D) { + std::string Str; + llvm::raw_string_ostream SS(Str); + if (QT.hasQualifiers()) + SS << QT.getQualifiers().getAsString() << " "; + SS << D->getQualifiedNameAsString(); + return Str; +} + +static std::string QualTypeToString(ASTContext &Ctx, QualType QT) { + PrintingPolicy Policy(Ctx.getPrintingPolicy()); + // Print the Allocator in STL containers, for instance. + Policy.SuppressDefaultTemplateArgs = false; + Policy.SuppressUnwrittenScope = true; + // Print 'a >' rather than 'a>'. 
+ Policy.SplitTemplateClosers = true; + + struct LocalPrintingPolicyRAII { + ASTContext &Context; + PrintingPolicy Policy; + + LocalPrintingPolicyRAII(ASTContext &Ctx, PrintingPolicy &PP) + : Context(Ctx), Policy(Ctx.getPrintingPolicy()) { + Context.setPrintingPolicy(PP); + } + ~LocalPrintingPolicyRAII() { Context.setPrintingPolicy(Policy); } + } X(Ctx, Policy); + + const QualType NonRefTy = QT.getNonReferenceType(); + + if (const auto *TTy = llvm::dyn_cast(NonRefTy)) + return DeclTypeToString(NonRefTy, TTy->getDecl()); + + if (const auto *TRy = dyn_cast(NonRefTy)) + return DeclTypeToString(NonRefTy, TRy->getDecl()); + + const QualType Canon = NonRefTy.getCanonicalType(); + + // FIXME: How a builtin type can be a function pointer type? + if (Canon->isBuiltinType() && !NonRefTy->isFunctionPointerType() && + !NonRefTy->isMemberPointerType()) + return Canon.getAsString(Ctx.getPrintingPolicy()); + + if (const auto *TDTy = dyn_cast(NonRefTy)) { + // FIXME: TemplateSpecializationType & SubstTemplateTypeParmType checks + // are predominately to get STL containers to print nicer and might be + // better handled in GetFullyQualifiedName. 
+ // + // std::vector::iterator is a TemplateSpecializationType + // std::vector::value_type is a SubstTemplateTypeParmType + // + QualType SSDesugar = TDTy->getLocallyUnqualifiedSingleStepDesugaredType(); + if (llvm::isa(SSDesugar)) + return GetFullTypeName(Ctx, Canon); + else if (llvm::isa(SSDesugar)) + return GetFullTypeName(Ctx, NonRefTy); + return DeclTypeToString(NonRefTy, TDTy->getDecl()); + } + return GetFullTypeName(Ctx, NonRefTy); +} + +static std::string EnumToString(const Value &V) { + std::string Str; + llvm::raw_string_ostream SS(Str); + ASTContext &Ctx = const_cast(V.getASTContext()); + + QualType DesugaredTy = V.getType().getDesugaredType(Ctx); + const EnumType *EnumTy = DesugaredTy.getNonReferenceType()->getAs(); + assert(EnumTy && "Fail to cast to enum type"); + + EnumDecl *ED = EnumTy->getDecl(); + uint64_t Data = V.getULongLong(); + bool IsFirst = true; + llvm::APSInt AP = Ctx.MakeIntValue(Data, DesugaredTy); + + for (auto I = ED->enumerator_begin(), E = ED->enumerator_end(); I != E; ++I) { + if (I->getInitVal() == AP) { + if (!IsFirst) + SS << " ? "; + SS << "(" + I->getQualifiedNameAsString() << ")"; + IsFirst = false; + } + } + llvm::SmallString<64> APStr; + AP.toString(APStr, /*Radix=*/10); + SS << " : " << QualTypeToString(Ctx, ED->getIntegerType()) << " " << APStr; + return Str; +} + +static std::string FunctionToString(const Value &V, const void *Ptr) { + std::string Str; + llvm::raw_string_ostream SS(Str); + SS << "Function @" << Ptr; + + const DeclContext *PTU = V.getASTContext().getTranslationUnitDecl(); + // Find the last top-level-stmt-decl. This is a forward iterator but the + // partial translation unit should not be large. 
+ const TopLevelStmtDecl *TLSD = nullptr; + for (const Decl *D : PTU->noload_decls()) + if (isa(D)) + TLSD = cast(D); + + // Get __clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void + // *OpaqueType, void *Val); + const FunctionDecl *FD = nullptr; + if (auto *InterfaceCall = llvm::dyn_cast(TLSD->getStmt())) { + const auto *Arg = InterfaceCall->getArg(/*Val*/ 3); + // Get rid of cast nodes. + while (const CastExpr *CastE = llvm::dyn_cast(Arg)) + Arg = CastE->getSubExpr(); + if (const DeclRefExpr *DeclRefExp = llvm::dyn_cast(Arg)) + FD = llvm::dyn_cast(DeclRefExp->getDecl()); + + if (FD) { + SS << '\n'; + const clang::FunctionDecl *FDef; + if (FD->hasBody(FDef)) + FDef->print(SS); + } + } + return Str; +} + +static std::string VoidPtrToString(const void *Ptr) { + std::string Str; + llvm::raw_string_ostream SS(Str); + SS << Ptr; + return Str; +} + +static std::string CharPtrToString(const char *Ptr) { + if (!Ptr) + return "0"; + + std::string Result = "\""; + Result += Ptr; + Result += '"'; + return Result; +} namespace clang { +struct ValueRef : public Value { + ValueRef(const Interpreter *In, void *Ty) : Value(In, Ty) { + // Tell the base class to not try to deallocate if it manages the value. + IsManuallyAlloc = false; + } +}; + +std::string Interpreter::ValueDataToString(const Value &V) const { + Sema &S = getCompilerInstance()->getSema(); + ASTContext &Ctx = S.getASTContext(); + + QualType QT = V.getType(); + + if (const ConstantArrayType *CAT = Ctx.getAsConstantArrayType(QT)) { + QualType ElemTy = CAT->getElementType(); + size_t ElemCount = Ctx.getConstantArrayElementCount(CAT); + const Type *BaseTy = CAT->getBaseElementTypeUnsafe(); + size_t ElemSize = Ctx.getTypeSizeInChars(BaseTy).getQuantity(); + + // Treat null terminated char arrays as strings basically. 
+ if (ElemTy->isCharType()) { + char last = *(char *)(((uintptr_t)V.getPtr()) + ElemCount * ElemSize - 1); + if (last == '\0') + return CharPtrToString((char *)V.getPtr()); + } + + std::string Result = "{ "; + for (unsigned Idx = 0, N = CAT->getZExtSize(); Idx < N; ++Idx) { + ValueRef InnerV = ValueRef(this, ElemTy.getAsOpaquePtr()); + if (ElemTy->isBuiltinType()) { + // Single dim arrays, advancing. + uintptr_t Offset = (uintptr_t)V.getPtr() + Idx * ElemSize; + InnerV.setRawBits((void *)Offset, ElemSize * 8); + } else { + // Multi dim arrays, position to the next dimension. + size_t Stride = ElemCount / N; + uintptr_t Offset = ((uintptr_t)V.getPtr()) + Idx * Stride * ElemSize; + InnerV.setPtr((void *)Offset); + } + + Result += ValueDataToString(InnerV); + + // Skip the \0 if the char types + if (Idx < N - 1) + Result += ", "; + } + Result += " }"; + return Result; + } + + QualType DesugaredTy = QT.getDesugaredType(Ctx); + QualType NonRefTy = DesugaredTy.getNonReferenceType(); + + // FIXME: Add support for user defined printers. + // LookupResult R = LookupUserDefined(S, QT); + // if (!R.empty()) + // return CallUserSpecifiedPrinter(R, V); + + // If it is a builtin type dispatch to the builtin overloads. 
+ if (auto *BT = DesugaredTy.getCanonicalType()->getAs()) { + + auto formatFloating = [](auto Val, char Suffix = '\0') -> std::string { + std::string Out; + llvm::raw_string_ostream SS(Out); + + if (std::isnan(Val) || std::isinf(Val)) { + SS << llvm::format("%g", Val); + return SS.str(); + } + if (Val == static_cast(static_cast(Val))) + SS << llvm::format("%.1f", Val); + else if (std::abs(Val) < 1e-4 || std::abs(Val) > 1e6 || Suffix == 'f') + SS << llvm::format("%#.6g", Val); + else if (Suffix == 'L') + SS << llvm::format("%#.12Lg", Val); + else + SS << llvm::format("%#.8g", Val); + + if (Suffix != '\0') + SS << Suffix; + return SS.str(); + }; + + std::string Str; + llvm::raw_string_ostream SS(Str); + switch (BT->getKind()) { + default: + return "{ error: unknown builtin type '" + std::to_string(BT->getKind()) + + " '}"; + case clang::BuiltinType::Bool: + SS << ((V.getBool()) ? "true" : "false"); + return Str; + case clang::BuiltinType::Char_S: + SS << '\'' << V.getChar_S() << '\''; + return Str; + case clang::BuiltinType::SChar: + SS << '\'' << V.getSChar() << '\''; + return Str; + case clang::BuiltinType::Char_U: + SS << '\'' << V.getChar_U() << '\''; + return Str; + case clang::BuiltinType::UChar: + SS << '\'' << V.getUChar() << '\''; + return Str; + case clang::BuiltinType::Short: + SS << V.getShort(); + return Str; + case clang::BuiltinType::UShort: + SS << V.getUShort(); + return Str; + case clang::BuiltinType::Int: + SS << V.getInt(); + return Str; + case clang::BuiltinType::UInt: + SS << V.getUInt(); + return Str; + case clang::BuiltinType::Long: + SS << V.getLong(); + return Str; + case clang::BuiltinType::ULong: + SS << V.getULong(); + return Str; + case clang::BuiltinType::LongLong: + SS << V.getLongLong(); + return Str; + case clang::BuiltinType::ULongLong: + SS << V.getULongLong(); + return Str; + case clang::BuiltinType::Float: + return formatFloating(V.getFloat(), /*suffix=*/'f'); + + case clang::BuiltinType::Double: + return 
formatFloating(V.getDouble()); + + case clang::BuiltinType::LongDouble: + return formatFloating(V.getLongDouble(), /*suffix=*/'L'); + } + } + + if ((NonRefTy->isPointerType() || NonRefTy->isMemberPointerType()) && + NonRefTy->getPointeeType()->isFunctionProtoType()) + return FunctionToString(V, V.getPtr()); + + if (NonRefTy->isFunctionType()) + return FunctionToString(V, &V); + + if (NonRefTy->isEnumeralType()) + return EnumToString(V); + + if (NonRefTy->isNullPtrType()) + return "nullptr\n"; + + // FIXME: Add support for custom printers in C. + if (NonRefTy->isPointerType()) { + if (NonRefTy->getPointeeType()->isCharType()) + return CharPtrToString((char *)V.getPtr()); + + return VoidPtrToString(V.getPtr()); + } + + // Fall back to printing just the address of the unknown object. + return "@" + VoidPtrToString(V.getPtr()); +} + +std::string Interpreter::ValueTypeToString(const Value &V) const { + ASTContext &Ctx = const_cast(V.getASTContext()); + QualType QT = V.getType(); + + std::string QTStr = QualTypeToString(Ctx, QT); + + if (QT->isReferenceType()) + QTStr += " &"; + + return QTStr; +} + llvm::Expected -Interpreter::CompileDtorCall(CXXRecordDecl *CXXRD) { +Interpreter::CompileDtorCall(CXXRecordDecl *CXXRD) const { assert(CXXRD && "Cannot compile a destructor for a nullptr"); if (auto Dtor = Dtors.find(CXXRD); Dtor != Dtors.end()) return Dtor->getSecond(); @@ -81,7 +404,7 @@ class InterfaceKindVisitor return InterfaceKind::CopyArray; } - InterfaceKind VisitFunctionProtoType(const FunctionProtoType *Ty) { + InterfaceKind VisitFunctionType(const FunctionType *Ty) { HandlePtrType(Ty); return InterfaceKind::NoAlloc; } @@ -141,9 +464,14 @@ class InterfaceKindVisitor } }; +static constexpr llvm::StringRef VPName[] = { + "__clang_Interpreter_SetValueNoAlloc", + "__clang_Interpreter_SetValueWithAlloc", + "__clang_Interpreter_SetValueCopyArr", "__ci_newtag"}; + // This synthesizes a call expression to a speciall // function that is responsible for generating the Value. 
-// In general, we transform: +// In general, we transform c++: // clang-repl> x // To: // // 1. If x is a built-in type like int, float. @@ -154,7 +482,7 @@ class InterfaceKindVisitor // // 3. If x is a struct, but a rvalue. // new (__clang_Interpreter_SetValueWithAlloc(ThisInterp, OpaqueValue, // xQualType)) (x); -llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { +llvm::Expected Interpreter::convertExprToValue(Expr *E) { Sema &S = getCompilerInstance()->getSema(); ASTContext &Ctx = S.getASTContext(); @@ -176,23 +504,21 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { Interface = S.BuildDeclarationNameExpr(CSS, R, /*ADL=*/false).get(); return llvm::Error::success(); }; - static constexpr llvm::StringRef Builtin[] = { - "__clang_Interpreter_SetValueNoAlloc", - "__clang_Interpreter_SetValueWithAlloc", - "__clang_Interpreter_SetValueCopyArr", "__ci_newtag"}; if (llvm::Error Err = - LookupInterface(ValuePrintingInfo[NoAlloc], Builtin[NoAlloc])) + LookupInterface(ValuePrintingInfo[NoAlloc], VPName[NoAlloc])) + return std::move(Err); + + if (llvm::Error Err = + LookupInterface(ValuePrintingInfo[CopyArray], VPName[CopyArray])) + return std::move(Err); + + if (llvm::Error Err = + LookupInterface(ValuePrintingInfo[WithAlloc], VPName[WithAlloc])) return std::move(Err); if (Ctx.getLangOpts().CPlusPlus) { if (llvm::Error Err = - LookupInterface(ValuePrintingInfo[WithAlloc], Builtin[WithAlloc])) - return std::move(Err); - if (llvm::Error Err = - LookupInterface(ValuePrintingInfo[CopyArray], Builtin[CopyArray])) - return std::move(Err); - if (llvm::Error Err = - LookupInterface(ValuePrintingInfo[NewTag], Builtin[NewTag])) + LookupInterface(ValuePrintingInfo[NewTag], VPName[NewTag])) return std::move(Err); } } @@ -211,7 +537,7 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { if (auto *EWC = llvm::dyn_cast_if_present(E)) E = EWC->getSubExpr(); - QualType Ty = E->getType(); + QualType Ty = E->IgnoreImpCasts()->getType(); QualType DesugaredTy = 
Ty.getDesugaredType(Ctx); // For lvalue struct, we treat it as a reference. @@ -239,7 +565,10 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { ExprResult AllocCall = S.ActOnCallExpr(Scope, ValuePrintingInfo[InterfaceKind::WithAlloc], E->getBeginLoc(), AdjustedArgs, E->getEndLoc()); - assert(!AllocCall.isInvalid() && "Can't create runtime interface call!"); + if (AllocCall.isInvalid()) + return llvm::make_error( + "Cannot call to " + VPName[WithAlloc], + llvm::inconvertibleErrorCode()); TypeSourceInfo *TSI = Ctx.getTrivialTypeSourceInfo(Ty, SourceLocation()); @@ -253,14 +582,23 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { // __clang_Interpreter_SetValueCopyArr. if (Kind == InterfaceKind::CopyArray) { - const auto *ConstantArrTy = - cast(DesugaredTy.getTypePtr()); - size_t ArrSize = Ctx.getConstantArrayElementCount(ConstantArrTy); + const auto *CATy = cast(DesugaredTy.getTypePtr()); + size_t ArrSize = Ctx.getConstantArrayElementCount(CATy); + + if (!Ctx.getLangOpts().CPlusPlus) + ArrSize *= Ctx.getTypeSizeInChars(CATy->getBaseElementTypeUnsafe()) + .getQuantity(); + Expr *ArrSizeExpr = IntegerLiteralExpr(Ctx, ArrSize); Expr *Args[] = {E, AllocCall.get(), ArrSizeExpr}; SetValueE = S.ActOnCallExpr(Scope, ValuePrintingInfo[InterfaceKind::CopyArray], SourceLocation(), Args, SourceLocation()); + if (SetValueE.isInvalid()) + return llvm::make_error( + "Cannot call to " + VPName[CopyArray], + llvm::inconvertibleErrorCode()); + break; } Expr *Args[] = {AllocCall.get(), ValuePrintingInfo[InterfaceKind::NewTag]}; ExprResult CXXNewCall = S.BuildCXXNew( @@ -270,8 +608,10 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { /*TypeIdParens=*/SourceRange(), TSI->getType(), TSI, std::nullopt, E->getSourceRange(), E); - assert(!CXXNewCall.isInvalid() && - "Can't create runtime placement new call!"); + if (CXXNewCall.isInvalid()) + return llvm::make_error( + "Cannot build a call to placement new", + llvm::inconvertibleErrorCode()); SetValueE = 
S.ActOnFinishFullExpr(CXXNewCall.get(), /*DiscardedValue=*/false); @@ -300,6 +640,7 @@ llvm::Expected Interpreter::ExtractValueFromExpr(Expr *E) { using namespace clang; // Temporary rvalue struct that need special care. +extern "C" { REPL_EXTERNAL_VISIBILITY void * __clang_Interpreter_SetValueWithAlloc(void *This, void *OutVal, void *OpaqueType) { @@ -308,8 +649,9 @@ __clang_Interpreter_SetValueWithAlloc(void *This, void *OutVal, return VRef.getPtr(); } -extern "C" void REPL_EXTERNAL_VISIBILITY __clang_Interpreter_SetValueNoAlloc( - void *This, void *OutVal, void *OpaqueType, ...) { +REPL_EXTERNAL_VISIBILITY void +__clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void *OpaqueType, + ...) { Value &VRef = *(Value *)OutVal; Interpreter *I = static_cast(This); VRef = Value(I, OpaqueType); @@ -384,6 +726,7 @@ extern "C" void REPL_EXTERNAL_VISIBILITY __clang_Interpreter_SetValueNoAlloc( } va_end(args); } +} // A trampoline to work around the fact that operator placement new cannot // really be forward declared due to libc++ and libstdc++ declaration mismatch. diff --git a/clang/lib/Interpreter/Value.cpp b/clang/lib/Interpreter/Value.cpp index afdf406b37253..be2ab5587a980 100644 --- a/clang/lib/Interpreter/Value.cpp +++ b/clang/lib/Interpreter/Value.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "clang/Interpreter/Value.h" +#include "InterpreterUtils.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Type.h" #include "clang/Interpreter/Interpreter.h" @@ -19,6 +20,8 @@ #include #include +using namespace clang; + namespace { // This is internal buffer maintained by Value, used to hold temporaries. 
@@ -117,8 +120,9 @@ static Value::Kind ConvertQualTypeToKind(const ASTContext &Ctx, QualType QT) { } } -Value::Value(Interpreter *In, void *Ty) : Interp(In), OpaqueType(Ty) { - setKind(ConvertQualTypeToKind(getASTContext(), getType())); +Value::Value(const Interpreter *In, void *Ty) : Interp(In), OpaqueType(Ty) { + const ASTContext &C = getASTContext(); + setKind(ConvertQualTypeToKind(C, getType())); if (ValueKind == K_PtrOrObj) { QualType Canon = getType().getCanonicalType(); if ((Canon->isPointerType() || Canon->isObjectType() || @@ -127,7 +131,7 @@ Value::Value(Interpreter *In, void *Ty) : Interp(In), OpaqueType(Ty) { Canon->isMemberPointerType())) { IsManuallyAlloc = true; // Compile dtor function. - Interpreter &Interp = getInterpreter(); + const Interpreter &Interp = getInterpreter(); void *DtorF = nullptr; size_t ElementsSize = 1; QualType DtorTy = getType(); @@ -228,14 +232,13 @@ void *Value::getPtr() const { return Data.m_Ptr; } -QualType Value::getType() const { - return QualType::getFromOpaquePtr(OpaqueType); +void Value::setRawBits(void *Ptr, unsigned NBits /*= sizeof(Storage)*/) { + assert(NBits <= sizeof(Storage) && "Greater than the total size"); + memcpy(/*dest=*/Data.m_RawBits, /*src=*/Ptr, /*nbytes=*/NBits / 8); } -Interpreter &Value::getInterpreter() { - assert(Interp != nullptr && - "Can't get interpreter from a default constructed value"); - return *Interp; +QualType Value::getType() const { + return QualType::getFromOpaquePtr(OpaqueType); } const Interpreter &Value::getInterpreter() const { @@ -244,8 +247,6 @@ const Interpreter &Value::getInterpreter() const { return *Interp; } -ASTContext &Value::getASTContext() { return getInterpreter().getASTContext(); } - const ASTContext &Value::getASTContext() const { return getInterpreter().getASTContext(); } @@ -253,14 +254,32 @@ const ASTContext &Value::getASTContext() const { void Value::dump() const { print(llvm::outs()); } void Value::printType(llvm::raw_ostream &Out) const { - Out << "Not 
implement yet.\n"; + Out << Interp->ValueTypeToString(*this); } + void Value::printData(llvm::raw_ostream &Out) const { - Out << "Not implement yet.\n"; + Out << Interp->ValueDataToString(*this); } +// FIXME: We do not support the multiple inheritance case where one of the base +// classes has a pretty-printer and the other does not. void Value::print(llvm::raw_ostream &Out) const { assert(OpaqueType != nullptr && "Can't print default Value"); - Out << "Not implement yet.\n"; + + // Don't even try to print a void or an invalid type, it doesn't make sense. + if (getType()->isVoidType() || !isValid()) + return; + + // We need to get all the results together then print it, since `printType` is + // much faster than `printData`. + std::string Str; + llvm::raw_string_ostream SS(Str); + + SS << "("; + printType(SS); + SS << ") "; + printData(SS); + SS << "\n"; + Out << Str; } } // namespace clang diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 31b84b6f2ede0..bf1978c22ee9f 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -541,7 +541,8 @@ StmtResult Parser::ParseExprStatement(ParsedStmtContext StmtCtx) { } Token *CurTok = nullptr; - // Note we shouldn't eat the token since the callback needs it. + // If the semicolon is missing at the end of REPL input, we want to print + // the result. Note we shouldn't eat the token since the callback needs it. if (Tok.is(tok::annot_repl_input_end)) CurTok = &Tok; else diff --git a/clang/test/Interpreter/pretty-print.c b/clang/test/Interpreter/pretty-print.c index d21749a649e1c..56488a164719b 100644 --- a/clang/test/Interpreter/pretty-print.c +++ b/clang/test/Interpreter/pretty-print.c @@ -3,9 +3,88 @@ // RUN: cat %s | clang-repl -Xcc -xc | FileCheck %s // RUN: cat %s | clang-repl -Xcc -std=c++11 | FileCheck %s -// Fails with `Symbols not found: [ __clang_Interpreter_SetValueNoAlloc ]`. 
// UNSUPPORTED: hwasan + +char c = 'a'; c +// CHECK: (char) 'a' + const char* c_str = "Hello, world!"; c_str +// CHECK-NEXT: (const char *) "Hello, world!" + +c_str = "Goodbye, world!"; c_str +// CHECK-NEXT: (const char *) "Goodbye, world!" + +const char* c_null_str = 0; c_null_str +// CHECK-NEXT: (const char *) 0 + +"Hello, world" +// CHECK-NEXT: ({{(const )?}}char[13]) "Hello, world" + +int x = 42; x +// CHECK-NEXT: (int) 42 + +&x +// CHECK-NEXT: (int *) 0x{{[0-9a-f]+}} + +x - 2 +// CHECK-NEXT: (int) 40 + +float f = 4.2f; f +// CHECK-NEXT: (float) 4.20000f + +double d = 4.21; d +// CHECK-NEXT: (double) 4.2100000 + +long double tau = 6.2831853; tau +// CHECK-NEXT: (long double) 6.28318530000L + +int foo() { return 42; } foo() +// CHECK-NEXT: (int) 42 + +void bar(int a, float b) {} bar +// CHECK-NEXT: (void (int, float)) Function @0x{{[0-9a-f]+}} +// CHECK-NEXT: void bar(int a, float b) { + +bar +// CHECK: (void (int, float)) Function @0x{{[0-9a-f]+}} +// CHECK-NEXT: void bar(int a, float b) { + +// Arrays. + +int arr[3] = {1,2,3}; arr +// CHECK: (int[3]) { 1, 2, 3 } + +double darr[3][4] = { {1,2,3,4}, {5,6,7,8}, {9,10,11,12} }; darr +// CHECK-NEXT: (double[3][4]) { { 1.0, 2.0, 3.0, 4.0 }, { 5.0, 6.0, 7.0, 8.0 }, { 9.0, 10.0, 11.0, 12.0 } } + +float farr[2][1] = { {0}, {3.14}}; farr +// CHECK-NEXT: (float[2][1]) { { 0.0f }, { 3.14000f } } + +0./0. +// CHECK-NEXT: (double) nan + +1.0f / 0.0f +// CHECK-NEXT: (float) inf + +0.00001f +// CHECK-NEXT: (float) 1.00000e-05f + +int * ptr = (int*)0x123; ptr +// CHECK-NEXT: (int *) 0x123 + +int * null_ptr = (int*)0; null_ptr +// CHECK-NEXT: (int *) 0x0 + +// TODO: _Bool, _Complex, _Atomic, and _BitInt +// union U { int I; float F; } u; u.I = 12; u.I +// TODO-CHECK-NEXT: (int) 12 +// struct S1{} s1; s1 +// TODO-CHECK-NEXT: (S1 &) @0x{{[0-9a-f]+}} + +// struct S2 {int d;} E = {22}; E +// TODO-CHECK-NEXT: (struct S2 &) @0x{{[0-9a-f]+}} +// E.d +// TODO-CHECK-NEXT: (int) 22 -// CHECK: Not implement yet. 
+%quit diff --git a/clang/test/Interpreter/pretty-print.cpp b/clang/test/Interpreter/pretty-print.cpp new file mode 100644 index 0000000000000..0882a3f9e462e --- /dev/null +++ b/clang/test/Interpreter/pretty-print.cpp @@ -0,0 +1,70 @@ +// RUN: clang-repl "int i = 10;" 'extern "C" int printf(const char*,...);' \ +// RUN: 'auto r1 = printf("i = %d\n", i);' | FileCheck --check-prefix=CHECK-DRIVER %s +// UNSUPPORTED: system-aix +// CHECK-DRIVER: i = 10 +// RUN: cat %s | clang-repl -Xcc -std=c++11 -Xcc -fno-delayed-template-parsing | FileCheck %s +extern "C" int printf(const char*,...); + +"ab" +// CHECK: (const char[3]) "ab" + +char ch[2] = {'1','a'}; ch +// CHECK-NEXT: (char[2]) { '1', 'a' } + +char chnull[3] = {'1','a', '\0'}; chnull +// CHECK-NEXT: (char[3]) "1a" + +char ch_arr[2][3][1] = {{{'a'}, {'b'}, {'c'}}, {{'d'}, {'e'}, {'f'}}}; ch_arr +// CHECK: (char[2][3][1]) { { { 'a' }, { 'b' }, { 'c' } }, { { 'd' }, { 'e' }, { 'f' } } } +struct S3 { int* p; S3() { p = new int(42); } ~S3() { delete p; } }; +S3{} +// CHECK-NEXT: (S3) @0x{{[0-9a-f]+}} +S3 s3; +s3 +// CHECK-NEXT: (S3 &) @0x{{[0-9a-f]+}} + +struct S4 { ~S4() { printf("~S4()\n"); }}; +S4{} +// CHECK-NEXT: (S4) @0x{{[0-9a-f]+}} +// TODO-CHECK-NEXT: ~S4() + +enum Enum{ e1 = -12, e2, e3=33, e4, e5 = 33}; +e2 +// CHECK-NEXT: (Enum) (e2) : int -11 +::e1 +// CHECK-NEXT: (Enum) (e1) : int -12 + +enum class Color { R = 0, G, B }; +Color::R +// CHECK-NEXT: (Color) (Color::R) : int 0 + + +// Lambdas. 
+ +auto Lambda1 = []{}; +Lambda1 +// CHECK-NEXT: ((lambda) &) @0x{{[0-9a-f]+}} +[]{} +// CHECK-NEXT: ((lambda at input_line_{{[0-9]+}}:1:1)) @0x{{[0-9a-f]+}} + +template struct F{ enum {RET=F::RET*n} ; }; +template<> struct F<0> { enum {RET = 1}; }; +F<7>::RET +// CHECK-NEXT: (F<7>::(unnamed enum at input_line_{{[0-9]+}}:1:27)) (F<7>::RET) : unsigned int 5040 + +struct S5 { int foo() { return 42; }}; +&S5::foo +// CHECK-NEXT: (int (S5::*)()) Function @0x{{[0-9a-f]+}} + +// int i = 12; +// int &iref = i; +// iref +// // TODO-CHECK-NEXT: (int &) 12 + +// int &&rref = 100; +// rref + +// // TODO-CHECK-NEXT: (int &&) 100 + +%quit + diff --git a/clang/unittests/Interpreter/InterpreterTest.cpp b/clang/unittests/Interpreter/InterpreterTest.cpp index b97f5ae17c9f0..8711f6660c97d 100644 --- a/clang/unittests/Interpreter/InterpreterTest.cpp +++ b/clang/unittests/Interpreter/InterpreterTest.cpp @@ -389,6 +389,26 @@ TEST_F(InterpreterTest, Value) { EXPECT_TRUE(V9.getType()->isMemberFunctionPointerType()); EXPECT_EQ(V9.getKind(), Value::K_PtrOrObj); EXPECT_TRUE(V9.isManuallyAlloc()); + + Value V10; + llvm::cantFail(Interp->ParseAndExecute( + "enum D : unsigned int {Zero = 0, One}; One", &V10)); + + std::string prettyType; + llvm::raw_string_ostream OSType(prettyType); + V10.printType(OSType); + EXPECT_STREQ(prettyType.c_str(), "D"); + + // FIXME: We should print only the value or the constant not the type. 
+ std::string prettyData; + llvm::raw_string_ostream OSData(prettyData); + V10.printData(OSData); + EXPECT_STREQ(prettyData.c_str(), "(One) : unsigned int 1"); + + std::string prettyPrint; + llvm::raw_string_ostream OSPrint(prettyPrint); + V10.print(OSPrint); + EXPECT_STREQ(prettyPrint.c_str(), "(D) (One) : unsigned int 1\n"); } TEST_F(InterpreterTest, TranslationUnit_CanonicalDecl) { From 7c402b8b81d2b69b55eb5bac39830fbc631f8cde Mon Sep 17 00:00:00 2001 From: YexuanXiao Date: Sat, 19 Jul 2025 14:44:14 +0800 Subject: [PATCH 433/813] Reland [Clang] Make the SizeType, SignedSizeType and PtrdiffType be named sugar types (#149613) The checks for the 'z' and 't' format specifiers added in the original PR #143653 had some issues and were overly strict, causing some build failures and were consequently reverted at https://github.com/llvm/llvm-project/commit/4c85bf2fe8042c855c9dd5be4b02191e9d071ffd. In the latest commit https://github.com/llvm/llvm-project/pull/149613/commits/27c58629ec76a703fde9c0b99b170573170b4a7a, I relaxed the checks for the 'z' and 't' format specifiers, so warnings are now only issued when they are used with mismatched types. The original intent of these checks was to diagnose code that assumes the underlying type of `size_t` is `unsigned` or `unsigned long`, for example: ```c printf("%zu", 1ul); // Not portable, but not an error when size_t is unsigned long ``` However, it produced a significant number of false positives. 
This was partly because Clang does not treat the `typedef` `size_t` and `__size_t` as having a common "sugar" type, and partly because a large amount of existing code either assumes `unsigned` (or `unsigned long`) is `size_t`, or they define the equivalent of size_t in their own way (such as sanitizer_internal_defs.h).https://github.com/llvm/llvm-project/blob/2e67dcfdcd023df2f06e0823eeea23990ce41534/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h#L203 --- .../clangd/unittests/FindTargetTests.cpp | 2 +- .../clangd/unittests/HoverTests.cpp | 4 +- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/AST/ASTContext.h | 13 +- clang/include/clang/AST/FormatString.h | 3 +- clang/include/clang/AST/RecursiveASTVisitor.h | 4 + clang/include/clang/AST/Type.h | 56 ++++ clang/include/clang/AST/TypeLoc.h | 10 + clang/include/clang/AST/TypeProperties.td | 9 + clang/include/clang/Basic/TypeNodes.td | 1 + .../clang/Serialization/TypeBitCodes.def | 1 + clang/lib/AST/ASTContext.cpp | 76 +++-- clang/lib/AST/ASTImporter.cpp | 5 + clang/lib/AST/ASTStructuralEquivalence.cpp | 7 + clang/lib/AST/FormatString.cpp | 108 +++++-- clang/lib/AST/ItaniumMangle.cpp | 4 + clang/lib/AST/PrintfFormatString.cpp | 11 +- clang/lib/AST/ScanfFormatString.cpp | 21 +- clang/lib/AST/Type.cpp | 12 + clang/lib/AST/TypePrinter.cpp | 10 + clang/lib/CodeGen/CGCall.cpp | 2 +- clang/lib/CodeGen/CGCoroutine.cpp | 8 +- clang/lib/CodeGen/CGDebugInfo.cpp | 3 +- clang/lib/CodeGen/CGObjCMac.cpp | 2 +- clang/lib/CodeGen/CodeGenFunction.cpp | 3 +- clang/lib/Sema/SemaChecking.cpp | 4 +- clang/lib/Sema/SemaExpr.cpp | 3 + clang/lib/Sema/SemaExprCXX.cpp | 10 +- clang/lib/Sema/TreeTransform.h | 6 + clang/lib/Serialization/ASTReader.cpp | 5 + clang/lib/Serialization/ASTWriter.cpp | 6 +- .../StaticAnalyzer/Checkers/MallocChecker.cpp | 25 +- .../Checkers/StdLibraryFunctionsChecker.cpp | 80 +++--- .../Checkers/VLASizeChecker.cpp | 2 +- ...d_resource_element_compatible_concept.hlsl | 2 +- 
clang/test/AST/ast-dump-array.cpp | 2 +- clang/test/AST/ast-dump-expr-json.c | 9 +- clang/test/AST/ast-dump-expr-json.cpp | 24 +- clang/test/AST/ast-dump-expr.c | 6 +- clang/test/AST/ast-dump-expr.cpp | 16 +- ...dump-openmp-distribute-parallel-for-simd.c | 20 +- .../ast-dump-openmp-distribute-parallel-for.c | 20 +- ...arget-teams-distribute-parallel-for-simd.c | 160 +++++------ ...nmp-target-teams-distribute-parallel-for.c | 160 +++++------ ...penmp-teams-distribute-parallel-for-simd.c | 160 +++++------ ...ump-openmp-teams-distribute-parallel-for.c | 160 +++++------ clang/test/AST/ast-dump-stmt-json.cpp | 71 +++-- clang/test/AST/ast-dump-stmt.cpp | 4 +- clang/test/AST/ast-dump-traits.cpp | 8 +- clang/test/AST/ast-dump-types-errors-json.cpp | 3 +- clang/test/Analysis/cfg.cpp | 2 +- clang/test/Analysis/explain-svals.cpp | 2 +- .../std-c-library-functions-arg-weakdeps.c | 2 +- .../Analysis/std-c-library-functions-lookup.c | 2 +- ...td-c-library-functions-vs-stream-checker.c | 4 +- clang/test/Analysis/std-c-library-functions.c | 4 +- clang/test/CXX/drs/cwg2xx.cpp | 2 +- clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp | 10 +- clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp | 6 +- clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp | 2 +- .../test/FixIt/fixit-format-ios-nopedantic.m | 2 +- clang/test/FixIt/format.m | 6 +- .../test/Sema/format-strings-fixit-ssize_t.c | 2 +- clang/test/Sema/format-strings-scanf.c | 4 +- clang/test/Sema/format-strings-size_t.c | 6 +- clang/test/Sema/matrix-type-builtins.c | 8 +- clang/test/Sema/ptrauth-atomic-ops.c | 2 +- clang/test/Sema/ptrauth.c | 2 +- .../SemaCXX/cxx2c-trivially-relocatable.cpp | 2 +- clang/test/SemaCXX/enum-scoped.cpp | 4 +- .../SemaCXX/microsoft-varargs-diagnostics.cpp | 6 +- clang/test/SemaCXX/new-delete.cpp | 2 +- clang/test/SemaCXX/static-assert-cxx26.cpp | 14 +- ...are-new-delete-basic-free-declarations.cpp | 2 +- .../unavailable_aligned_allocation.cpp | 24 +- clang/test/SemaHLSL/Language/AssignArray.hlsl | 4 +- 
clang/test/SemaHLSL/Language/InitListAST.hlsl | 264 +++++++++--------- clang/test/SemaObjC/matrix-type-builtins.m | 2 +- .../SemaOpenCL/cl20-device-side-enqueue.cl | 6 +- clang/test/SemaTemplate/type_pack_element.cpp | 12 +- clang/tools/libclang/CIndex.cpp | 4 + .../deque/spare_block_handling.pass.cpp | 8 +- .../TypeSystem/Clang/TypeSystemClang.cpp | 4 + 83 files changed, 1036 insertions(+), 732 deletions(-) diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp index 602f61d9ecb41..4d77f9d690ca0 100644 --- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp +++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp @@ -838,7 +838,7 @@ TEST_F(TargetDeclTest, OverloadExpr) { )cpp"; // Sized deallocation is enabled by default in C++14 onwards. EXPECT_DECLS("CXXDeleteExpr", - "void operator delete(void *, unsigned long) noexcept"); + "void operator delete(void *, __size_t) noexcept"); } TEST_F(TargetDeclTest, DependentExprs) { diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 775278ccf694b..4a21dafed5e95 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -2794,7 +2794,7 @@ TEST(Hover, All) { })cpp", [](HoverInfo &HI) { HI.Name = "expression"; - HI.Type = "unsigned long"; + HI.Type = {"__size_t", "unsigned long"}; HI.Value = "1"; }}, { @@ -2804,7 +2804,7 @@ TEST(Hover, All) { })cpp", [](HoverInfo &HI) { HI.Name = "expression"; - HI.Type = "unsigned long"; + HI.Type = {"__size_t", "unsigned long"}; HI.Value = "1"; }}, { diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index ea16029268dba..46a77673919d3 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -46,6 +46,7 @@ Potentially Breaking Changes ``endbr64`` instruction at the labels named as possible branch destinations, so it is not safe to use a 
register-controlled branch instruction to branch to one. (In line with gcc.) +- Added a sugar type `PredefinedSugarType` to improve diagnostic messages. (#GH143653) C/C++ Language Potentially Breaking Changes ------------------------------------------- diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 27360a15b3a5c..17cbfb2693308 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -277,6 +277,11 @@ class ASTContext : public RefCountedBase { mutable llvm::ContextualFoldingSet ArrayParameterTypes; + /// Store the unique Type corresponding to each Kind. + mutable std::array + PredefinedSugarTypes{}; + /// The set of nested name specifiers. /// /// This set is managed by the NestedNameSpecifier class. @@ -1569,6 +1574,8 @@ class ASTContext : public RefCountedBase { /// and bit count. QualType getDependentBitIntType(bool Unsigned, Expr *BitsExpr) const; + QualType getPredefinedSugarType(PredefinedSugarType::Kind KD) const; + /// Gets the struct used to keep track of the extended descriptor for /// pointer to blocks. QualType getBlockDescriptorExtendedType() const; @@ -2001,11 +2008,13 @@ class ASTContext : public RefCountedBase { /// . /// /// The sizeof operator requires this (C99 6.5.3.4p4). - CanQualType getSizeType() const; + QualType getSizeType() const; + + CanQualType getCanonicalSizeType() const; /// Return the unique signed counterpart of /// the integer type corresponding to size_t. - CanQualType getSignedSizeType() const; + QualType getSignedSizeType() const; /// Return the unique type for "intmax_t" (C99 7.18.1.5), defined in /// . 
diff --git a/clang/include/clang/AST/FormatString.h b/clang/include/clang/AST/FormatString.h index 3560766433fe2..a284f2c44d633 100644 --- a/clang/include/clang/AST/FormatString.h +++ b/clang/include/clang/AST/FormatString.h @@ -489,7 +489,8 @@ class FormatSpecifier { /// For a TypedefType QT, if it is a named integer type such as size_t, /// assign the appropriate value to LM and return true. - static bool namedTypeToLengthModifier(QualType QT, LengthModifier &LM); + static bool namedTypeToLengthModifier(ASTContext &Ctx, QualType QT, + LengthModifier &LM); }; } // end analyze_format_string namespace diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 519a811775c01..62991d986e675 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -1208,6 +1208,8 @@ DEF_TRAVERSE_TYPE(BitIntType, {}) DEF_TRAVERSE_TYPE(DependentBitIntType, { TRY_TO(TraverseStmt(T->getNumBitsExpr())); }) +DEF_TRAVERSE_TYPE(PredefinedSugarType, {}) + #undef DEF_TRAVERSE_TYPE // ----------------- TypeLoc traversal ----------------- @@ -1524,6 +1526,8 @@ DEF_TRAVERSE_TYPELOC(DependentBitIntType, { TRY_TO(TraverseStmt(TL.getTypePtr()->getNumBitsExpr())); }) +DEF_TRAVERSE_TYPELOC(PredefinedSugarType, {}) + #undef DEF_TRAVERSE_TYPELOC // ----------------- Decl traversal ----------------- diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 21b97102db95a..764e9d508a25a 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2258,6 +2258,30 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned NumExpansions; }; + enum class PredefinedSugarKind { + /// The "size_t" type. + SizeT, + + /// The signed integer type corresponding to "size_t". + SignedSizeT, + + /// The "ptrdiff_t" type. + PtrdiffT, + + // Indicates how many items the enum has. 
+ Last = PtrdiffT + }; + + class PresefinedSugarTypeBitfields { + friend class PredefinedSugarType; + + LLVM_PREFERRED_TYPE(TypeBitfields) + unsigned : NumTypeBits; + + LLVM_PREFERRED_TYPE(PredefinedSugarKind) + unsigned Kind : 8; + }; + class CountAttributedTypeBitfields { friend class CountAttributedType; @@ -2297,6 +2321,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { DependentTemplateSpecializationTypeBits; PackExpansionTypeBitfields PackExpansionTypeBits; CountAttributedTypeBitfields CountAttributedTypeBits; + PresefinedSugarTypeBitfields PredefinedSugarTypeBits; }; private: @@ -8038,6 +8063,37 @@ class DependentBitIntType final : public Type, public llvm::FoldingSetNode { } }; +class PredefinedSugarType final : public Type { +public: + friend class ASTContext; + using Kind = PredefinedSugarKind; + +private: + PredefinedSugarType(Kind KD, const IdentifierInfo *IdentName, + QualType CanonicalType) + : Type(PredefinedSugar, CanonicalType, TypeDependence::None), + Name(IdentName) { + PredefinedSugarTypeBits.Kind = llvm::to_underlying(KD); + } + + static StringRef getName(Kind KD); + + const IdentifierInfo *Name; + +public: + bool isSugared() const { return true; } + + QualType desugar() const { return getCanonicalTypeInternal(); } + + Kind getKind() const { return Kind(PredefinedSugarTypeBits.Kind); } + + const IdentifierInfo *getIdentifier() const { return Name; } + + static bool classof(const Type *T) { + return T->getTypeClass() == PredefinedSugar; + } +}; + /// A qualifier set is used to build a set of qualifiers. class QualifierCollector : public Qualifiers { public: diff --git a/clang/include/clang/AST/TypeLoc.h b/clang/include/clang/AST/TypeLoc.h index cf06e27758996..be0bc896de3ea 100644 --- a/clang/include/clang/AST/TypeLoc.h +++ b/clang/include/clang/AST/TypeLoc.h @@ -2783,6 +2783,16 @@ class ObjCProtocolLoc { } }; +struct PredefinedSugarTypeLocInfo {}; // Nothing. 
+ +class PredefinedSugarTypeLoc final + : public ConcreteTypeLoc { +public: + void initializeLocal(ASTContext &Context, SourceLocation loc) {} + SourceRange getLocalSourceRange() const { return {}; } +}; + } // namespace clang #endif // LLVM_CLANG_AST_TYPELOC_H diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index a6157649060b1..3114d1180319a 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -1028,3 +1028,12 @@ let Class = DependentBitIntType in { return ctx.getDependentBitIntType(isUnsigned, numBitsExpr); }]>; } + +let Class = PredefinedSugarType in { + def : Property<"kind", UInt32> { + let Read = [{ static_cast(node->getKind()) }]; + } + def : Creator<[{ + return ctx.getPredefinedSugarType(static_cast(kind)); + }]>; +} diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td index 567b8a5ca5a4d..971ce541d4831 100644 --- a/clang/include/clang/Basic/TypeNodes.td +++ b/clang/include/clang/Basic/TypeNodes.td @@ -117,3 +117,4 @@ def PipeType : TypeNode; def AtomicType : TypeNode; def BitIntType : TypeNode; def DependentBitIntType : TypeNode, AlwaysDependent; +def PredefinedSugarType : TypeNode, NeverCanonical; diff --git a/clang/include/clang/Serialization/TypeBitCodes.def b/clang/include/clang/Serialization/TypeBitCodes.def index b8cde2e370960..613eb6af2005a 100644 --- a/clang/include/clang/Serialization/TypeBitCodes.def +++ b/clang/include/clang/Serialization/TypeBitCodes.def @@ -69,5 +69,6 @@ TYPE_BIT_CODE(CountAttributed, COUNT_ATTRIBUTED, 57) TYPE_BIT_CODE(ArrayParameter, ARRAY_PARAMETER, 58) TYPE_BIT_CODE(HLSLAttributedResource, HLSLRESOURCE_ATTRIBUTED, 59) TYPE_BIT_CODE(HLSLInlineSpirv, HLSL_INLINE_SPIRV, 60) +TYPE_BIT_CODE(PredefinedSugar, PREDEFINED_SUGAR, 61) #undef TYPE_BIT_CODE diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 232a4b6557b92..6b6275faa215a 100644 --- 
a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2597,6 +2597,9 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { } break; + case Type::PredefinedSugar: + return getTypeInfo(cast(T)->desugar().getTypePtr()); + case Type::Pipe: Width = Target->getPointerWidth(LangAS::opencl_global); Align = Target->getPointerAlign(LangAS::opencl_global); @@ -5216,6 +5219,39 @@ QualType ASTContext::getDependentBitIntType(bool IsUnsigned, return QualType(New, 0); } +QualType +ASTContext::getPredefinedSugarType(PredefinedSugarType::Kind KD) const { + using Kind = PredefinedSugarType::Kind; + + if (auto *Target = PredefinedSugarTypes[llvm::to_underlying(KD)]; + Target != nullptr) + return QualType(Target, 0); + + auto getCanonicalType = [](const ASTContext &Ctx, Kind KDI) -> QualType { + switch (KDI) { + // size_t (C99TC3 6.5.3.4), signed size_t (C++23 5.13.2) and + // ptrdiff_t (C99TC3 6.5.6) Although these types are not built-in, they + // are part of the core language and are widely used. Using + // PredefinedSugarType makes these types as named sugar types rather than + // standard integer types, enabling better hints and diagnostics. 
+ case Kind::SizeT: + return Ctx.getFromTargetType(Ctx.Target->getSizeType()); + case Kind::SignedSizeT: + return Ctx.getFromTargetType(Ctx.Target->getSignedSizeType()); + case Kind::PtrdiffT: + return Ctx.getFromTargetType(Ctx.Target->getPtrDiffType(LangAS::Default)); + } + llvm_unreachable("unexpected kind"); + }; + + auto *New = new (*this, alignof(PredefinedSugarType)) + PredefinedSugarType(KD, &Idents.get(PredefinedSugarType::getName(KD)), + getCanonicalType(*this, static_cast(KD))); + Types.push_back(New); + PredefinedSugarTypes[llvm::to_underlying(KD)] = New; + return QualType(New, 0); +} + #ifndef NDEBUG static bool NeedsInjectedClassNameType(const RecordDecl *D) { if (!isa(D)) return false; @@ -6796,14 +6832,31 @@ QualType ASTContext::getTagDeclType(const TagDecl *Decl) const { /// getSizeType - Return the unique type for "size_t" (C99 7.17), the result /// of the sizeof operator (C99 6.5.3.4p4). The value is target dependent and /// needs to agree with the definition in . -CanQualType ASTContext::getSizeType() const { +QualType ASTContext::getSizeType() const { + return getPredefinedSugarType(PredefinedSugarType::Kind::SizeT); +} + +CanQualType ASTContext::getCanonicalSizeType() const { return getFromTargetType(Target->getSizeType()); } /// Return the unique signed counterpart of the integer type /// corresponding to size_t. -CanQualType ASTContext::getSignedSizeType() const { - return getFromTargetType(Target->getSignedSizeType()); +QualType ASTContext::getSignedSizeType() const { + return getPredefinedSugarType(PredefinedSugarType::Kind::SignedSizeT); +} + +/// getPointerDiffType - Return the unique type for "ptrdiff_t" (C99 7.17) +/// defined in . Pointer - pointer requires this (C99 6.5.6p9). +QualType ASTContext::getPointerDiffType() const { + return getPredefinedSugarType(PredefinedSugarType::Kind::PtrdiffT); +} + +/// Return the unique unsigned counterpart of "ptrdiff_t" +/// integer type. 
The standard (C11 7.21.6.1p7) refers to this type +/// in the definition of %tu format specifier. +QualType ASTContext::getUnsignedPointerDiffType() const { + return getFromTargetType(Target->getUnsignedPtrDiffType(LangAS::Default)); } /// getIntMaxType - Return the unique type for "intmax_t" (C99 7.18.1.5). @@ -6838,19 +6891,6 @@ QualType ASTContext::getUIntPtrType() const { return getCorrespondingUnsignedType(getIntPtrType()); } -/// getPointerDiffType - Return the unique type for "ptrdiff_t" (C99 7.17) -/// defined in . Pointer - pointer requires this (C99 6.5.6p9). -QualType ASTContext::getPointerDiffType() const { - return getFromTargetType(Target->getPtrDiffType(LangAS::Default)); -} - -/// Return the unique unsigned counterpart of "ptrdiff_t" -/// integer type. The standard (C11 7.21.6.1p7) refers to this type -/// in the definition of %tu format specifier. -QualType ASTContext::getUnsignedPointerDiffType() const { - return getFromTargetType(Target->getUnsignedPtrDiffType(LangAS::Default)); -} - /// Return the unique type for "pid_t" defined in /// . We need this to compute the correct type for vfork(). 
QualType ASTContext::getProcessIDType() const { @@ -14503,6 +14543,10 @@ static QualType getCommonSugarTypeNode(ASTContext &Ctx, const Type *X, DX->isCountInBytes(), DX->isOrNull(), CDX); } + case Type::PredefinedSugar: + assert(cast(X)->getKind() != + cast(Y)->getKind()); + return QualType(); } llvm_unreachable("Unhandled Type Class"); } diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index b5f6c5a8c6abe..b9bdabe0b8c06 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -2080,6 +2080,11 @@ ExpectedType clang::ASTNodeImporter::VisitDependentBitIntType( *ToNumBitsExprOrErr); } +ExpectedType clang::ASTNodeImporter::VisitPredefinedSugarType( + const clang::PredefinedSugarType *T) { + return Importer.getToContext().getPredefinedSugarType(T->getKind()); +} + ExpectedType clang::ASTNodeImporter::VisitDependentSizedMatrixType( const clang::DependentSizedMatrixType *T) { Error Err = Error::success(); diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 289c6d7737de7..0f2762d5c0f14 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -1477,6 +1477,13 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, return false; break; } + case Type::PredefinedSugar: { + const auto *TP1 = cast(T1); + const auto *TP2 = cast(T2); + if (TP1->getKind() != TP2->getKind()) + return false; + break; + } } // end switch return true; diff --git a/clang/lib/AST/FormatString.cpp b/clang/lib/AST/FormatString.cpp index 5d3b56fc4e713..112b756d2be1a 100644 --- a/clang/lib/AST/FormatString.cpp +++ b/clang/lib/AST/FormatString.cpp @@ -320,6 +320,70 @@ bool clang::analyze_format_string::ParseUTF8InvalidSpecifier( // Methods on ArgType. 
//===----------------------------------------------------------------------===// +static bool namedTypeToLengthModifierKind(ASTContext &Ctx, QualType QT, + LengthModifier::Kind &K) { + if (!Ctx.getLangOpts().C99 && !Ctx.getLangOpts().CPlusPlus) + return false; + for (/**/; const auto *TT = QT->getAs(); QT = TT->desugar()) { + const auto *TD = TT->getDecl(); + const auto *DC = TT->getDecl()->getDeclContext(); + if (DC->isTranslationUnit() || DC->isStdNamespace()) { + StringRef Name = TD->getIdentifier()->getName(); + if (Name == "size_t") { + K = LengthModifier::AsSizeT; + return true; + } else if (Name == "ssize_t" /*Not C99, but common in Unix.*/) { + K = LengthModifier::AsSizeT; + return true; + } else if (Name == "ptrdiff_t") { + K = LengthModifier::AsPtrDiff; + return true; + } else if (Name == "intmax_t") { + K = LengthModifier::AsIntMax; + return true; + } else if (Name == "uintmax_t") { + K = LengthModifier::AsIntMax; + return true; + } + } + } + if (const auto *PST = QT->getAs()) { + using Kind = PredefinedSugarType::Kind; + switch (PST->getKind()) { + case Kind::SizeT: + case Kind::SignedSizeT: + K = LengthModifier::AsSizeT; + return true; + case Kind::PtrdiffT: + K = LengthModifier::AsPtrDiff; + return true; + } + llvm_unreachable("unexpected kind"); + } + return false; +} + +// Check whether T and E are compatible size_t/ptrdiff_t types. E must be +// consistent with LE. +// T is the type of the actual expression in the code to be checked, and E is +// the expected type parsed from the format string. 
+static clang::analyze_format_string::ArgType::MatchKind +matchesSizeTPtrdiffT(ASTContext &C, QualType T, QualType E) { + using MatchKind = clang::analyze_format_string::ArgType::MatchKind; + + if (!T->isIntegerType()) + return MatchKind::NoMatch; + + if (C.hasSameType(T, E)) + return MatchKind::Match; + + if (C.getCorrespondingSignedType(T.getCanonicalType()) != + C.getCorrespondingSignedType(E.getCanonicalType())) + return MatchKind::NoMatch; + + return MatchKind::NoMatchSignedness; +} + clang::analyze_format_string::ArgType::MatchKind ArgType::matchesType(ASTContext &C, QualType argTy) const { // When using the format attribute in C++, you can receive a function or an @@ -394,6 +458,10 @@ ArgType::matchesType(ASTContext &C, QualType argTy) const { } case SpecificTy: { + if (TK != TypeKind::DontCare) { + return matchesSizeTPtrdiffT(C, argTy, T); + } + if (const EnumType *ETy = argTy->getAs()) { // If the enum is incomplete we know nothing about the underlying type. // Assume that it's 'int'. Do not use the underlying type for a scoped @@ -653,6 +721,12 @@ ArgType::matchesArgType(ASTContext &C, const ArgType &Other) const { if (Left.K == AK::SpecificTy) { if (Right.K == AK::SpecificTy) { + if (Left.TK != TypeKind::DontCare) { + return matchesSizeTPtrdiffT(C, Right.T, Left.T); + } else if (Right.TK != TypeKind::DontCare) { + return matchesSizeTPtrdiffT(C, Left.T, Right.T); + } + auto Canon1 = C.getCanonicalType(Left.T); auto Canon2 = C.getCanonicalType(Right.T); if (Canon1 == Canon2) @@ -706,7 +780,11 @@ QualType ArgType::getRepresentativeType(ASTContext &C) const { Res = C.CharTy; break; case SpecificTy: - Res = T; + if (TK == TypeKind::PtrdiffT || TK == TypeKind::SizeT) + // Using Name as name, so no need to show the uglified name. 
+ Res = T->getCanonicalTypeInternal(); + else + Res = T; break; case CStrTy: Res = C.getPointerType(C.CharTy); @@ -733,7 +811,6 @@ QualType ArgType::getRepresentativeType(ASTContext &C) const { std::string ArgType::getRepresentativeTypeName(ASTContext &C) const { std::string S = getRepresentativeType(C).getAsString(C.getPrintingPolicy()); - std::string Alias; if (Name) { // Use a specific name for this type, e.g. "size_t". @@ -1198,29 +1275,12 @@ FormatSpecifier::getCorrectedLengthModifier() const { return std::nullopt; } -bool FormatSpecifier::namedTypeToLengthModifier(QualType QT, +bool FormatSpecifier::namedTypeToLengthModifier(ASTContext &Ctx, QualType QT, LengthModifier &LM) { - for (/**/; const auto *TT = QT->getAs(); - QT = TT->getDecl()->getUnderlyingType()) { - const TypedefNameDecl *Typedef = TT->getDecl(); - const IdentifierInfo *Identifier = Typedef->getIdentifier(); - if (Identifier->getName() == "size_t") { - LM.setKind(LengthModifier::AsSizeT); - return true; - } else if (Identifier->getName() == "ssize_t") { - // Not C99, but common in Unix. 
- LM.setKind(LengthModifier::AsSizeT); - return true; - } else if (Identifier->getName() == "intmax_t") { - LM.setKind(LengthModifier::AsIntMax); - return true; - } else if (Identifier->getName() == "uintmax_t") { - LM.setKind(LengthModifier::AsIntMax); - return true; - } else if (Identifier->getName() == "ptrdiff_t") { - LM.setKind(LengthModifier::AsPtrDiff); - return true; - } + if (LengthModifier::Kind Out = LengthModifier::Kind::None; + namedTypeToLengthModifierKind(Ctx, QT, Out)) { + LM.setKind(Out); + return true; } return false; } diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 6d082b31a9caa..2a667934dba42 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -2514,6 +2514,10 @@ bool CXXNameMangler::mangleUnresolvedTypeOrSimpleId(QualType Ty, mangleSourceNameWithAbiTags(cast(Ty)->getDecl()); break; + case Type::PredefinedSugar: + mangleType(cast(Ty)->desugar()); + break; + case Type::UnresolvedUsing: mangleSourceNameWithAbiTags( cast(Ty)->getDecl()); diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp index 293164ddac8f8..bcd44f0a85eed 100644 --- a/clang/lib/AST/PrintfFormatString.cpp +++ b/clang/lib/AST/PrintfFormatString.cpp @@ -543,7 +543,8 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx, case LengthModifier::AsIntMax: return ArgType(Ctx.getIntMaxType(), "intmax_t"); case LengthModifier::AsSizeT: - return ArgType::makeSizeT(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t")); case LengthModifier::AsInt3264: return Ctx.getTargetInfo().getTriple().isArch64Bit() ? 
ArgType(Ctx.LongLongTy, "__int64") @@ -626,9 +627,11 @@ ArgType PrintfSpecifier::getScalarArgType(ASTContext &Ctx, case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::PtrTo(ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t"))); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); case LengthModifier::AsLongDouble: return ArgType(); // FIXME: Is this a known extension? case LengthModifier::AsAllocate: @@ -917,7 +920,7 @@ bool PrintfSpecifier::fixType(QualType QT, const LangOptions &LangOpt, // Handle size_t, ptrdiff_t, etc. that have dedicated length modifiers in C99. if (LangOpt.C99 || LangOpt.CPlusPlus11) - namedTypeToLengthModifier(QT, LM); + namedTypeToLengthModifier(Ctx, QT, LM); // If fixing the length modifier was enough, we might be done. if (hasValidLengthModifier(Ctx.getTargetInfo(), LangOpt)) { diff --git a/clang/lib/AST/ScanfFormatString.cpp b/clang/lib/AST/ScanfFormatString.cpp index 7ee21c8c61954..1227edd47d13d 100644 --- a/clang/lib/AST/ScanfFormatString.cpp +++ b/clang/lib/AST/ScanfFormatString.cpp @@ -251,9 +251,11 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::PtrTo(ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t"))); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); case LengthModifier::AsLongDouble: // GNU extension. 
return ArgType::PtrTo(Ctx.LongLongTy); @@ -292,10 +294,11 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getUIntMaxType(), "uintmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSizeType(), "size_t")); - case LengthModifier::AsPtrDiff: return ArgType::PtrTo( - ArgType(Ctx.getUnsignedPointerDiffType(), "unsigned ptrdiff_t")); + ArgType::makeSizeT(ArgType(Ctx.getSizeType(), "size_t"))); + case LengthModifier::AsPtrDiff: + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getUnsignedPointerDiffType(), "unsigned ptrdiff_t"))); case LengthModifier::AsLongDouble: // GNU extension. return ArgType::PtrTo(Ctx.UnsignedLongLongTy); @@ -390,9 +393,11 @@ ArgType ScanfSpecifier::getArgType(ASTContext &Ctx) const { case LengthModifier::AsIntMax: return ArgType::PtrTo(ArgType(Ctx.getIntMaxType(), "intmax_t")); case LengthModifier::AsSizeT: - return ArgType::PtrTo(ArgType(Ctx.getSignedSizeType(), "ssize_t")); + return ArgType::PtrTo(ArgType::makeSizeT( + ArgType(Ctx.getSignedSizeType(), "signed size_t"))); case LengthModifier::AsPtrDiff: - return ArgType::PtrTo(ArgType(Ctx.getPointerDiffType(), "ptrdiff_t")); + return ArgType::PtrTo(ArgType::makePtrdiffT( + ArgType(Ctx.getPointerDiffType(), "ptrdiff_t"))); case LengthModifier::AsLongDouble: return ArgType(); // FIXME: Is this a known extension? case LengthModifier::AsAllocate: @@ -501,7 +506,7 @@ bool ScanfSpecifier::fixType(QualType QT, QualType RawQT, // Handle size_t, ptrdiff_t, etc. that have dedicated length modifiers in C99. if (LangOpt.C99 || LangOpt.CPlusPlus11) - namedTypeToLengthModifier(PT, LM); + namedTypeToLengthModifier(Ctx, PT, LM); // If fixing the length modifier was enough, we are done. 
if (hasValidLengthModifier(Ctx.getTargetInfo(), LangOpt)) { diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index e5a1ab2ff8906..7444a2f90c5dd 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -5613,3 +5613,15 @@ HLSLAttributedResourceType::findHandleTypeOnResource(const Type *RT) { } return nullptr; } + +StringRef PredefinedSugarType::getName(Kind KD) { + switch (KD) { + case Kind::SizeT: + return "__size_t"; + case Kind::SignedSizeT: + return "__signed_size_t"; + case Kind::PtrdiffT: + return "__ptrdiff_t"; + } + llvm_unreachable("unexpected kind"); +} diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 818d2139628e3..deb453fe6ee75 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -248,6 +248,7 @@ bool TypePrinter::canPrefixQualifiers(const Type *T, case Type::BTFTagAttributed: case Type::HLSLAttributedResource: case Type::HLSLInlineSpirv: + case Type::PredefinedSugar: CanPrefixQualifiers = true; break; @@ -1417,6 +1418,15 @@ void TypePrinter::printDependentBitIntBefore(const DependentBitIntType *T, void TypePrinter::printDependentBitIntAfter(const DependentBitIntType *T, raw_ostream &OS) {} +void TypePrinter::printPredefinedSugarBefore(const PredefinedSugarType *T, + raw_ostream &OS) { + OS << T->getIdentifier()->getName(); + spaceBeforePlaceHolder(OS); +} + +void TypePrinter::printPredefinedSugarAfter(const PredefinedSugarType *T, + raw_ostream &OS) {} + /// Appends the given scope to the end of a string. 
void TypePrinter::AppendScope(DeclContext *DC, raw_ostream &OS, DeclarationName NameInScope) { diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 19d8ba26d44d8..0bceecec6e555 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -214,7 +214,7 @@ static void appendParameterTypes( for (unsigned I = 0, E = FPT->getNumParams(); I != E; ++I) { prefix.push_back(FPT->getParamType(I)); if (ExtInfos[I].hasPassObjectSize()) - prefix.push_back(CGT.getContext().getSizeType()); + prefix.push_back(CGT.getContext().getCanonicalSizeType()); } addExtParameterInfosForCall(paramInfos, FPT.getTypePtr(), PrefixSize, diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp index 117ef3d16e21b..5ee908922b5a3 100644 --- a/clang/lib/CodeGen/CGCoroutine.cpp +++ b/clang/lib/CodeGen/CGCoroutine.cpp @@ -1006,15 +1006,15 @@ RValue CodeGenFunction::EmitCoroutineIntrinsic(const CallExpr *E, } case llvm::Intrinsic::coro_size: { auto &Context = getContext(); - CanQualType SizeTy = Context.getSizeType(); - llvm::IntegerType *T = Builder.getIntNTy(Context.getTypeSize(SizeTy)); + llvm::IntegerType *T = + Builder.getIntNTy(Context.getTypeSize(Context.getSizeType())); llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::coro_size, T); return RValue::get(Builder.CreateCall(F)); } case llvm::Intrinsic::coro_align: { auto &Context = getContext(); - CanQualType SizeTy = Context.getSizeType(); - llvm::IntegerType *T = Builder.getIntNTy(Context.getTypeSize(SizeTy)); + llvm::IntegerType *T = + Builder.getIntNTy(Context.getTypeSize(Context.getSizeType())); llvm::Function *F = CGM.getIntrinsic(llvm::Intrinsic::coro_align, T); return RValue::get(Builder.CreateCall(F)); } diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index d6a5c4c476d5c..e24c68ed02865 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -4045,7 +4045,8 @@ llvm::DIType 
*CGDebugInfo::CreateTypeNode(QualType Ty, llvm::DIFile *Unit) { return CreateType(cast(Ty), Unit); case Type::HLSLInlineSpirv: return CreateType(cast(Ty), Unit); - + case Type::PredefinedSugar: + return getOrCreateType(cast(Ty)->desugar(), Unit); case Type::CountAttributed: case Type::Auto: case Type::Attributed: diff --git a/clang/lib/CodeGen/CGObjCMac.cpp b/clang/lib/CodeGen/CGObjCMac.cpp index 8e71a576552d3..8c66176942cb5 100644 --- a/clang/lib/CodeGen/CGObjCMac.cpp +++ b/clang/lib/CodeGen/CGObjCMac.cpp @@ -285,7 +285,7 @@ class ObjCCommonTypesHelper { SmallVector Params; Params.push_back(Ctx.VoidPtrTy); Params.push_back(Ctx.VoidPtrTy); - Params.push_back(Ctx.getSizeType()); + Params.push_back(Ctx.getCanonicalSizeType()); Params.push_back(Ctx.BoolTy); Params.push_back(Ctx.BoolTy); llvm::FunctionType *FTy = Types.GetFunctionType( diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 0fda31c8e5fa1..ab345a598c4e8 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -720,7 +720,7 @@ static bool matchesStlAllocatorFn(const Decl *D, const ASTContext &Ctx) { (MD->getNumParams() != 1 && MD->getNumParams() != 2)) return false; - if (MD->parameters()[0]->getType().getCanonicalType() != Ctx.getSizeType()) + if (!Ctx.hasSameType(MD->parameters()[0]->getType(), Ctx.getSizeType())) return false; if (MD->getNumParams() == 2) { @@ -2491,6 +2491,7 @@ void CodeGenFunction::EmitVariablyModifiedType(QualType type) { case Type::ObjCObjectPointer: case Type::BitInt: case Type::HLSLInlineSpirv: + case Type::PredefinedSugar: llvm_unreachable("type class is never variably-modified!"); case Type::Elaborated: diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index dd5b710d7e1d4..5e523fe887318 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5239,7 +5239,9 @@ bool Sema::BuiltinVAStartARMMicrosoft(CallExpr *Call) { << 2 << Arg1->getType() 
<< ConstCharPtrTy; const QualType SizeTy = Context.getSizeType(); - if (Arg2Ty->getCanonicalTypeInternal().withoutLocalFastQualifiers() != SizeTy) + if (!Context.hasSameType( + Arg2Ty->getCanonicalTypeInternal().withoutLocalFastQualifiers(), + SizeTy)) Diag(Arg2->getBeginLoc(), diag::err_typecheck_convert_incompatible) << Arg2->getType() << SizeTy << 1 /* different class */ << 0 /* qualifier difference */ diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 728ada33e2e63..45c7178c6965d 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -4564,6 +4564,9 @@ static void captureVariablyModifiedType(ASTContext &Context, QualType T, case Type::Atomic: T = cast(Ty)->getValueType(); break; + case Type::PredefinedSugar: + T = cast(Ty)->desugar(); + break; } } while (!T.isNull() && T->isVariablyModifiedType()); } diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index fd95f4ec54229..0edfd6015cbd9 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -3461,11 +3461,11 @@ void Sema::DeclareGlobalAllocationFunction(DeclarationName Name, // non-templated allocation function we are trying to declare here. if (FunctionDecl *Func = dyn_cast(*Alloc)) { if (Func->getNumParams() == Params.size()) { - llvm::SmallVector FuncParams; - for (auto *P : Func->parameters()) - FuncParams.push_back( - Context.getCanonicalType(P->getType().getUnqualifiedType())); - if (llvm::ArrayRef(FuncParams) == Params) { + if (std::equal(Func->param_begin(), Func->param_end(), Params.begin(), + Params.end(), [&](ParmVarDecl *D, QualType RT) { + return Context.hasSameUnqualifiedType(D->getType(), + RT); + })) { // Make the function visible to name lookup, even if we found it in // an unimported module. It either is an implicitly-declared global // allocation function, or is suppressing that function. 
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 286c2b486c0f9..c7428d1a02345 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7245,6 +7245,12 @@ QualType TreeTransform::TransformDependentBitIntType( return Result; } +template +QualType TreeTransform::TransformPredefinedSugarType( + TypeLocBuilder &TLB, PredefinedSugarTypeLoc TL) { + llvm_unreachable("This type does not need to be transformed."); +} + /// Simple iterator that traverses the template arguments in a /// container that provides a \c getArgLoc() member function. /// diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 3596d2240167e..10aedb68fcd9d 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -7574,11 +7574,16 @@ void TypeLocReader::VisitPipeTypeLoc(PipeTypeLoc TL) { void TypeLocReader::VisitBitIntTypeLoc(clang::BitIntTypeLoc TL) { TL.setNameLoc(readSourceLocation()); } + void TypeLocReader::VisitDependentBitIntTypeLoc( clang::DependentBitIntTypeLoc TL) { TL.setNameLoc(readSourceLocation()); } +void TypeLocReader::VisitPredefinedSugarTypeLoc(PredefinedSugarTypeLoc TL) { + // Nothing to do. 
+} + void ASTRecordReader::readTypeLoc(TypeLoc TL) { TypeLocReader TLR(*this); for (; !TL.isNull(); TL = TL.getNextTypeLoc()) diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index e868afeb1a145..a6957e54b66f1 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -692,7 +692,6 @@ void TypeLocWriter::VisitAtomicTypeLoc(AtomicTypeLoc TL) { void TypeLocWriter::VisitPipeTypeLoc(PipeTypeLoc TL) { addSourceLocation(TL.getKWLoc()); } - void TypeLocWriter::VisitBitIntTypeLoc(clang::BitIntTypeLoc TL) { addSourceLocation(TL.getNameLoc()); } @@ -701,6 +700,11 @@ void TypeLocWriter::VisitDependentBitIntTypeLoc( addSourceLocation(TL.getNameLoc()); } +void TypeLocWriter::VisitPredefinedSugarTypeLoc( + clang::PredefinedSugarTypeLoc TL) { + // Nothing to do. +} + void ASTWriter::WriteTypeAbbrevs() { using namespace llvm; diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index 30a04977d906d..68efdbaec341b 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -1281,7 +1281,7 @@ SVal MallocChecker::evalMulForBufferSize(CheckerContext &C, const Expr *Blocks, SVal BlockBytesVal = C.getSVal(BlockBytes); ProgramStateRef State = C.getState(); SVal TotalSize = SB.evalBinOp(State, BO_Mul, BlocksVal, BlockBytesVal, - SB.getContext().getSizeType()); + SB.getContext().getCanonicalSizeType()); return TotalSize; } @@ -1311,11 +1311,9 @@ static bool isStandardRealloc(const CallEvent &Call) { const FunctionDecl *FD = dyn_cast(Call.getDecl()); assert(FD); ASTContext &AC = FD->getASTContext(); - - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(1)->getType().getDesugaredType(AC) == - AC.getSizeType(); + return AC.hasSameType(FD->getDeclaredReturnType(), 
AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(0)->getType(), AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(1)->getType(), AC.getSizeType()); } static bool isGRealloc(const CallEvent &Call) { @@ -1323,10 +1321,9 @@ static bool isGRealloc(const CallEvent &Call) { assert(FD); ASTContext &AC = FD->getASTContext(); - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && - FD->getParamDecl(1)->getType().getDesugaredType(AC) == - AC.UnsignedLongTy; + return AC.hasSameType(FD->getDeclaredReturnType(), AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(0)->getType(), AC.VoidPtrTy) && + AC.hasSameType(FD->getParamDecl(1)->getType(), AC.UnsignedLongTy); } void MallocChecker::checkRealloc(ProgramStateRef State, const CallEvent &Call, @@ -2830,10 +2827,10 @@ MallocChecker::ReallocMemAux(CheckerContext &C, const CallEvent &Call, return nullptr; // Compare the size argument to 0. - DefinedOrUnknownSVal SizeZero = - svalBuilder.evalEQ(State, TotalSize.castAs(), - svalBuilder.makeIntValWithWidth( - svalBuilder.getContext().getSizeType(), 0)); + DefinedOrUnknownSVal SizeZero = svalBuilder.evalEQ( + State, TotalSize.castAs(), + svalBuilder.makeIntValWithWidth( + svalBuilder.getContext().getCanonicalSizeType(), 0)); ProgramStateRef StatePtrIsNull, StatePtrNotNull; std::tie(StatePtrIsNull, StatePtrNotNull) = State->assume(PtrEQ); diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 1c748f9bc1828..52b3d1e95942c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -1666,7 +1666,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( const QualType IntTy = ACtx.IntTy; const QualType UnsignedIntTy = ACtx.UnsignedIntTy; const QualType LongTy = ACtx.LongTy; - const QualType SizeTy = 
ACtx.getSizeType(); + const QualType SizeTyCanonTy = ACtx.getCanonicalSizeType(); const QualType VoidPtrTy = getPointerTy(VoidTy); // void * const QualType IntPtrTy = getPointerTy(IntTy); // int * @@ -1684,14 +1684,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( const QualType ConstWchar_tPtrTy = getPointerTy(getConstTy(WCharTy)); // const wchar_t * const QualType ConstVoidPtrRestrictTy = getRestrictTy(ConstVoidPtrTy); - const QualType SizePtrTy = getPointerTy(SizeTy); + const QualType SizePtrTy = getPointerTy(SizeTyCanonTy); const QualType SizePtrRestrictTy = getRestrictTy(SizePtrTy); const RangeInt IntMax = BVF.getMaxValue(IntTy)->getLimitedValue(); const RangeInt UnsignedIntMax = BVF.getMaxValue(UnsignedIntTy)->getLimitedValue(); const RangeInt LongMax = BVF.getMaxValue(LongTy)->getLimitedValue(); - const RangeInt SizeMax = BVF.getMaxValue(SizeTy)->getLimitedValue(); + const RangeInt SizeMax = BVF.getMaxValue(SizeTyCanonTy)->getLimitedValue(); // Set UCharRangeMax to min of int or uchar maximum value. 
// The C standard states that the arguments of functions like isalpha must @@ -2057,18 +2057,19 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t fread(void *restrict ptr, size_t size, size_t nitems, // FILE *restrict stream); - addToFunctionSummaryMap( - "fread", - Signature(ArgTypes{VoidPtrRestrictTy, SizeTy, SizeTy, FilePtrRestrictTy}, - RetType{SizeTy}), - FreadSummary); + addToFunctionSummaryMap("fread", + Signature(ArgTypes{VoidPtrRestrictTy, SizeTyCanonTy, + SizeTyCanonTy, FilePtrRestrictTy}, + RetType{SizeTyCanonTy}), + FreadSummary); // size_t fwrite(const void *restrict ptr, size_t size, size_t nitems, // FILE *restrict stream); - addToFunctionSummaryMap("fwrite", - Signature(ArgTypes{ConstVoidPtrRestrictTy, SizeTy, - SizeTy, FilePtrRestrictTy}, - RetType{SizeTy}), - FreadSummary); + addToFunctionSummaryMap( + "fwrite", + Signature(ArgTypes{ConstVoidPtrRestrictTy, SizeTyCanonTy, SizeTyCanonTy, + FilePtrRestrictTy}, + RetType{SizeTyCanonTy}), + FreadSummary); std::optional Ssize_tTy = lookupTy("ssize_t"); std::optional Ssize_tMax = getMaxValue(Ssize_tTy); @@ -2083,12 +2084,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // should handle them together with the rest of the POSIX functions. 
// ssize_t read(int fildes, void *buf, size_t nbyte); addToFunctionSummaryMap( - "read", Signature(ArgTypes{IntTy, VoidPtrTy, SizeTy}, RetType{Ssize_tTy}), + "read", + Signature(ArgTypes{IntTy, VoidPtrTy, SizeTyCanonTy}, RetType{Ssize_tTy}), ReadSummary); // ssize_t write(int fildes, const void *buf, size_t nbyte); addToFunctionSummaryMap( "write", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy}, RetType{Ssize_tTy}), + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy}, + RetType{Ssize_tTy}), ReadSummary); auto GetLineSummary = @@ -2618,7 +2621,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *strndup(const char *s, size_t n); addToFunctionSummaryMap( "strndup", - Signature(ArgTypes{ConstCharPtrTy, SizeTy}, RetType{CharPtrTy}), + Signature(ArgTypes{ConstCharPtrTy, SizeTyCanonTy}, RetType{CharPtrTy}), Summary(NoEvalCall) .ArgConstraint(NotNull(ArgNo(0))) .ArgConstraint( @@ -2649,7 +2652,8 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *getcwd(char *buf, size_t size); addToFunctionSummaryMap( - "getcwd", Signature(ArgTypes{CharPtrTy, SizeTy}, RetType{CharPtrTy}), + "getcwd", + Signature(ArgTypes{CharPtrTy, SizeTyCanonTy}, RetType{CharPtrTy}), Summary(NoEvalCall) .Case({NotNull(0), ArgumentCondition(1, WithinRange, Range(1, SizeMax)), @@ -2957,8 +2961,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // FIXME: Improve for errno modeling. addToFunctionSummaryMap( "mmap", - Signature(ArgTypes{VoidPtrTy, SizeTy, IntTy, IntTy, IntTy, Off_tTy}, - RetType{VoidPtrTy}), + Signature( + ArgTypes{VoidPtrTy, SizeTyCanonTy, IntTy, IntTy, IntTy, Off_tTy}, + RetType{VoidPtrTy}), Summary(NoEvalCall) .ArgConstraint(ArgumentCondition(1, WithinRange, Range(1, SizeMax))) .ArgConstraint( @@ -2970,8 +2975,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // FIXME: Improve for errno modeling. 
addToFunctionSummaryMap( "mmap64", - Signature(ArgTypes{VoidPtrTy, SizeTy, IntTy, IntTy, IntTy, Off64_tTy}, - RetType{VoidPtrTy}), + Signature( + ArgTypes{VoidPtrTy, SizeTyCanonTy, IntTy, IntTy, IntTy, Off64_tTy}, + RetType{VoidPtrTy}), Summary(NoEvalCall) .ArgConstraint(ArgumentCondition(1, WithinRange, Range(1, SizeMax))) .ArgConstraint( @@ -3002,8 +3008,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t bufsize); addToFunctionSummaryMap( "readlink", - Signature(ArgTypes{ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTy}, - RetType{Ssize_tTy}), + Signature( + ArgTypes{ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTyCanonTy}, + RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ArgumentCondition(2, WithinRange, Range(1, IntMax)), ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3025,9 +3032,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // char *restrict buf, size_t bufsize); addToFunctionSummaryMap( "readlinkat", - Signature( - ArgTypes{IntTy, ConstCharPtrRestrictTy, CharPtrRestrictTy, SizeTy}, - RetType{Ssize_tTy}), + Signature(ArgTypes{IntTy, ConstCharPtrRestrictTy, CharPtrRestrictTy, + SizeTyCanonTy}, + RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ArgumentCondition(3, WithinRange, Range(1, IntMax)), ReturnValueCondition(LessThanOrEq, ArgNo(3)), @@ -3268,14 +3275,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // size_t length, // int flags, struct sockaddr *restrict address, // socklen_t *restrict address_len); - Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTy, IntTy, + Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTyCanonTy, IntTy, StructSockaddrPtrRestrictTy, Socklen_tPtrRestrictTy}, RetType{Ssize_tTy}), Recvfrom)) addToFunctionSummaryMap( "recvfrom", - Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTy, IntTy, + Signature(ArgTypes{IntTy, VoidPtrRestrictTy, SizeTyCanonTy, IntTy, Irrelevant, Socklen_tPtrRestrictTy}, RetType{Ssize_tTy}), Recvfrom); @@ -3297,14 +3304,14 @@ void 
StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t sendto(int socket, const void *message, size_t length, // int flags, const struct sockaddr *dest_addr, // socklen_t dest_len); - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy, ConstStructSockaddrPtrTy, Socklen_tTy}, RetType{Ssize_tTy}), Sendto)) addToFunctionSummaryMap( "sendto", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy, Irrelevant, - Socklen_tTy}, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy, + Irrelevant, Socklen_tTy}, RetType{Ssize_tTy}), Sendto); @@ -3320,7 +3327,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t recv(int sockfd, void *buf, size_t len, int flags); addToFunctionSummaryMap( "recv", - Signature(ArgTypes{IntTy, VoidPtrTy, SizeTy, IntTy}, + Signature(ArgTypes{IntTy, VoidPtrTy, SizeTyCanonTy, IntTy}, RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3395,7 +3402,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // ssize_t send(int sockfd, const void *buf, size_t len, int flags); addToFunctionSummaryMap( "send", - Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTy, IntTy}, + Signature(ArgTypes{IntTy, ConstVoidPtrTy, SizeTyCanonTy, IntTy}, RetType{Ssize_tTy}), Summary(NoEvalCall) .Case({ReturnValueCondition(LessThanOrEq, ArgNo(2)), @@ -3683,7 +3690,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( // int pthread_attr_setguardsize(pthread_attr_t *attr, size_t guardsize); addToFunctionSummaryMap( {"pthread_attr_setstacksize", "pthread_attr_setguardsize"}, - Signature(ArgTypes{Pthread_attr_tPtrTy, SizeTy}, RetType{IntTy}), + Signature(ArgTypes{Pthread_attr_tPtrTy, SizeTyCanonTy}, RetType{IntTy}), Summary(NoEvalCall) .ArgConstraint(NotNull(ArgNo(0))) .ArgConstraint( @@ -3888,13 +3895,14 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( .ArgConstraint(NotNull(ArgNo(1)))); 
addToFunctionSummaryMap( "__buf_size_arg_constraint", - Signature(ArgTypes{ConstVoidPtrTy, SizeTy}, RetType{IntTy}), + Signature(ArgTypes{ConstVoidPtrTy, SizeTyCanonTy}, RetType{IntTy}), Summary(EvalCallAsPure) .ArgConstraint( BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1)))); addToFunctionSummaryMap( "__buf_size_arg_constraint_mul", - Signature(ArgTypes{ConstVoidPtrTy, SizeTy, SizeTy}, RetType{IntTy}), + Signature(ArgTypes{ConstVoidPtrTy, SizeTyCanonTy, SizeTyCanonTy}, + RetType{IntTy}), Summary(EvalCallAsPure) .ArgConstraint(BufferSize(/*Buffer=*/ArgNo(0), /*BufSize=*/ArgNo(1), /*BufSizeMultiplier=*/ArgNo(2)))); diff --git a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp index 1042b43680fd2..c97341f072aba 100644 --- a/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/VLASizeChecker.cpp @@ -92,7 +92,7 @@ ProgramStateRef VLASizeChecker::checkVLA(CheckerContext &C, ASTContext &Ctx = C.getASTContext(); SValBuilder &SVB = C.getSValBuilder(); - CanQualType SizeTy = Ctx.getSizeType(); + QualType SizeTy = Ctx.getSizeType(); uint64_t SizeMax = SVB.getBasicValueFactory().getMaxValue(SizeTy)->getZExtValue(); diff --git a/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl b/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl index a4f6e6c44794e..fa8d78f38494a 100644 --- a/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl +++ b/clang/test/AST/HLSL/is_structured_resource_element_compatible_concept.hlsl @@ -9,7 +9,7 @@ // CHECK: | `-TemplateTypeParm {{.*}} 'element_type' // CHECK: `-BinaryOperator {{.*}} 'bool' lvalue '>=' // CHECK: |-UnaryExprOrTypeTraitExpr {{.*}} 'bool' sizeof 'element_type' -// CHECK: `-IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK: `-IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 StructuredBuffer Buffer; diff --git a/clang/test/AST/ast-dump-array.cpp 
b/clang/test/AST/ast-dump-array.cpp index 15771f227df8a..5a982d34683ff 100644 --- a/clang/test/AST/ast-dump-array.cpp +++ b/clang/test/AST/ast-dump-array.cpp @@ -14,7 +14,7 @@ void testArrayInitExpr() auto l = [a]{ }; // CHECK: |-ArrayInitLoopExpr 0x{{[^ ]*}} 'int[10]' - // CHECK: | `-ArrayInitIndexExpr 0x{{[^ ]*}} <> 'unsigned long' + // CHECK: | `-ArrayInitIndexExpr 0x{{[^ ]*}} <> '__size_t':'unsigned long' } template diff --git a/clang/test/AST/ast-dump-expr-json.c b/clang/test/AST/ast-dump-expr-json.c index e910864eeed65..ecb6191c52200 100644 --- a/clang/test/AST/ast-dump-expr-json.c +++ b/clang/test/AST/ast-dump-expr-json.c @@ -3911,7 +3911,8 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3964,7 +3965,8 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3989,7 +3991,8 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "alignof", diff --git a/clang/test/AST/ast-dump-expr-json.cpp b/clang/test/AST/ast-dump-expr-json.cpp index 5a762acad7917..11026c9d302f0 100644 --- a/clang/test/AST/ast-dump-expr-json.cpp +++ b/clang/test/AST/ast-dump-expr-json.cpp @@ -1545,7 +1545,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: 
"type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "Ts" @@ -1587,7 +1588,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "long" +// CHECK-NEXT: "desugaredQualType": "long", +// CHECK-NEXT: "qualType": "__ptrdiff_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "opcode": "-", @@ -1726,7 +1728,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: }, @@ -1755,7 +1757,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: }, @@ -1785,7 +1787,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1860,7 +1862,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1880,7 +1882,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// 
CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -1937,7 +1940,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1957,7 +1960,8 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -2333,7 +2337,7 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ diff --git a/clang/test/AST/ast-dump-expr.c b/clang/test/AST/ast-dump-expr.c index 959d61ec9794b..e7aba39be8f68 100644 --- a/clang/test/AST/ast-dump-expr.c +++ b/clang/test/AST/ast-dump-expr.c @@ -222,15 +222,15 @@ void UnaryOperators(int a, int *b) { // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int' lvalue ParmVar 0x{{[^ ]*}} 'a' 'int' sizeof a; - // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' sizeof + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' sizeof // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int' lvalue ParmVar 0x{{[^ ]*}} 'a' 'int' sizeof(int); - // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' sizeof 'int' + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' sizeof 'int' _Alignof(int); // FIXME: Uses C++ spelling for alignof in C mode. 
- // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} 'unsigned long' alignof 'int' + // CHECK: UnaryExprOrTypeTraitExpr 0x{{[^ ]*}} '__size_t':'unsigned long' alignof 'int' } struct S { diff --git a/clang/test/AST/ast-dump-expr.cpp b/clang/test/AST/ast-dump-expr.cpp index 8ccb39f8f3165..6fd429d1500a4 100644 --- a/clang/test/AST/ast-dump-expr.cpp +++ b/clang/test/AST/ast-dump-expr.cpp @@ -115,34 +115,34 @@ void Casting(const S *s) { template void UnaryExpressions(int *p) { sizeof...(Ts); - // CHECK: SizeOfPackExpr 0x{{[^ ]*}} 'unsigned long' 0x{{[^ ]*}} Ts + // CHECK: SizeOfPackExpr 0x{{[^ ]*}} '__size_t':'unsigned long' 0x{{[^ ]*}} Ts noexcept(p - p); // CHECK: CXXNoexceptExpr 0x{{[^ ]*}} 'bool' - // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'long' '-' + // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} '__ptrdiff_t':'long' '-' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' ::new int; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' global Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' global Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' new (int); - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' new int{12}; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' Function 0x{{[^ ]*}} 'operator new' 'void *(__size_t)' // CHECK-NEXT: InitListExpr 0x{{[^ ]*}} 'int' // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 12 new int[2]; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ 
]*}} 'operator new[]' 'void *(__size_t)' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 2 new int[2]{1, 2}; - // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(unsigned long)' + // CHECK: CXXNewExpr 0x{{[^ ]*}} 'int *' array Function 0x{{[^ ]*}} 'operator new[]' 'void *(__size_t)' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 2 // CHECK-NEXT: InitListExpr 0x{{[^ ]*}} 'int[2]' @@ -164,7 +164,7 @@ void UnaryExpressions(int *p) { // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' ::delete p; - // CHECK: CXXDeleteExpr 0x{{[^ ]*}} 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *, unsigned long) noexcept' + // CHECK: CXXDeleteExpr 0x{{[^ ]*}} 'void' global Function 0x{{[^ ]*}} 'operator delete' 'void (void *, __size_t) noexcept' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'int *' diff --git a/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c index 10f27e759b5b1..672607fa90670 100644 --- a/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-distribute-parallel-for-simd.c @@ -57,8 +57,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -97,8 +97,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -144,8 +144,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -191,8 +191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -251,8 +251,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c index 419ba57191039..8eedf8ac8bc58 100644 --- a/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-distribute-parallel-for.c @@ -57,8 +57,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -97,8 +97,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -144,8 +144,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -191,8 +191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -251,8 +251,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c index c209a0456d7a0..64e19ce0a53bf 100644 --- a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for-simd.c @@ -65,8 +65,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -94,8 +94,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -123,8 +123,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -152,8 +152,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -189,8 +189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -218,8 +218,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -247,8 +247,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -276,8 +276,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -325,8 +325,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -371,8 +371,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -417,8 +417,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -463,8 +463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -517,8 +517,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -563,8 +563,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -609,8 +609,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -655,8 +655,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -711,8 +711,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -757,8 +757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -803,8 +803,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -849,8 +849,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -903,8 +903,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -949,8 +949,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -995,8 +995,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1041,8 +1041,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1097,8 +1097,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1143,8 +1143,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1189,8 +1189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1289,8 +1289,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1335,8 +1335,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1381,8 +1381,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1427,8 +1427,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1497,8 +1497,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1560,8 +1560,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1623,8 +1623,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1686,8 +1686,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1757,8 +1757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1820,8 +1820,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1883,8 +1883,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1946,8 +1946,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for-simd.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c index b13e096101e63..cf3f4bfcaf225 100644 --- a/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-target-teams-distribute-parallel-for.c @@ -65,8 +65,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -94,8 +94,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -123,8 +123,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -152,8 +152,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -189,8 +189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -218,8 +218,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -247,8 +247,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -276,8 +276,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:4:1) *const restrict' // CHECK-NEXT: | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 @@ -325,8 +325,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -371,8 +371,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -417,8 +417,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -463,8 +463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -517,8 +517,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -563,8 +563,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -609,8 +609,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -655,8 +655,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:10:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -711,8 +711,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -757,8 +757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -803,8 +803,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -849,8 +849,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -903,8 +903,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -949,8 +949,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -995,8 +995,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1041,8 +1041,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:17:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1097,8 +1097,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1143,8 +1143,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1189,8 +1189,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1289,8 +1289,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1335,8 +1335,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1381,8 +1381,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1427,8 +1427,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:24:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1497,8 +1497,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1560,8 +1560,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1623,8 +1623,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1686,8 +1686,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1757,8 +1757,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1820,8 +1820,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1883,8 +1883,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1946,8 +1946,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | `-NullStmt {{.*}} // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-target-teams-distribute-parallel-for.c:31:1) *const restrict' // CHECK-NEXT: | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c index 14356882b599a..c8da8cd1a5efa 100644 --- a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c +++ b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for-simd.c @@ -71,8 +71,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -99,8 +99,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -127,8 +127,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -155,8 +155,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -211,8 +211,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -239,8 +239,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -267,8 +267,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -295,8 +295,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:5:1) *const restrict' // CHECK-NEXT: | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -363,8 +363,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -407,8 +407,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -451,8 +451,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -495,8 +495,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -568,8 +568,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -612,8 +612,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -656,8 +656,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -700,8 +700,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:12:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -775,8 +775,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -819,8 +819,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -863,8 +863,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -907,8 +907,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -984,8 +984,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1028,8 +1028,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1072,8 +1072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1116,8 +1116,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:20:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1191,8 +1191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1279,8 +1279,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1323,8 +1323,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1419,8 +1419,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1463,8 +1463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1507,8 +1507,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1551,8 +1551,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:28:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1659,8 +1659,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1719,8 +1719,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1779,8 +1779,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1839,8 +1839,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1952,8 +1952,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2012,8 +2012,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2072,8 +2072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2132,8 +2132,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for-simd.c:36:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c index 0f983cfdff1dc..09b649cbb3660 100644 --- a/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c +++ b/clang/test/AST/ast-dump-openmp-teams-distribute-parallel-for.c @@ -71,8 +71,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -99,8 +99,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -127,8 +127,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -155,8 +155,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -211,8 +211,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -239,8 +239,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -267,8 +267,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -295,8 +295,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:5:1) *const restrict' // CHECK-NEXT: | | | `-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -363,8 +363,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -407,8 +407,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -451,8 +451,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -495,8 +495,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -568,8 +568,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -612,8 +612,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -656,8 +656,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -700,8 +700,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:12:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -775,8 +775,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -819,8 +819,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -863,8 +863,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -907,8 +907,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -984,8 +984,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1028,8 +1028,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1072,8 +1072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1116,8 +1116,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:20:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1191,8 +1191,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1235,8 +1235,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1279,8 +1279,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 
'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1323,8 +1323,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1419,8 +1419,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1463,8 +1463,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1507,8 +1507,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1551,8 +1551,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:28:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1659,8 +1659,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1719,8 +1719,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1779,8 +1779,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1839,8 +1839,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -1952,8 +1952,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 
'const int *const restrict' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2012,8 +2012,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 
'const __size_t':'const unsigned long' // CHECK-NEXT: | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2072,8 +2072,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | | | `-NullStmt {{.*}} // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const __size_t':'const unsigned long' +// CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | | | `-IntegerLiteral {{.*}} 'int' 0 @@ -2132,8 +2132,8 @@ void test_five(int x, int y, int z) { // CHECK-NEXT: | | | `-NullStmt {{.*}} // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .global_tid. 'const int *const restrict' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit .bound_tid. 'const int *const restrict' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 'const unsigned long' -// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.lb. 
'const __size_t':'const unsigned long' +// CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit used .previous.ub. 'const __size_t':'const unsigned long' // CHECK-NEXT: | | |-ImplicitParamDecl {{.*}} col:1 implicit __context 'struct (unnamed at {{.*}}ast-dump-openmp-teams-distribute-parallel-for.c:36:1) *const restrict' // CHECK-NEXT: | | |-VarDecl {{.*}} col:12 used i 'int' cinit // CHECK-NEXT: | | | `-IntegerLiteral {{.*}} 'int' 0 diff --git a/clang/test/AST/ast-dump-stmt-json.cpp b/clang/test/AST/ast-dump-stmt-json.cpp index a473d17da9424..a8f113ce6a3d4 100644 --- a/clang/test/AST/ast-dump-stmt-json.cpp +++ b/clang/test/AST/ast-dump-stmt-json.cpp @@ -963,7 +963,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -994,7 +994,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1126,7 +1126,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1146,7 +1146,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": 
"prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -1337,7 +1338,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } @@ -1369,7 +1370,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "FunctionDecl", // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -1444,7 +1445,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "mangledName": "_Znwm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1457,7 +1458,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1503,7 +1505,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new", // CHECK-NEXT: "mangledName": "_ZnwmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long, std::align_val_t)" +// CHECK-NEXT: "qualType": "void *(__size_t, std::align_val_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1516,7 +1518,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// 
CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1585,7 +1588,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "mangledName": "_Znam", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long)" +// CHECK-NEXT: "qualType": "void *(__size_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1598,7 +1601,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1644,7 +1648,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator new[]", // CHECK-NEXT: "mangledName": "_ZnamSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void *(unsigned long, std::align_val_t)" +// CHECK-NEXT: "qualType": "void *(__size_t, std::align_val_t)" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1657,7 +1661,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1821,7 +1826,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "mangledName": "_ZdlPvm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1847,7 +1852,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // 
CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -1874,7 +1880,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete", // CHECK-NEXT: "mangledName": "_ZdlPvmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t, std::align_val_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -1900,7 +1906,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -2036,7 +2043,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete[]", // CHECK-NEXT: "mangledName": "_ZdaPvm", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long) noexcept" +// CHECK-NEXT: "qualType": "void (void *, __size_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -2062,7 +2069,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -2089,7 +2097,7 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "name": "operator delete[]", // CHECK-NEXT: "mangledName": "_ZdaPvmSt11align_val_t", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "void (void *, unsigned long, std::align_val_t) noexcept" +// 
CHECK-NEXT: "qualType": "void (void *, __size_t, std::align_val_t) noexcept" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ // CHECK-NEXT: { @@ -2115,7 +2123,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: { @@ -3881,7 +3890,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -3955,7 +3965,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -4085,7 +4096,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", @@ -4159,7 +4171,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "castKind": "IntegralCast", @@ -4980,7 +4993,8 @@ void TestDependentGenericSelectionExpr(Ty 
T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "long" +// CHECK-NEXT: "desugaredQualType": "long", +// CHECK-NEXT: "qualType": "__ptrdiff_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "value": "10" @@ -6503,7 +6517,8 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "long" +// CHECK-NEXT: "desugaredQualType": "long" +// CHECK-NEXT: "qualType": "__ptrdiff_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "value": "10" diff --git a/clang/test/AST/ast-dump-stmt.cpp b/clang/test/AST/ast-dump-stmt.cpp index 407584e5b82de..42c5f3b3498a4 100644 --- a/clang/test/AST/ast-dump-stmt.cpp +++ b/clang/test/AST/ast-dump-stmt.cpp @@ -206,7 +206,7 @@ void TestIteration() { // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'int *' '+' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' - // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'long' 10 + // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} '__ptrdiff_t':'long' 10 // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' @@ -274,7 +274,7 @@ void TestIteration() { // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'int *' '+' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' - // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'long' 10 + // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} '__ptrdiff_t':'long' 10 // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' diff --git a/clang/test/AST/ast-dump-traits.cpp b/clang/test/AST/ast-dump-traits.cpp index 3085e5883fd2e..72d2a2ae8603e 100644 --- 
a/clang/test/AST/ast-dump-traits.cpp +++ b/clang/test/AST/ast-dump-traits.cpp @@ -56,7 +56,7 @@ void test_unary_expr_or_type_trait() { // CHECK-NEXT: |-FunctionDecl {{.*}} line:20:6{{( imported)?}} test_array_type_trait 'void ()' // CHECK-NEXT: | `-CompoundStmt {{.*}} // CHECK-NEXT: | `-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-ArrayTypeTraitExpr {{.*}} 'unsigned long' __array_rank +// CHECK-NEXT: | `-ArrayTypeTraitExpr {{.*}} '__size_t':'unsigned long' __array_rank // CHECK-NEXT: |-FunctionDecl {{.*}} line:25:6{{( imported)?}} test_expression_trait 'void ()' // CHECK-NEXT: | `-CompoundStmt {{.*}} // CHECK-NEXT: | `-CStyleCastExpr {{.*}} 'void' @@ -64,8 +64,8 @@ void test_unary_expr_or_type_trait() { // CHECK-NEXT: `-FunctionDecl {{.*}} line:30:6{{( imported)?}} test_unary_expr_or_type_trait 'void ()' // CHECK-NEXT: `-CompoundStmt {{.*}} // CHECK-NEXT: |-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' sizeof 'int' +// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' sizeof 'int' // CHECK-NEXT: |-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' alignof 'int' +// CHECK-NEXT: | `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' alignof 'int' // CHECK-NEXT: `-CStyleCastExpr {{.*}} 'void' -// CHECK-NEXT: `-UnaryExprOrTypeTraitExpr {{.*}} 'unsigned long' __alignof 'int' +// CHECK-NEXT: `-UnaryExprOrTypeTraitExpr {{.*}} '__size_t':'unsigned long' __alignof 'int' diff --git a/clang/test/AST/ast-dump-types-errors-json.cpp b/clang/test/AST/ast-dump-types-errors-json.cpp index e15f8eeee20cc..d9f918f6c3d72 100644 --- a/clang/test/AST/ast-dump-types-errors-json.cpp +++ b/clang/test/AST/ast-dump-types-errors-json.cpp @@ -60,7 +60,8 @@ using TestContainsErrors = int[sizeof(undef())]; // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "unsigned long" +// CHECK-NEXT: "desugaredQualType": "unsigned long", +// 
CHECK-NEXT: "qualType": "__size_t" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", // CHECK-NEXT: "name": "sizeof", diff --git a/clang/test/Analysis/cfg.cpp b/clang/test/Analysis/cfg.cpp index 44a89df28e3b2..d6cef88dc18a6 100644 --- a/clang/test/Analysis/cfg.cpp +++ b/clang/test/Analysis/cfg.cpp @@ -70,7 +70,7 @@ void F(EmptyE e) { // CHECK-NEXT: Succs (1): B1 // CHECK: [B1] // CHECK-NEXT: 1: __builtin_object_size -// CHECK-NEXT: 2: [B1.1] (ImplicitCastExpr, BuiltinFnToFnPtr, unsigned long (*)(const void *, int) noexcept) +// CHECK-NEXT: 2: [B1.1] (ImplicitCastExpr, BuiltinFnToFnPtr, __size_t (*)(const void *, int) noexcept) // CHECK-NEXT: 3: [B1.2](dummy(), 0) // CHECK-NEXT: 4: (void)[B1.3] (CStyleCastExpr, ToVoid, void) // CHECK-NEXT: Preds (1): B2 diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp index 267980c3b20c8..dfc650223c9e7 100644 --- a/clang/test/Analysis/explain-svals.cpp +++ b/clang/test/Analysis/explain-svals.cpp @@ -46,7 +46,7 @@ void test_1(int param, void *ptr) { void test_2(char *ptr, int ext) { clang_analyzer_explain((void *) "asdf"); // expected-warning-re{{{{^pointer to element of type 'char' with index 0 of string literal "asdf"$}}}} - clang_analyzer_explain(strlen(ptr)); // expected-warning-re{{{{^metadata of type 'unsigned long' tied to pointee of argument 'ptr'$}}}} + clang_analyzer_explain(strlen(ptr)); // expected-warning-re{{{{^metadata of type '__size_t' tied to pointee of argument 'ptr'$}}}} clang_analyzer_explain(conjure()); // expected-warning-re{{{{^symbol of type 'int' conjured at CFG element 'conjure\(\)'$}}}} clang_analyzer_explain(glob); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure\(\)'\) for global variable 'glob'$}}}} clang_analyzer_explain(glob_ptr); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure\(\)'\) for global variable 'glob_ptr'$}}}} diff --git 
a/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c b/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c index 1f0d3627fae34..ba5bc57928b0c 100644 --- a/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c +++ b/clang/test/Analysis/std-c-library-functions-arg-weakdeps.c @@ -20,7 +20,7 @@ // RUN: -triple x86_64-unknown-linux 2>&1 | FileCheck %s // CHECK: Loaded summary for: int isalnum(int) -// CHECK: Loaded summary for: unsigned long fread(void *restrict, size_t, size_t, FILE *restrict) __attribute__((nonnull(1))) +// CHECK: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) __attribute__((nonnull(1))) // CHECK: Loaded summary for: int fileno(FILE *stream) void initializeSummaryMap(void); diff --git a/clang/test/Analysis/std-c-library-functions-lookup.c b/clang/test/Analysis/std-c-library-functions-lookup.c index e47d9bddda91b..8182e5a1f5fde 100644 --- a/clang/test/Analysis/std-c-library-functions-lookup.c +++ b/clang/test/Analysis/std-c-library-functions-lookup.c @@ -6,7 +6,7 @@ // RUN: -analyzer-config eagerly-assume=false \ // RUN: -triple i686-unknown-linux 2>&1 | FileCheck %s -// CHECK: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) typedef typeof(sizeof(int)) size_t; typedef struct FILE FILE; diff --git a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c index b99cc30149c91..887817ba8551e 100644 --- a/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c +++ b/clang/test/Analysis/std-c-library-functions-vs-stream-checker.c @@ -31,8 +31,8 @@ // Verify that the summaries are loaded when the StdLibraryFunctionsChecker is // enabled. 
// CHECK: Loaded summary for: int getchar(void) -// CHECK-NEXT: Loaded summary for: unsigned long fread(void *restrict, size_t, size_t, FILE *restrict) -// CHECK-NEXT: Loaded summary for: unsigned long fwrite(const void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict) #include "Inputs/system-header-simulator.h" diff --git a/clang/test/Analysis/std-c-library-functions.c b/clang/test/Analysis/std-c-library-functions.c index b03a1a5656517..b5f663493a676 100644 --- a/clang/test/Analysis/std-c-library-functions.c +++ b/clang/test/Analysis/std-c-library-functions.c @@ -59,8 +59,8 @@ // CHECK-NEXT: Loaded summary for: int tolower(int) // CHECK-NEXT: Loaded summary for: int toascii(int) // CHECK-NEXT: Loaded summary for: int getchar(void) -// CHECK-NEXT: Loaded summary for: unsigned int fread(void *restrict, size_t, size_t, FILE *restrict) -// CHECK-NEXT: Loaded summary for: unsigned int fwrite(const void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fread(void *restrict, size_t, size_t, FILE *restrict) +// CHECK-NEXT: Loaded summary for: __size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict) // CHECK-NEXT: Loaded summary for: ssize_t read(int, void *, size_t) // CHECK-NEXT: Loaded summary for: ssize_t write(int, const void *, size_t) // CHECK-NEXT: Loaded summary for: ssize_t getline(char **restrict, size_t *restrict, FILE *restrict) diff --git a/clang/test/CXX/drs/cwg2xx.cpp b/clang/test/CXX/drs/cwg2xx.cpp index a53a8d1ed64a8..556407afa2641 100644 --- a/clang/test/CXX/drs/cwg2xx.cpp +++ b/clang/test/CXX/drs/cwg2xx.cpp @@ -1429,7 +1429,7 @@ namespace cwg299 { // cwg299: 2.8 c++11 // cxx98-11-error@#cwg299-q {{ambiguous conversion of array size expression of type 'T' to an integral or enumeration type}} // 
cxx98-11-note@#cwg299-int {{conversion to integral type 'int' declared here}} // cxx98-11-note@#cwg299-ushort {{conversion to integral type 'unsigned short' declared here}} - // since-cxx14-error-re@#cwg299-q {{{{conversion from 'T' to 'unsigned (long long|long|int)' is ambiguous}}}} + // since-cxx14-error-re@#cwg299-q {{conversion from 'T' to '__size_t' (aka 'unsigned {{long long|long|int}}') is ambiguous}} // since-cxx14-note@#cwg299-int {{candidate function}} // since-cxx14-note@#cwg299-ushort {{candidate function}} } // namespace cwg299 diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp index 6942b68690c5d..d439f304b5101 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p2.cpp @@ -5,11 +5,11 @@ typedef decltype(sizeof(int)) size_t; // FIXME: These diagnostics should say 'size_t' instead of 'unsigned long' int a = 123_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'unsigned long long' or 'const char *', and no matching literal operator template}} int b = 4.2_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'long double' or 'const char *', and no matching literal operator template}} -int c = "foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and 'unsigned}} -int d = L"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const wchar_t *' and 'unsigned}} -int e = u8"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and 'unsigned}} -int f = u"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char16_t *' and 'unsigned}} -int g = U"foo"_x; // expected-error {{no matching literal operator for 
call to 'operator""_x' with arguments of types 'const char32_t *' and 'unsigned}} +int c = "foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and '__size_t' (aka 'unsigned}} +int d = L"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const wchar_t *' and '__size_t' (aka 'unsigned}} +int e = u8"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char *' and '__size_t' (aka 'unsigned}} +int f = u"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char16_t *' and '__size_t' (aka 'unsigned}} +int g = U"foo"_x; // expected-error {{no matching literal operator for call to 'operator""_x' with arguments of types 'const char32_t *' and '__size_t' (aka 'unsigned}} int h = 'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'char'}} int i = L'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'wchar_t'}} int j = u'y'_x; // expected-error {{no matching literal operator for call to 'operator""_x' with argument of type 'char16_t'}} diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp index afadba282e626..463d7854867a2 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p5.cpp @@ -13,7 +13,7 @@ float &operator ""_x1 (const char8_t *, size_t); using char8 = double; #endif char8 &i2 = u8"foo"_x1; -double &i3 = L"foo"_x1; // expected-error {{no matching literal operator for call to 'operator""_x1' with arguments of types 'const wchar_t *' and 'unsigned long'}} +double &i3 = L"foo"_x1; // expected-error {{no matching literal operator for call to 'operator""_x1' with arguments of types 'const wchar_t *' and '__size_t' 
(aka 'unsigned long')}} char &operator ""_x1(const wchar_t *, size_t); char &i4 = L"foo"_x1; // ok @@ -46,8 +46,8 @@ template float &operator""_s(); void no_fallback() { "hello"_s; // FIXME: It'd be useful to explain what candidates were found and why they didn't work. - "xyzzy"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and 'unsigned long', and no matching literal operator template}} - "yello"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and 'unsigned long', and no matching literal operator template}} + "xyzzy"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long'), and no matching literal operator template}} + "yello"_s; // expected-error {{no matching literal operator for call to 'operator""_s' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long'), and no matching literal operator template}} } double &operator""_s(const char*, size_t); diff --git a/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp b/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp index d571fcb8697eb..17d9c83055a1c 100644 --- a/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp +++ b/clang/test/CXX/lex/lex.literal/lex.ext/p7.cpp @@ -17,7 +17,7 @@ int main() { auto v1 = 1.2_w; // calls operator""_w(1.2L) auto v2 = u"one"_w; // calls operator""_w(u"one", 3) auto v3 = 12_w; // calls operator""_w("12") - "two"_w; // expected-error {{no matching literal operator for call to 'operator""_w' with arguments of types 'const char *' and 'unsigned long'}} + "two"_w; // expected-error {{no matching literal operator for call to 'operator""_w' with arguments of types 'const char *' and '__size_t' (aka 'unsigned long')}} same_type test1; same_type test2; diff --git a/clang/test/FixIt/fixit-format-ios-nopedantic.m 
b/clang/test/FixIt/fixit-format-ios-nopedantic.m index db9ac797c2472..836a4b5372f13 100644 --- a/clang/test/FixIt/fixit-format-ios-nopedantic.m +++ b/clang/test/FixIt/fixit-format-ios-nopedantic.m @@ -1,5 +1,5 @@ // RUN: cp %s %t -// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -Wformat -Werror -fixit %t +// RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -Wformat -fixit %t int printf(const char *restrict, ...); typedef unsigned int NSUInteger; diff --git a/clang/test/FixIt/format.m b/clang/test/FixIt/format.m index 950765bad9339..e97ae10c974aa 100644 --- a/clang/test/FixIt/format.m +++ b/clang/test/FixIt/format.m @@ -237,14 +237,14 @@ void testSizeTypes(void) { printf("%zu", 0.f); // expected-warning-re{{format specifies type 'size_t' (aka '{{.+}}') but the argument has type 'float'}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%f" - printf("%zd", 0.f); // expected-warning-re{{format specifies type 'ssize_t' (aka '{{.+}}') but the argument has type 'float'}} + printf("%zd", 0.f); // expected-warning-re{{format specifies type 'signed size_t' (aka '{{.+}}') but the argument has type 'float'}} // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:11-[[@LINE-1]]:14}:"%f" short x; #if !defined(__ANDROID__) && !defined(__Fuchsia__) - printf("%zn", &x); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'short *'}} + printf("%zn", &x); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'short *'}} #else - printf("%zn", &x); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'short *'}} + printf("%zn", &x); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'short *'}} // expected-warning@-1 {{'%n' specifier not supported on this platform}} #endif // !defined(__ANDROID__) && !defined(__Fuchsia__) // PrintfSpecifier::fixType doesn't handle %n, so a fix-it is not emitted, 
diff --git a/clang/test/Sema/format-strings-fixit-ssize_t.c b/clang/test/Sema/format-strings-fixit-ssize_t.c index 2c83db0b66362..96806517b80f2 100644 --- a/clang/test/Sema/format-strings-fixit-ssize_t.c +++ b/clang/test/Sema/format-strings-fixit-ssize_t.c @@ -11,8 +11,8 @@ int printf(char const *, ...); int scanf(const char *, ...); +typedef long ssize_t; void test(void) { - typedef signed long int ssize_t; printf("%f", (ssize_t) 42); ssize_t s; scanf("%f", &s); diff --git a/clang/test/Sema/format-strings-scanf.c b/clang/test/Sema/format-strings-scanf.c index eb5b8ec36bf7a..d1f694f3595cf 100644 --- a/clang/test/Sema/format-strings-scanf.c +++ b/clang/test/Sema/format-strings-scanf.c @@ -210,13 +210,13 @@ void test_size_types(void) { scanf("%zd", &s); // No warning. double d2 = 0.; - scanf("%zd", &d2); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'double *'}} + scanf("%zd", &d2); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'double *'}} ssize_t sn = 0; scanf("%zn", &sn); // No warning. 
double d3 = 0.; - scanf("%zn", &d3); // expected-warning-re{{format specifies type 'ssize_t *' (aka '{{.+}}') but the argument has type 'double *'}} + scanf("%zn", &d3); // expected-warning-re{{format specifies type 'signed size_t *' (aka '{{.+}}') but the argument has type 'double *'}} } void test_ptrdiff_t_types(void) { diff --git a/clang/test/Sema/format-strings-size_t.c b/clang/test/Sema/format-strings-size_t.c index 5058a762183d3..19e3ac9e6ecd9 100644 --- a/clang/test/Sema/format-strings-size_t.c +++ b/clang/test/Sema/format-strings-size_t.c @@ -2,10 +2,14 @@ int printf(char const *, ...); +#include + void test(void) { // size_t + printf("%zu", (size_t)0); // no-warning + printf("%zu", sizeof(int)); // no-warning + printf("%zu", (size_t)0 + sizeof(int)); // no-warning printf("%zu", (double)42); // expected-warning {{format specifies type 'size_t' (aka 'unsigned long') but the argument has type 'double'}} - // intmax_t / uintmax_t printf("%jd", (double)42); // expected-warning {{format specifies type 'intmax_t' (aka 'long') but the argument has type 'double'}} printf("%ju", (double)42); // expected-warning {{format specifies type 'uintmax_t' (aka 'unsigned long') but the argument has type 'double'}} diff --git a/clang/test/Sema/matrix-type-builtins.c b/clang/test/Sema/matrix-type-builtins.c index b92f3ce6a3e8c..77e3b8a4287ed 100644 --- a/clang/test/Sema/matrix-type-builtins.c +++ b/clang/test/Sema/matrix-type-builtins.c @@ -73,13 +73,13 @@ void column_major_load(float *p1, int *p2, _Bool *p3, struct Foo *p4) { 10, // expected-error {{1st argument must be a pointer to a valid matrix element type}} 1ull << 21, // expected-error {{row dimension is outside the allowed range [1, 1048575]}} 1ull << 21, // expected-error {{column dimension is outside the allowed range [1, 1048575]}} - ""); // expected-error {{incompatible pointer to integer conversion casting 'char[1]' to type 'unsigned long'}} + ""); // expected-error {{incompatible pointer to integer conversion 
casting 'char[1]' to type '__size_t' (aka 'unsigned long')}} sx5x10_t a13 = __builtin_matrix_column_major_load( 10, // expected-error {{1st argument must be a pointer to a valid matrix element type}} - *p4, // expected-error {{casting 'struct Foo' to incompatible type 'unsigned long'}} + *p4, // expected-error {{casting 'struct Foo' to incompatible type '__size_t' (aka 'unsigned long')}} "", // expected-error {{column argument must be a constant unsigned integer expression}} - // expected-error@-1 {{incompatible pointer to integer conversion casting 'char[1]' to type 'unsigned long'}} + // expected-error@-1 {{incompatible pointer to integer conversion casting 'char[1]' to type '__size_t' (aka 'unsigned long')}} 10); } @@ -96,7 +96,7 @@ void column_major_store(sx5x10_t *m1, ix3x2_t *m2, float *p1, int *p2, struct Fo __builtin_matrix_column_major_store( "", // expected-error {{1st argument must be a matrix}} 10, // expected-error {{2nd argument must be a pointer to a valid matrix element type}} - *p3); // expected-error {{casting 'struct Foo' to incompatible type 'unsigned long'}} + *p3); // expected-error {{casting 'struct Foo' to incompatible type '__size_t' (aka 'unsigned long')}} __builtin_matrix_column_major_store( *m1, diff --git a/clang/test/Sema/ptrauth-atomic-ops.c b/clang/test/Sema/ptrauth-atomic-ops.c index ccb9a1abcc14d..8872090d83b8d 100644 --- a/clang/test/Sema/ptrauth-atomic-ops.c +++ b/clang/test/Sema/ptrauth-atomic-ops.c @@ -54,7 +54,7 @@ void f() { __c11_atomic_exchange(ATOMIZE(j), ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'typeof (j)' (aka 'int')}} __c11_atomic_fetch_add(ATOMIZE(non_addr_discriminatedauthenticated_ptr), ATOMIZE(j), memory_order_seq_cst); - // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile _Atomic(typeof (j)) *' to parameter of 
type 'long'}} + // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile _Atomic(typeof (j)) *' to parameter of type '__ptrdiff_t'}} __c11_atomic_fetch_and(ATOMIZE(j), ATOMIZE(non_addr_discriminatedauthenticated_ptr), memory_order_seq_cst); // expected-error@-1 {{incompatible pointer to integer conversion passing 'volatile __ptrauth(2,0,200) _Atomic(int *) *' to parameter of type 'typeof (j)' (aka 'int')}} diff --git a/clang/test/Sema/ptrauth.c b/clang/test/Sema/ptrauth.c index e3932615c2962..b4e5214a7cb50 100644 --- a/clang/test/Sema/ptrauth.c +++ b/clang/test/Sema/ptrauth.c @@ -57,7 +57,7 @@ void test_string_discriminator(const char *str) { __builtin_ptrauth_string_discriminator(str); // expected-error {{argument must be a string literal}} __builtin_ptrauth_string_discriminator(L"wide test"); // expected-error {{argument must be a string literal}} expected-warning {{incompatible pointer types passing 'int[10]' to parameter of type 'const char *'}} - void *mismatch = __builtin_ptrauth_string_discriminator("test string"); // expected-error {{incompatible integer to pointer conversion initializing 'void *' with an expression of type 'unsigned long'}} + void *mismatch = __builtin_ptrauth_string_discriminator("test string"); // expected-error {{incompatible integer to pointer conversion initializing 'void *' with an expression of type '__size_t'}} } diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp index 6f4003f525930..c6919447798da 100644 --- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp +++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp @@ -372,7 +372,7 @@ void test__builtin_trivially_relocate() { __builtin_trivially_relocate((S*)0, 0, 0); //expected-error {{argument to '__builtin_trivially_relocate' must be relocatable}} __builtin_trivially_relocate((int*)0, 0, 0); //expected-error {{first and second arguments to '__builtin_trivially_relocate' must be of the 
same type}} - __builtin_trivially_relocate((int*)0, (int*)0, (int*)0); // expected-error-re {{cannot initialize a value of type '{{.*}}' with an rvalue of type 'int *'}} + __builtin_trivially_relocate((int*)0, (int*)0, (int*)0); // expected-error-re {{cannot initialize a value of type '__size_t' (aka '{{.*}}') with an rvalue of type 'int *'}} __builtin_trivially_relocate((int*)0, (int*)0, 0); __builtin_trivially_relocate((R*)0, (R*)0, 0); } diff --git a/clang/test/SemaCXX/enum-scoped.cpp b/clang/test/SemaCXX/enum-scoped.cpp index 0ce47274979d9..2d7b3c9557ebd 100644 --- a/clang/test/SemaCXX/enum-scoped.cpp +++ b/clang/test/SemaCXX/enum-scoped.cpp @@ -35,7 +35,7 @@ int a1[Val2]; int a2[E1::Val1]; #if __cplusplus >= 201703L -// expected-error@-3 {{type 'E1' is not implicitly convertible to 'unsigned long'}} +// expected-error@-3 {{type 'E1' is not implicitly convertible to '__size_t' (aka 'unsigned long')}} #else // expected-error@-5 {{size of array has non-integer type}} #endif @@ -44,7 +44,7 @@ int* p1 = new int[Val2]; int* p2 = new int[E1::Val1]; #if __cplusplus >= 201703L -// expected-error@-3 {{converting 'E1' to incompatible type 'unsigned long'}} +// expected-error@-3 {{converting 'E1' to incompatible type '__size_t'}} #else // expected-error@-5 {{array size expression must have integral or unscoped enumeration type, not 'E1'}} #endif diff --git a/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp b/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp index 0b76fdd92dabd..91c4ffda9d818 100644 --- a/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp +++ b/clang/test/SemaCXX/microsoft-varargs-diagnostics.cpp @@ -22,7 +22,7 @@ void test_non_last_argument(int i, int j, ...) 
{ va_list ap; __va_start(&ap, &i, 4); // expected-error@-1{{passing 'int *' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int *' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} + // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} } void test_stack_allocated(int i, ...) { @@ -30,13 +30,13 @@ void test_stack_allocated(int i, ...) { int j; __va_start(&ap, &j, 4); // expected-error@-1{{passing 'int *' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int *' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} + // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} } void test_non_pointer_addressof(int i, ...) 
{ va_list ap; __va_start(&ap, 1, 4); // expected-error@-1{{passing 'int' to parameter of incompatible type 'const char *': type mismatch at 2nd parameter ('int' vs 'const char *')}} - // expected-error@-2{{passing 'int' to parameter of incompatible type 'unsigned int': type mismatch at 3rd parameter ('int' vs 'unsigned int')}} + // expected-error@-2{{passing 'int' to parameter of incompatible type '__size_t' (aka 'unsigned int'): type mismatch at 3rd parameter ('int' vs '__size_t' (aka 'unsigned int'))}} } diff --git a/clang/test/SemaCXX/new-delete.cpp b/clang/test/SemaCXX/new-delete.cpp index f918501554f80..c05130bb30729 100644 --- a/clang/test/SemaCXX/new-delete.cpp +++ b/clang/test/SemaCXX/new-delete.cpp @@ -109,7 +109,7 @@ void bad_news(int *ip) #elif __cplusplus <= 201103L // expected-error@-4 {{array size expression must have integral or unscoped enumeration type, not 'double'}} #else - // expected-warning@-6 {{implicit conversion from 'double' to 'unsigned int' changes value from 1.1 to 1}} + // expected-warning@-6 {{implicit conversion from 'double' to '__size_t' (aka 'unsigned int') changes value from 1.1 to 1}} #endif (void)new int[1][i]; // expected-note {{read of non-const variable 'i' is not allowed in a constant expression}} diff --git a/clang/test/SemaCXX/static-assert-cxx26.cpp b/clang/test/SemaCXX/static-assert-cxx26.cpp index b53c67ee67932..b2ebd2abb785e 100644 --- a/clang/test/SemaCXX/static-assert-cxx26.cpp +++ b/clang/test/SemaCXX/static-assert-cxx26.cpp @@ -19,7 +19,7 @@ struct InvalidSize { const char* data() const; }; static_assert(true, InvalidSize{}); // expected-error {{the message in a static assertion must have a 'size()' member function returning an object convertible to 'std::size_t'}} \ - // expected-error {{value of type 'const char *' is not implicitly convertible to 'unsigned long'}} + // expected-error {{value of type 'const char *' is not implicitly convertible to '__size_t' (aka 'unsigned long')}} struct InvalidData { unsigned 
long size() const; unsigned long data() const; @@ -371,13 +371,13 @@ struct E { static_assert(true, A{}); // expected-error {{the message in this static assertion is not a constant expression}} // expected-note@-1 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, B{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} +static_assert(true, B{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, C{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} +static_assert(true, C{}); // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} -static_assert(true, D{}); // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}} +static_assert(true, D{}); // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in this static assertion is not a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} static_assert(true, E{}); // expected-error {{the message in this static assertion is not a constant expression}} @@ -391,21 +391,21 @@ static_assert( static_assert( false, // 
expected-error {{static assertion failed}} - B{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} + B{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); static_assert( false, // expected-error {{static assertion failed}} - C{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type 'unsigned long'}} + C{} // expected-error {{call to 'size()' evaluates to -1, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); static_assert( false, // expected-error {{static assertion failed}} - D{} // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type 'unsigned long'}} + D{} // expected-error {{call to 'size()' evaluates to 340282366920938463463374607431768211455, which cannot be narrowed to type '__size_t' (aka 'unsigned long')}} // expected-error@-1 {{the message in a static assertion must be produced by a constant expression}} // expected-note@-2 {{read of dereferenced one-past-the-end pointer is not allowed in a constant expression}} ); diff --git a/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp b/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp index 87dc58861ee81..281ef5fa63d6f 100644 --- a/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp +++ b/clang/test/SemaCXX/type-aware-new-delete-basic-free-declarations.cpp @@ -75,7 +75,7 @@ template void *operator 
new(std::type_identity, U); template void operator delete(std::type_identity, U, size_t, std::align_val_t); // expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 2nd parameter; use 'void *' instead}} template void operator delete(std::type_identity, void *, U, std::align_val_t); -// expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 3rd parameter; use 'unsigned long' instead}} +// expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 3rd parameter; use '__size_t' (aka 'unsigned long') instead}} template void operator delete(std::type_identity, void *, size_t, U); // expected-error@-1 {{type aware 'operator delete' cannot take a dependent type as its 4th parameter; use 'std::align_val_t' instead}} template void *operator new(std::type_identity, typename S::size_ty, std::align_val_t); diff --git a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp index 45fdec606ad1b..56c564f170271 100644 --- a/clang/test/SemaCXX/unavailable_aligned_allocation.cpp +++ b/clang/test/SemaCXX/unavailable_aligned_allocation.cpp @@ -65,12 +65,12 @@ void testOveraligned() { #ifdef NO_ERRORS // expected-no-diagnostics #else -// expected-error-re@-16 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-16 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-17 {{if you supply your own aligned allocation functions}} // expected-error-re@-18 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-19 {{if you supply your own aligned allocation functions}} -// expected-error-re@-20 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// 
expected-error-re@-20 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-21 {{if you supply your own aligned allocation functions}} // expected-error-re@-22 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-23 {{if you supply your own aligned allocation functions}} @@ -83,12 +83,12 @@ void testOveraligned() { // expected-error-re@-28 {{aligned deallocation function of type 'void (void *, std::align_val_t, const std::nothrow_t &) noexcept' is {{only|not}} available on}} // expected-note@-29 {{if you supply your own aligned allocation functions}} -// expected-error-re@-29 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-29 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-30 {{if you supply your own aligned allocation functions}} // expected-error-re@-31 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-32 {{if you supply your own aligned allocation functions}} -// expected-error-re@-33 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-33 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-34 {{if you supply your own aligned allocation functions}} // expected-error-re@-35 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-36 {{if you supply your own aligned allocation functions}} @@ -111,19 +111,19 @@ void testOveralignedCheckOS() { // expected-no-diagnostics #else #if defined(IOS) -// expected-error@-7 
{{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on iOS 11 or newer}} +// expected-error@-7 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on iOS 11 or newer}} // expected-error@-8 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on iOS 11 or newer}}} #elif defined(TVOS) -// expected-error@-10 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on tvOS 11 or newer}}} +// expected-error@-10 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on tvOS 11 or newer}}} // expected-error@-11 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on tvOS 11 or newer}}} #elif defined(WATCHOS) -// expected-error@-13 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on watchOS 4 or newer}}} +// expected-error@-13 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on watchOS 4 or newer}}} // expected-error@-14 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on watchOS 4 or newer}}} #elif defined(MACOS) -// expected-error@-16 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is only available on macOS 10.13 or newer}}} +// expected-error@-16 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is only available on macOS 10.13 or newer}}} // expected-error@-17 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is only available on macOS 10.13 or newer}}} #elif defined(ZOS) -// expected-error@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is not available on z/OS}}} +// 
expected-error@-19 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is not available on z/OS}}} // expected-error@-20 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is not available on z/OS}}} #endif @@ -181,19 +181,19 @@ void testExplicitOperatorNewDeleteOveraligned() { #ifdef NO_ERRORS // expected-no-diagnostics #else -// expected-error-re@-11 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-11 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-12 {{if you supply your own aligned allocation functions}} // expected-error-re@-13 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-14 {{if you supply your own aligned allocation functions}} -// expected-error-re@-15 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-15 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-16 {{if you supply your own aligned allocation functions}} // expected-error-re@-17 {{aligned deallocation function of type 'void (void *, enum std::align_val_t) noexcept' is {{only|not}} available on}} // expected-note@-18 {{if you supply your own aligned allocation functions}} -// expected-error-re@-19 {{aligned allocation function of type 'void *(unsigned long, enum std::align_val_t)' is {{only|not}} available on}} +// expected-error-re@-19 {{aligned allocation function of type 'void *(__size_t, enum std::align_val_t)' is {{only|not}} available on}} // expected-note@-20 {{if you supply your own aligned allocation functions}} // expected-error-re@-21 {{aligned deallocation function of type 'void (void *, enum 
std::align_val_t) noexcept' is {{only|not}} available on}} diff --git a/clang/test/SemaHLSL/Language/AssignArray.hlsl b/clang/test/SemaHLSL/Language/AssignArray.hlsl index 1f813e7a350b1..16b60fe40f806 100644 --- a/clang/test/SemaHLSL/Language/AssignArray.hlsl +++ b/clang/test/SemaHLSL/Language/AssignArray.hlsl @@ -13,7 +13,7 @@ export void fn(int8 A) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' // CHECK-NEXT: OpaqueValueExpr {{.*}} 'int8':'vector[2]' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'int8':'vector[2]' lvalue Var {{.*}} 'a' 'int8':'vector[2]' -// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} '__size_t':'unsigned long' int8 b = a; // CHECK-LABEL: VarDecl {{.*}} c 'int8':'vector[2]' cinit @@ -25,7 +25,7 @@ export void fn(int8 A) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector *' // CHECK-NEXT: OpaqueValueExpr {{.*}} 'vector[2]' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'vector[2]' lvalue ParmVar {{.*}} 'A' 'vector[2]' -// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} '__size_t':'unsigned long' int8 c = A; } diff --git a/clang/test/SemaHLSL/Language/InitListAST.hlsl b/clang/test/SemaHLSL/Language/InitListAST.hlsl index 78bf269769ae6..460ec38bb44af 100644 --- a/clang/test/SemaHLSL/Language/InitListAST.hlsl +++ b/clang/test/SemaHLSL/Language/InitListAST.hlsl @@ -97,12 +97,12 @@ TwoFloats case3(int Val) { // CHECK-NEXT: ImplicitCastExpr {{.*}}'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar 
{{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 TwoFloats case4(int2 TwoVals) { TwoFloats TF4 = {TwoVals}; return TF4; @@ -115,11 +115,11 @@ TwoFloats case4(int2 TwoVals) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr {{.*}} 'int2':'vector' lvalue ParmVar {{.*}} 'TwoVals' 'int2':'vector' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 TwoInts case5(int2 TwoVals) { TwoInts TI1 = {TwoVals}; return TI1; @@ -209,22 +209,22 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 
'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'unsigned int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} @@ -240,32 +240,32 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 
{{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -273,32 +273,32 @@ Doggo case7(TwoInts TI1, TwoInts TI2, int Val, TwoFloats TF1, TwoFloats TF2, // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // 
CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 AnimalBits case8(Doggo D1) { AnimalBits A1 = {D1}; return A1; @@ -317,22 +317,22 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 
'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -347,32 +347,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 
'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: 
IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -380,32 +380,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: 
ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Doggo' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -413,25 +413,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 
'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -446,43 +446,43 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr 
{{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh[4]' // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' @@ 
-490,22 +490,22 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -520,32 +520,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// 
CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 
3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -553,32 +553,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 
+// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -586,25 +586,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: 
IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -619,65 +619,65 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: 
MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: 
InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'int4':'vector' lvalue .LegState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .TailState {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' @@ -692,32 +692,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue 
ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 -// CHECK-NEXT: 
IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent @@ -725,32 +725,32 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: 
IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float4':'vector' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float4 *' // CHECK-NEXT: MemberExpr {{.*}} 'float4[2]' lvalue .EarDirection {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'Doggo' lvalue ParmVar {{.*}} 'D1' 'Doggo' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'Kitteh' // CHECK-NEXT: InitListExpr {{.*}} 'int4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' @@ -758,25 +758,25 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: 
IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'int' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' // CHECK-NEXT: MemberExpr {{.*}} 'int[4]' lvalue .Legs {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'uint':'unsigned int' // CHECK-NEXT: MemberExpr {{.*}} 'uint':'unsigned int' lvalue .State {{.*}} @@ -791,43 +791,43 @@ AnimalBits case8(Doggo D1) { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: 
ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .LeftDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 // CHECK-NEXT: InitListExpr {{.*}} 'float4':'vector' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr {{.*}} 'float4':'vector' lvalue .RightDir {{.*}} // CHECK-NEXT: DeclRefExpr {{.*}} 'AnimalBits' lvalue ParmVar {{.*}} 'A1' 'AnimalBits' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} 
'__size_t':'unsigned long' 3 Zoo case9(Doggo D1, AnimalBits A1) { Zoo Z1 = {D1, A1, D1, A1, D1, A1}; return Z1; @@ -867,28 +867,28 @@ FourFloats case10(TwoFloats TF1, TwoFloats TF2) { // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr {{.*}} 'vector' xvalue // CHECK-NEXT: ExtVectorElementExpr {{.*}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue ParmVar {{.*}} 'F' 'float' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 3 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 3 FourFloats case11(float 
F) { FourFloats FF1 = {F.xxxx}; return FF1; @@ -1008,52 +1008,52 @@ FourFloats case16() { // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' 
lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'int' // CHECK-NEXT: MemberExpr {{.*}} 'int' lvalue .A {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' // CHECK-NEXT: MemberExpr {{.*}} 'float' lvalue .B {{.*}} // CHECK-NEXT: ArraySubscriptExpr {{.*}} 'IntAndFloat' lvalue // CHECK-NEXT: ImplicitCastExpr {{.*}} 'IntAndFloat *' // CHECK-NEXT: DeclRefExpr {{.*}} 'IntAndFloat[2]' lvalue Var {{.*}} 'Structs' 'IntAndFloat[2]' -// CHECK-NEXT: IntegerLiteral {{.*}} 'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral {{.*}} '__size_t':'unsigned long' 1 float case17() { IntAndFloat Structs[] = {1,2,3,4}; float Floats[] = {Structs, Structs}; diff --git a/clang/test/SemaObjC/matrix-type-builtins.m b/clang/test/SemaObjC/matrix-type-builtins.m index 21b8bf864271d..3916017cf0fe0 100644 --- a/clang/test/SemaObjC/matrix-type-builtins.m +++ b/clang/test/SemaObjC/matrix-type-builtins.m @@ -27,5 +27,5 @@ void 
test_element_type_mismatch(u4x4 m, MatrixValue *mv) { __builtin_matrix_column_major_store(mv.value, mv.value, mv.value); // expected-error@-1 {{2nd argument must be a pointer to a valid matrix element type}} - // expected-error@-2 {{casting 'double4x4' (aka 'double __attribute__((matrix_type(4, 4)))') to incompatible type 'unsigned long}} + // expected-error@-2 {{casting 'double4x4' (aka 'double __attribute__((matrix_type(4, 4)))') to incompatible type '__size_t' (aka 'unsigned long')}} } diff --git a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl index a44d9dd86b86a..22569fa7b443c 100644 --- a/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl +++ b/clang/test/SemaOpenCL/cl20-device-side-enqueue.cl @@ -87,7 +87,7 @@ kernel void enqueue_kernel_tests(void) { }, 1024, 4294967296L); #ifdef B32 -// expected-warning@-2{{implicit conversion from 'long' to 'unsigned int' changes value from 4294967296 to 0}} +// expected-warning@-2{{implicit conversion from 'long' to '__size_t' (aka 'unsigned int') changes value from 4294967296 to 0}} #endif char c; @@ -97,7 +97,7 @@ kernel void enqueue_kernel_tests(void) { }, c, 1024L); #ifdef WCONV -// expected-warning-re@-2{{implicit conversion changes signedness: 'char' to 'unsigned {{int|long}}'}} +// expected-warning-re@-2{{implicit conversion changes signedness: 'char' to '__size_t' (aka 'unsigned {{int|long}}')}} #endif #define UINT_MAX 4294967295 @@ -107,7 +107,7 @@ kernel void enqueue_kernel_tests(void) { }, sizeof(int), sizeof(int) * UINT_MAX); #ifdef B32 -// expected-warning@-2{{implicit conversion from 'long' to 'unsigned int' changes value from 17179869180 to 4294967292}} +// expected-warning@-2{{implicit conversion from 'long' to '__size_t' (aka 'unsigned int') changes value from 17179869180 to 4294967292}} #endif typedef void (^bl_A_t)(local void *); diff --git a/clang/test/SemaTemplate/type_pack_element.cpp b/clang/test/SemaTemplate/type_pack_element.cpp index 
264b4dcdc044d..5ff010c7db29c 100644 --- a/clang/test/SemaTemplate/type_pack_element.cpp +++ b/clang/test/SemaTemplate/type_pack_element.cpp @@ -7,9 +7,9 @@ using test1 = __type_pack_element<0, int>; // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr '0' -// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | |-value: Int 0 -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int' 0 // CHECK-NEXT: |-TemplateArgument type 'int' // CHECK-NEXT: | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' @@ -23,7 +23,7 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr 'N' -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int' // CHECK-NEXT: `-TemplateArgument type 'Ts...' // CHECK-NEXT: `-PackExpansionType 0x{{[0-9A-Fa-f]+}} 'Ts...' 
dependent @@ -37,9 +37,9 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr '0' -// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | |-value: Int 0 -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int' 0 // CHECK-NEXT: `-TemplateArgument type 'Ts...' // CHECK-NEXT: `-PackExpansionType 0x{{[0-9A-Fa-f]+}} 'Ts...' dependent @@ -53,7 +53,7 @@ template struct A { // CHECK-NEXT: |-name: '__type_pack_element' qualified // CHECK-NEXT: | `-BuiltinTemplateDecl {{.+}} __type_pack_element // CHECK-NEXT: |-TemplateArgument expr 'N' -// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'unsigned long' +// CHECK-NEXT: | `-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} '__size_t':'unsigned long' // CHECK-NEXT: | `-DeclRefExpr 0x{{[0-9A-Fa-f]+}} 'int' NonTypeTemplateParm 0x{{[0-9A-Fa-f]+}} 'N' 'int' // CHECK-NEXT: `-TemplateArgument type 'int' // CHECK-NEXT: `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 75afa87947be4..9412d9735ef82 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -1672,6 +1672,10 @@ bool CursorVisitor::VisitTypedefTypeLoc(TypedefTypeLoc TL) { return Visit(MakeCursorTypeRef(TL.getTypedefNameDecl(), TL.getNameLoc(), TU)); } +bool CursorVisitor::VisitPredefinedSugarTypeLoc(PredefinedSugarTypeLoc TL) { + return false; +} + bool CursorVisitor::VisitUnresolvedUsingTypeLoc(UnresolvedUsingTypeLoc TL) { return Visit(MakeCursorTypeRef(TL.getDecl(), TL.getNameLoc(), TU)); } diff --git a/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp 
b/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp index 4fa4982de88fa..c5f152a26a766 100644 --- a/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/deque/spare_block_handling.pass.cpp @@ -38,10 +38,10 @@ static void print(const Deque& d) { " : __back_spare() == %zu" " : __capacity() == %zu" " : bytes allocated == %zu\n", - d.size(), - d.__front_spare(), - d.__back_spare(), - d.__capacity(), + std::size_t(d.size()), + std::size_t(d.__front_spare()), + std::size_t(d.__back_spare()), + std::size_t(d.__capacity()), malloc_allocator_base::outstanding_bytes); } diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index e847ede1a4ba6..3226e0accc5ea 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -2555,6 +2555,7 @@ RemoveWrappingTypes(QualType type, ArrayRef mask = {}) { case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: + case clang::Type::PredefinedSugar: type = type->getLocallyUnqualifiedSingleStepDesugaredType(); break; default: @@ -4130,6 +4131,7 @@ TypeSystemClang::GetTypeClass(lldb::opaque_compiler_type_t type) { case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: + case clang::Type::PredefinedSugar: llvm_unreachable("Handled in RemoveWrappingTypes!"); case clang::Type::UnaryTransform: break; @@ -4840,6 +4842,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case clang::Type::Using: + case clang::Type::PredefinedSugar: llvm_unreachable("Handled in RemoveWrappingTypes!"); case clang::Type::UnaryTransform: @@ -5141,6 +5144,7 @@ lldb::Format TypeSystemClang::GetFormat(lldb::opaque_compiler_type_t type) { case clang::Type::TypeOf: case clang::Type::TypeOfExpr: case 
clang::Type::Using: + case clang::Type::PredefinedSugar: llvm_unreachable("Handled in RemoveWrappingTypes!"); case clang::Type::UnaryTransform: break; From 6b371cab949e3ff9e903d6c3118f4b2be2bf3cc5 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Sat, 19 Jul 2025 09:44:25 +0200 Subject: [PATCH 434/813] [libc++] Move a bunch of extensions tests to test/extensions (#149275) --- .../gnu_cxx => extensions/gnu/hash_map}/hash_map.pass.cpp | 0 .../gnu/hash_map}/hash_map_name_lookup.pass.cpp | 0 .../gnu_cxx => extensions/gnu/hash_set}/hash_set.pass.cpp | 0 .../gnu/hash_set}/hash_set_name_lookup.pass.cpp | 0 .../libcxx/atomics/atomics.flag/init_bool.pass.cpp | 0 .../libcxx/containers/associative/map/scary.compile.pass.cpp | 0 .../libcxx/containers/associative/set/scary.compile.pass.cpp | 0 .../containers/associative/unord.map/scary.compile.pass.cpp | 0 .../containers/associative/unord.set/scary.compile.pass.cpp | 0 .../libcxx/containers/sequences/deque/incomplete.pass.cpp | 0 .../{ => extensions}/libcxx/depr/depr.c.headers/extern_c.pass.cpp | 0 .../libcxx/{ => depr/depr.c.headers}/include_as_c.sh.cpp | 0 .../depr/depr.c.headers/stdint_h.std_types_t.compile.pass.cpp | 0 .../depr/depr.c.headers/stdint_h.xopen_source.compile.pass.cpp | 0 .../depr.c.headers => extensions/msvc}/math_h.compile.pass.cpp | 0 15 files changed, 0 insertions(+), 0 deletions(-) rename libcxx/test/{libcxx/containers/gnu_cxx => extensions/gnu/hash_map}/hash_map.pass.cpp (100%) rename libcxx/test/{libcxx/containers/gnu_cxx => extensions/gnu/hash_map}/hash_map_name_lookup.pass.cpp (100%) rename libcxx/test/{libcxx/containers/gnu_cxx => extensions/gnu/hash_set}/hash_set.pass.cpp (100%) rename libcxx/test/{libcxx/containers/gnu_cxx => extensions/gnu/hash_set}/hash_set_name_lookup.pass.cpp (100%) rename libcxx/test/{ => extensions}/libcxx/atomics/atomics.flag/init_bool.pass.cpp (100%) rename libcxx/test/{ => extensions}/libcxx/containers/associative/map/scary.compile.pass.cpp (100%) rename libcxx/test/{ => 
extensions}/libcxx/containers/associative/set/scary.compile.pass.cpp (100%) rename libcxx/test/{ => extensions}/libcxx/containers/associative/unord.map/scary.compile.pass.cpp (100%) rename libcxx/test/{ => extensions}/libcxx/containers/associative/unord.set/scary.compile.pass.cpp (100%) rename libcxx/test/{ => extensions}/libcxx/containers/sequences/deque/incomplete.pass.cpp (100%) rename libcxx/test/{ => extensions}/libcxx/depr/depr.c.headers/extern_c.pass.cpp (100%) rename libcxx/test/extensions/libcxx/{ => depr/depr.c.headers}/include_as_c.sh.cpp (100%) rename libcxx/test/{ => extensions}/libcxx/depr/depr.c.headers/stdint_h.std_types_t.compile.pass.cpp (100%) rename libcxx/test/{ => extensions}/libcxx/depr/depr.c.headers/stdint_h.xopen_source.compile.pass.cpp (100%) rename libcxx/test/{libcxx/depr/depr.c.headers => extensions/msvc}/math_h.compile.pass.cpp (100%) diff --git a/libcxx/test/libcxx/containers/gnu_cxx/hash_map.pass.cpp b/libcxx/test/extensions/gnu/hash_map/hash_map.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/gnu_cxx/hash_map.pass.cpp rename to libcxx/test/extensions/gnu/hash_map/hash_map.pass.cpp diff --git a/libcxx/test/libcxx/containers/gnu_cxx/hash_map_name_lookup.pass.cpp b/libcxx/test/extensions/gnu/hash_map/hash_map_name_lookup.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/gnu_cxx/hash_map_name_lookup.pass.cpp rename to libcxx/test/extensions/gnu/hash_map/hash_map_name_lookup.pass.cpp diff --git a/libcxx/test/libcxx/containers/gnu_cxx/hash_set.pass.cpp b/libcxx/test/extensions/gnu/hash_set/hash_set.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/gnu_cxx/hash_set.pass.cpp rename to libcxx/test/extensions/gnu/hash_set/hash_set.pass.cpp diff --git a/libcxx/test/libcxx/containers/gnu_cxx/hash_set_name_lookup.pass.cpp b/libcxx/test/extensions/gnu/hash_set/hash_set_name_lookup.pass.cpp similarity index 100% rename from 
libcxx/test/libcxx/containers/gnu_cxx/hash_set_name_lookup.pass.cpp rename to libcxx/test/extensions/gnu/hash_set/hash_set_name_lookup.pass.cpp diff --git a/libcxx/test/libcxx/atomics/atomics.flag/init_bool.pass.cpp b/libcxx/test/extensions/libcxx/atomics/atomics.flag/init_bool.pass.cpp similarity index 100% rename from libcxx/test/libcxx/atomics/atomics.flag/init_bool.pass.cpp rename to libcxx/test/extensions/libcxx/atomics/atomics.flag/init_bool.pass.cpp diff --git a/libcxx/test/libcxx/containers/associative/map/scary.compile.pass.cpp b/libcxx/test/extensions/libcxx/containers/associative/map/scary.compile.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/associative/map/scary.compile.pass.cpp rename to libcxx/test/extensions/libcxx/containers/associative/map/scary.compile.pass.cpp diff --git a/libcxx/test/libcxx/containers/associative/set/scary.compile.pass.cpp b/libcxx/test/extensions/libcxx/containers/associative/set/scary.compile.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/associative/set/scary.compile.pass.cpp rename to libcxx/test/extensions/libcxx/containers/associative/set/scary.compile.pass.cpp diff --git a/libcxx/test/libcxx/containers/associative/unord.map/scary.compile.pass.cpp b/libcxx/test/extensions/libcxx/containers/associative/unord.map/scary.compile.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/associative/unord.map/scary.compile.pass.cpp rename to libcxx/test/extensions/libcxx/containers/associative/unord.map/scary.compile.pass.cpp diff --git a/libcxx/test/libcxx/containers/associative/unord.set/scary.compile.pass.cpp b/libcxx/test/extensions/libcxx/containers/associative/unord.set/scary.compile.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/associative/unord.set/scary.compile.pass.cpp rename to libcxx/test/extensions/libcxx/containers/associative/unord.set/scary.compile.pass.cpp diff --git 
a/libcxx/test/libcxx/containers/sequences/deque/incomplete.pass.cpp b/libcxx/test/extensions/libcxx/containers/sequences/deque/incomplete.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/sequences/deque/incomplete.pass.cpp rename to libcxx/test/extensions/libcxx/containers/sequences/deque/incomplete.pass.cpp diff --git a/libcxx/test/libcxx/depr/depr.c.headers/extern_c.pass.cpp b/libcxx/test/extensions/libcxx/depr/depr.c.headers/extern_c.pass.cpp similarity index 100% rename from libcxx/test/libcxx/depr/depr.c.headers/extern_c.pass.cpp rename to libcxx/test/extensions/libcxx/depr/depr.c.headers/extern_c.pass.cpp diff --git a/libcxx/test/extensions/libcxx/include_as_c.sh.cpp b/libcxx/test/extensions/libcxx/depr/depr.c.headers/include_as_c.sh.cpp similarity index 100% rename from libcxx/test/extensions/libcxx/include_as_c.sh.cpp rename to libcxx/test/extensions/libcxx/depr/depr.c.headers/include_as_c.sh.cpp diff --git a/libcxx/test/libcxx/depr/depr.c.headers/stdint_h.std_types_t.compile.pass.cpp b/libcxx/test/extensions/libcxx/depr/depr.c.headers/stdint_h.std_types_t.compile.pass.cpp similarity index 100% rename from libcxx/test/libcxx/depr/depr.c.headers/stdint_h.std_types_t.compile.pass.cpp rename to libcxx/test/extensions/libcxx/depr/depr.c.headers/stdint_h.std_types_t.compile.pass.cpp diff --git a/libcxx/test/libcxx/depr/depr.c.headers/stdint_h.xopen_source.compile.pass.cpp b/libcxx/test/extensions/libcxx/depr/depr.c.headers/stdint_h.xopen_source.compile.pass.cpp similarity index 100% rename from libcxx/test/libcxx/depr/depr.c.headers/stdint_h.xopen_source.compile.pass.cpp rename to libcxx/test/extensions/libcxx/depr/depr.c.headers/stdint_h.xopen_source.compile.pass.cpp diff --git a/libcxx/test/libcxx/depr/depr.c.headers/math_h.compile.pass.cpp b/libcxx/test/extensions/msvc/math_h.compile.pass.cpp similarity index 100% rename from libcxx/test/libcxx/depr/depr.c.headers/math_h.compile.pass.cpp rename to 
libcxx/test/extensions/msvc/math_h.compile.pass.cpp From 6855b9c598b3258e8c0e3edffe5458630a0b0105 Mon Sep 17 00:00:00 2001 From: Naveen Seth Hanig Date: Sat, 19 Jul 2025 09:47:37 +0200 Subject: [PATCH 435/813] [clang][deps] Properly capture the global module and '\n' for all module directives (#148685) Previously, the newline after a module directive was not properly captured and printed by `clang::printDependencyDirectivesAsSource`. According to P1857R3, each directive must, after skipping horizontal whitespace, appear at the start of a logical line. Because the newline after module directives was missing, this invalidated the following line. This fixes tests that were previously in violation of P1857R3, including for Objective-C directives, which should also comply with P1857R3. This also ensures that the global module fragment `module;` is captured by the dependency directives scanner. --- clang/lib/Lex/DependencyDirectivesScanner.cpp | 35 +++++++++++-------- clang/lib/Lex/Preprocessor.cpp | 2 ++ clang/lib/Parse/Parser.cpp | 1 + .../Lex/DependencyDirectivesScannerTest.cpp | 29 +++++++++------ 4 files changed, 41 insertions(+), 26 deletions(-) diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp index 869c9cea566b6..9ccff5e3342d5 100644 --- a/clang/lib/Lex/DependencyDirectivesScanner.cpp +++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp @@ -560,15 +560,13 @@ bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First, if (Tok.is(tok::semi)) break; } + + const auto &Tok = lexToken(First, End); pushDirective(Kind); - skipWhitespace(First, End); - if (First == End) + if (Tok.is(tok::eof) || Tok.is(tok::eod)) return false; - if (!isVerticalWhitespace(*First)) - return reportError( - DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import); - skipNewline(First, End); - return false; + return reportError(DirectiveLoc, + diag::err_dep_source_scanner_unexpected_tokens_at_import); } 
dependency_directives_scan::Token &Scanner::lexToken(const char *&First, @@ -735,6 +733,13 @@ bool Scanner::lexModule(const char *&First, const char *const End) { return false; break; } + case ';': { + // Handle the global module fragment `module;`. + if (Id == "module" && !Export) + break; + skipLine(First, End); + return false; + } case '<': case '"': break; @@ -905,14 +910,6 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) { CurDirToks.clear(); }); - // Handle "@import". - if (*First == '@') - return lexAt(First, End); - - // Handle module directives for C++20 modules. - if (*First == 'i' || *First == 'e' || *First == 'm') - return lexModule(First, End); - if (*First == '_') { if (isNextIdentifierOrSkipLine("_Pragma", First, End)) return lex_Pragma(First, End); @@ -925,6 +922,14 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) { auto ScEx2 = make_scope_exit( [&]() { TheLexer.setParsingPreprocessorDirective(false); }); + // Handle "@import". + if (*First == '@') + return lexAt(First, End); + + // Handle module directives for C++20 modules. + if (*First == 'i' || *First == 'e' || *First == 'm') + return lexModule(First, End); + // Lex '#'. const dependency_directives_scan::Token &HashTok = lexToken(First, End); if (HashTok.is(tok::hashhash)) { diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index bcd3ea60ce3da..e278846f6f36d 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -950,6 +950,8 @@ void Preprocessor::Lex(Token &Result) { case tok::period: ModuleDeclState.handlePeriod(); break; + case tok::eod: + break; case tok::identifier: // Check "import" and "module" when there is no open bracket. The two // identifiers are not meaningful with open brackets. 
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 8834bf80c4016..ff50b3f83908c 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -2519,6 +2519,7 @@ Decl *Parser::ParseModuleImport(SourceLocation AtLoc, break; } ExpectAndConsumeSemi(diag::err_module_expected_semi); + TryConsumeToken(tok::eod); if (SeenError) return nullptr; diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp index 46dbb4d4b91b4..ddc87921ea084 100644 --- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp +++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp @@ -640,14 +640,14 @@ TEST(MinimizeSourceToDependencyDirectivesTest, AtImport) { EXPECT_STREQ("@import A;\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives("@import A\n;", Out)); - EXPECT_STREQ("@import A;\n", Out.data()); + EXPECT_STREQ("@import A\n;\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives("@import A.B;\n", Out)); EXPECT_STREQ("@import A.B;\n", Out.data()); ASSERT_FALSE(minimizeSourceToDependencyDirectives( - "@import /*x*/ A /*x*/ . /*x*/ B /*x*/ \n /*x*/ ; /*x*/", Out)); - EXPECT_STREQ("@import A.B;\n", Out.data()); + "@import /*x*/ A /*x*/ . 
/*x*/ B /*x*/ \\n /*x*/ ; /*x*/", Out)); + EXPECT_STREQ("@import A.B\\n;\n", Out.data()); } TEST(MinimizeSourceToDependencyDirectivesTest, EmptyIncludesAndImports) { @@ -1122,16 +1122,23 @@ ort \ )"; ASSERT_FALSE( minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives)); - EXPECT_STREQ("#include \"textual-header.h\"\nexport module m;" - "exp\\\nort import:l[[rename]];" - "import<<=3;import a b d e d e f e;" - "import foo[[no_unique_address]];import foo();" - "import f(:sefse);import f(->a=3);" + + EXPECT_STREQ("module;\n" + "#include \"textual-header.h\"\n" + "export module m;\n" + "exp\\\nort import:l[[rename]];\n" + "import<<=3;\n" + "import a b d e d e f e;\n" + "import foo[[no_unique_address]];\n" + "import foo();\n" + "import f(:sefse);\n" + "import f(->a=3);\n" "\n", Out.data()); - ASSERT_EQ(Directives.size(), 11u); - EXPECT_EQ(Directives[0].Kind, pp_include); - EXPECT_EQ(Directives[1].Kind, cxx_export_module_decl); + ASSERT_EQ(Directives.size(), 12u); + EXPECT_EQ(Directives[0].Kind, cxx_module_decl); + EXPECT_EQ(Directives[1].Kind, pp_include); + EXPECT_EQ(Directives[2].Kind, cxx_export_module_decl); } TEST(MinimizeSourceToDependencyDirectivesTest, ObjCMethodArgs) { From 224e221f1bcb64cf20d81a4ad2855043e445d9bf Mon Sep 17 00:00:00 2001 From: Charles Zablit Date: Sat, 19 Jul 2025 10:46:28 +0200 Subject: [PATCH 436/813] [NFC][lldb] fix extra line after documentation (#149482) Formatting patch which removes an extra line after a docstring. --- lldb/include/lldb/Utility/Stream.h | 1 - 1 file changed, 1 deletion(-) diff --git a/lldb/include/lldb/Utility/Stream.h b/lldb/include/lldb/Utility/Stream.h index 37bcdc9924171..fc547ed739239 100644 --- a/lldb/include/lldb/Utility/Stream.h +++ b/lldb/include/lldb/Utility/Stream.h @@ -270,7 +270,6 @@ class Stream { /// \param[in] suffix /// The ANSI color code to end colorization. This is /// environment-dependent. 
- void PutCStringColorHighlighted( llvm::StringRef text, std::optional settings = std::nullopt); From baf2953097a8d606c8e8441c61c09f607f583cc8 Mon Sep 17 00:00:00 2001 From: kissholic Date: Sat, 19 Jul 2025 16:52:06 +0800 Subject: [PATCH 437/813] Optimize fptrunc(x)>=C1 --> x>=C2 (#99475) Fix https://github.com/llvm/llvm-project/issues/85265#issue-2186848949 --- .../InstCombine/InstCombineCompares.cpp | 99 +++ .../Transforms/InstCombine/fold-fcmp-trunc.ll | 674 ++++++++++++++++++ 2 files changed, 773 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/fold-fcmp-trunc.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 9df08553d86e4..c90ff2a868d4c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "InstCombineInternal.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" @@ -21,8 +22,10 @@ #include "llvm/Analysis/Utils/Local.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/KnownBits.h" @@ -8222,6 +8225,98 @@ static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI, return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I); } +// Transform 'fptrunc(x) cmp C' to 'x cmp ext(C)' if possible. 
+// Patterns include: +// fptrunc(x) < C --> x < ext(C) +// fptrunc(x) <= C --> x <= ext(C) +// fptrunc(x) > C --> x > ext(C) +// fptrunc(x) >= C --> x >= ext(C) +// where 'ext(C)' is the extension of 'C' to the type of 'x' with a small bias +// due to precision loss. +static Instruction *foldFCmpFpTrunc(FCmpInst &I, const Instruction &FPTrunc, + const Constant &C) { + FCmpInst::Predicate Pred = I.getPredicate(); + bool RoundDown = false; + + if (Pred == FCmpInst::FCMP_OGE || Pred == FCmpInst::FCMP_UGE || + Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_ULT) + RoundDown = true; + else if (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT || + Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE) + RoundDown = false; + else + return nullptr; + + const APFloat *CValue; + if (!match(&C, m_APFloat(CValue))) + return nullptr; + + if (CValue->isNaN() || CValue->isInfinity()) + return nullptr; + + auto ConvertFltSema = [](const APFloat &Src, const fltSemantics &Sema) { + bool LosesInfo; + APFloat Dest = Src; + Dest.convert(Sema, APFloat::rmNearestTiesToEven, &LosesInfo); + return Dest; + }; + + auto NextValue = [](const APFloat &Value, bool RoundDown) { + APFloat NextValue = Value; + NextValue.next(RoundDown); + return NextValue; + }; + + APFloat NextCValue = NextValue(*CValue, RoundDown); + + Type *DestType = FPTrunc.getOperand(0)->getType(); + const fltSemantics &DestFltSema = + DestType->getScalarType()->getFltSemantics(); + + APFloat ExtCValue = ConvertFltSema(*CValue, DestFltSema); + APFloat ExtNextCValue = ConvertFltSema(NextCValue, DestFltSema); + + // When 'NextCValue' is infinity, use an imaged 'NextCValue' that equals + // 'CValue + bias' to avoid the infinity after conversion. The bias is + // estimated as 'CValue - PrevCValue', where 'PrevCValue' is the previous + // value of 'CValue'. 
+ if (NextCValue.isInfinity()) { + APFloat PrevCValue = NextValue(*CValue, !RoundDown); + APFloat Bias = ConvertFltSema(*CValue - PrevCValue, DestFltSema); + + ExtNextCValue = ExtCValue + Bias; + } + + APFloat ExtMidValue = + scalbn(ExtCValue + ExtNextCValue, -1, APFloat::rmNearestTiesToEven); + + const fltSemantics &SrcFltSema = + C.getType()->getScalarType()->getFltSemantics(); + + // 'MidValue' might be rounded to 'NextCValue'. Correct it here. + APFloat MidValue = ConvertFltSema(ExtMidValue, SrcFltSema); + if (MidValue != *CValue) + ExtMidValue.next(!RoundDown); + + // Check whether 'ExtMidValue' is a valid result since the assumption on + // imaged 'NextCValue' might not hold for new float types. + // ppc_fp128 can't pass here when converting from max float because of + // APFloat implementation. + if (NextCValue.isInfinity()) { + // ExtMidValue --- narrowed ---> Finite + if (ConvertFltSema(ExtMidValue, SrcFltSema).isInfinity()) + return nullptr; + + // NextExtMidValue --- narrowed ---> Infinity + APFloat NextExtMidValue = NextValue(ExtMidValue, RoundDown); + if (ConvertFltSema(NextExtMidValue, SrcFltSema).isFinite()) + return nullptr; + } + + return new FCmpInst(Pred, FPTrunc.getOperand(0), + ConstantFP::get(DestType, ExtMidValue), "", &I); +} + /// Optimize fabs(X) compared with zero. 
static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) { Value *X; @@ -8712,6 +8807,10 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { cast(LHSI), GEP, GV, I)) return Res; break; + case Instruction::FPTrunc: + if (Instruction *NV = foldFCmpFpTrunc(I, *LHSI, *RHSC)) + return NV; + break; } } diff --git a/llvm/test/Transforms/InstCombine/fold-fcmp-trunc.ll b/llvm/test/Transforms/InstCombine/fold-fcmp-trunc.ll new file mode 100644 index 0000000000000..371f9b6807fe4 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/fold-fcmp-trunc.ll @@ -0,0 +1,674 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=instcombine -S < %s | FileCheck %s + + +define i1 @fcmp_trunc(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0x4058FFFFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp oge float %trunc, 1.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_ult(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_ult( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ult double [[TMP0]], 0x4068FFFFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ult float %trunc, 2.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_ole(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_ole( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ole double [[TMP0]], 0x4072C00010000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ole float %trunc, 3.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_ogt(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_ogt( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0x4079000010000000 +; CHECK-NEXT: ret i1 [[RESULT]] 
+; + %trunc = fptrunc double %0 to float + %result = fcmp ogt float %trunc, 4.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_zero(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_zero( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0xB690000000000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp oge float %trunc, 0.000000 + ret i1 %result +} + +define i1 @fcmp_trunc_with_nnan(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_with_nnan( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp nnan oge double [[TMP0]], 0x4058FFFFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp nnan oge float %trunc, 1.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_with_ninf(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_with_ninf( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ninf oge double [[TMP0]], 0x4058FFFFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ninf oge float %trunc, 1.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_with_nsz(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_with_nsz( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp nsz oge double [[TMP0]], 0x4058FFFFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp nsz oge float %trunc, 1.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_with_reassoc(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_with_reassoc( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp reassoc oge double [[TMP0]], 0x4058FFFFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp reassoc oge float %trunc, 1.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_with_fast(double %0) { +; CHECK-LABEL: define i1 
@fcmp_trunc_with_fast( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp fast oge double [[TMP0]], 0x4058FFFFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp fast oge float %trunc, 1.000000e+02 + ret i1 %result +} + +define <4 x i1> @fcmp_vec_trunc(<4 x double> %0) { +; CHECK-LABEL: define <4 x i1> @fcmp_vec_trunc( +; CHECK-SAME: <4 x double> [[TMP0:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt <4 x double> [[TMP0]], splat (double 0x3FEFFFFFF0000000) +; CHECK-NEXT: ret <4 x i1> [[CMP]] +; + %vec = fptrunc <4 x double> %0 to <4 x float> + %cmp = fcmp olt <4 x float> %vec, + ret <4 x i1> %cmp +} + +define <1 x i1> @fcmp_vec_trunc_scalar(<1 x double> %0) { +; CHECK-LABEL: define <1 x i1> @fcmp_vec_trunc_scalar( +; CHECK-SAME: <1 x double> [[TMP0:%.*]]) { +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast olt <1 x double> [[TMP0]], splat (double 0x3FEFFFFFF0000000) +; CHECK-NEXT: ret <1 x i1> [[CMP]] +; + %vec = fptrunc <1 x double> %0 to <1 x float> + %cmp = fcmp fast olt <1 x float> %vec, + ret <1 x i1> %cmp +} + +define i1 @fcmp_trunc_fp128(fp128 %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_fp128( +; CHECK-SAME: fp128 [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp fast oge fp128 [[TMP0]], 0xL000000000000000040058FFFFF000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc fp128 %0 to float + %result = fcmp fast oge float %trunc, 1.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_x86_fp80(x86_fp80 %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_x86_fp80( +; CHECK-SAME: x86_fp80 [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp fast oge x86_fp80 [[TMP0]], 0xK4005C7FFFF8000000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc x86_fp80 %0 to float + %result = fcmp fast oge float %trunc, 1.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_ppc_fp128(ppc_fp128 %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_ppc_fp128( +; CHECK-SAME: ppc_fp128 [[TMP0:%.*]]) { +; CHECK-NEXT: 
[[RESULT:%.*]] = fcmp fast oge ppc_fp128 [[TMP0]], 0xM4058FFFFF00000000000000000000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc ppc_fp128 %0 to float + %result = fcmp fast oge float %trunc, 1.000000e+02 + ret i1 %result +} + +define i1 @fcmp_trunc_nan(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_nan( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: ret i1 false +; + %trunc = fptrunc double %0 to float + %result = fcmp oge float %trunc, 0x7FF8000000000000 + ret i1 %result +} + +; denomalized 0x00000001 +define i1 @fcmp_trunc_d1(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_d1( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0x3690000000000001 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp oge float %trunc, 1.40129846432481707092372958328991613128026194187651577175706828388979108268586060148663818836212158203125e-45 + ret i1 %result +} + +; denomalized 0x00000001 ole +define i1 @fcmp_trunc_d1_ole(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_d1_ole( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ole double [[TMP0]], 0x36A7FFFFFFFFFFFF +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ole float %trunc, 1.40129846432481707092372958328991613128026194187651577175706828388979108268586060148663818836212158203125e-45 + ret i1 %result +} + +; denomalized 0x00000002 +define i1 @fcmp_trunc_d2(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_d2( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0x36A8000000000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp oge float %trunc, 2.8025969286496341418474591665798322625605238837530315435141365677795821653717212029732763767242431640625e-45 + ret i1 %result +} + +; denomalized 0x7fffff +define i1 @fcmp_trunc_d3(double %0) { +; CHECK-LABEL: define i1 
@fcmp_trunc_d3( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0x380FFFFFDFFFFFFF +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ogt float %trunc, 1.175494210692441075487029444849287348827052428745893333857174530571588870475618904265502351336181163787841796875e-38 + ret i1 %result +} + +; denomalized 0x80000001 +define i1 @fcmp_trunc_d4(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_d4( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0xB690000000000001 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ogt float %trunc, -1.40129846432481707092372958328991613128026194187651577175706828388979108268586060148663818836212158203125e-45 + ret i1 %result +} + +; denomalized 0x80000001 +define i1 @fcmp_trunc_d5(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_d5( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp olt double [[TMP0]], 0xB80FFFFFDFFFFFFF +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp olt float %trunc, -1.175494210692441075487029444849287348827052428745893333857174530571588870475618904265502351336181163787841796875e-38 + ret i1 %result +} + + +; +0 +define i1 @fcmp_trunc_p0(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_p0( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0xB690000000000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp oge float %trunc, 0x00000000 + ret i1 %result +} + + +; -0 +define i1 @fcmp_trunc_n0(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_n0( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0x3690000000000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ogt float %trunc, 0x8000000000000000 + 
ret i1 %result +} + + +; max representable +define i1 @fcmp_trunc_mx(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_mx( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0x47EFFFFFEFFFFFFF +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ogt float %trunc, 0x47EFFFFFE0000000 + ret i1 %result +} + +; negative max representable +define i1 @fcmp_trunc_mn(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_mn( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp olt double [[TMP0]], 0xC7EFFFFFEFFFFFFF +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp olt float %trunc, -3.4028234663852885981170418348451692544e38 + ret i1 %result +} + + +define i1 @fcmp_trunc_literal_nan(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_literal_nan( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: ret i1 false +; + %trunc = fptrunc double %0 to float + %result = fcmp oge float %trunc, 0x7FF8000000000000 + ret i1 %result +} + +define i1 @fcmp_trunc_literal_positive_inf(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_literal_positive_inf( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[TRUNC:%.*]] = fptrunc double [[TMP0]] to float +; CHECK-NEXT: [[RESULT:%.*]] = fcmp oeq float [[TRUNC]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp oge float %trunc, 0x7FF0000000000000 + ret i1 %result +} + + +define i1 @fcmp_trunc_literal_negative_inf(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_literal_negative_inf( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[TRUNC:%.*]] = fptrunc double [[TMP0]] to float +; CHECK-NEXT: [[RESULT:%.*]] = fcmp uno float [[TRUNC]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ult float %trunc, 0xFFF0000000000000 + ret i1 %result +} + + +define i1 
@fcmp_trunc_nan_ugt(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_nan_ugt( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %trunc = fptrunc double %0 to float + %result = fcmp ugt float %trunc, 0x7FF8000000000000 + ret i1 %result +} + +define i1 @fcmp_trunc_inf_uge(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_inf_uge( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[TRUNC:%.*]] = fptrunc double [[TMP0]] to float +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ueq float [[TRUNC]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp uge float %trunc, 0x7FF0000000000000 + ret i1 %result +} + + +define i1 @fcmp_trunc_ninf_olt(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_ninf_olt( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: ret i1 false +; + %trunc = fptrunc double %0 to float + %result = fcmp olt float %trunc, 0xFFF0000000000000 + ret i1 %result +} + + +define i1 @fcmp_trunc_uge(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_uge( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp uge double [[TMP0]], 0x405EBFFFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp uge float %trunc, 123.0 + ret i1 %result +} + + +define i1 @fcmp_trunc_neg_uge(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_uge( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp uge double [[TMP0]], 0xC05EC00010000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp uge float %trunc, -123.0 + ret i1 %result +} + + +define i1 @fcmp_trunc_oge(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_oge( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0x405EBFFFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp oge float %trunc, 123.0 + ret i1 %result +} + + +define i1 
@fcmp_trunc_neg_oge(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_oge( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp oge double [[TMP0]], 0xC05EC00010000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp oge float %trunc, -123.0 + ret i1 %result +} + + +define i1 @fcmp_trunc_ugt(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_ugt( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ugt double [[TMP0]], 0x40FE0F3010000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ugt float %trunc, 123123.0 + ret i1 %result +} + + +define i1 @fcmp_trunc_neg_ugt(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_ugt( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ugt double [[TMP0]], 0xC0FE1B8FF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ugt float %trunc, -123321.0 + ret i1 %result +} + + +define i1 @fcmp_trunc_neg_ogt(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_ogt( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ogt double [[TMP0]], 0xC0FE1B8FF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ogt float %trunc, -123321.0 + ret i1 %result +} + + + +define i1 @fcmp_trunc_ule(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_ule( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ule double [[TMP0]], 0x408ED80010000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ule float %trunc, 987.0 + ret i1 %result +} + + + +define i1 @fcmp_trunc_neg_ule(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_ule( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ule double [[TMP0]], 0xC088A7FFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + 
%result = fcmp ule float %trunc, -789.0 + ret i1 %result +} + + + +define i1 @fcmp_trunc_neg_ole(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_ole( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ole double [[TMP0]], 0xC088A7FFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ole float %trunc, -789.0 + ret i1 %result +} + + +define i1 @fcmp_trunc_neg_ult(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_ult( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ult double [[TMP0]], 0xC088A80010000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp ult float %trunc, -789.0 + ret i1 %result +} + + + +define i1 @fcmp_trunc_olt(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_olt( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp olt double [[TMP0]], 0x408ED7FFF0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp olt float %trunc, 987.0 + ret i1 %result +} + + + +define i1 @fcmp_trunc_neg_olt(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_olt( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp olt double [[TMP0]], 0xC088A80010000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp olt float %trunc, -789.0 + ret i1 %result +} + + +define i1 @fcmp_trunc_neg_nsz_uge(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_nsz_uge( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp nsz uge double [[TMP0]], 0xC05EC00010000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp nsz uge float %trunc, -123.0 + ret i1 %result +} + + + +define i1 @fcmp_trunc_reassoc_ugt(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_reassoc_ugt( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp reassoc ugt 
double [[TMP0]], 0x40889F8210000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp reassoc ugt float %trunc, 787.9384765625 + ret i1 %result +} + + +define i1 @fcmp_trunc_neg_reassoc_ugt(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_reassoc_ugt( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp reassoc ugt double [[TMP0]], 0xC0889F81F0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp reassoc ugt float %trunc, -787.9384765625 + ret i1 %result +} + + + +define i1 @fcmp_trunc_fast_ult(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_fast_ult( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp fast uge double [[TMP0]], 0x40F8E8E010000001 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp fast uge float %trunc, 102030.0078125 + ret i1 %result +} + + +define i1 @fcmp_trunc_neg_fast_ult(double %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_neg_fast_ult( +; CHECK-SAME: double [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp fast uge double [[TMP0]], 0xC0F8E8E02FFFFFFF +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc double %0 to float + %result = fcmp fast uge float %trunc, -102030.0078125 + ret i1 %result +} + + +; max representable float to fp128 +define i1 @fcmp_trunc_mx_fp128(fp128 %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_mx_fp128( +; CHECK-SAME: fp128 [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ole fp128 [[TMP0]], 0xLFFFFFFFFFFFFFFFF407EFFFFFEFFFFFF +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc fp128 %0 to float + %result = fcmp ole float %trunc, 0x47EFFFFFE0000000 + ret i1 %result +} + + +; max representable float to x86_fp80 +define i1 @fcmp_trunc_mx_x86_fp80(x86_fp80 %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_mx_x86_fp80( +; CHECK-SAME: x86_fp80 [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ule x86_fp80 [[TMP0]], 0xK407EFFFFFF7FFFFFFFFF +; 
CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc x86_fp80 %0 to float + %result = fcmp ule float %trunc, 0x47EFFFFFE0000000 + ret i1 %result +} + + +; max representable float to ppc_fp128 +define i1 @fcmp_trunc_mx_ppc_fp128(ppc_fp128 %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_mx_ppc_fp128( +; CHECK-SAME: ppc_fp128 [[TMP0:%.*]]) { +; CHECK-NEXT: [[TRUNC:%.*]] = fptrunc ppc_fp128 [[TMP0]] to float +; CHECK-NEXT: [[RESULT:%.*]] = fcmp ole float [[TRUNC]], 0x47EFFFFFE0000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc ppc_fp128 %0 to float + %result = fcmp ole float %trunc, 0x47EFFFFFE0000000 + ret i1 %result +} + + +; negative max representable float to fp128 +define i1 @fcmp_trunc_mn_fp128(fp128 %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_mn_fp128( +; CHECK-SAME: fp128 [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp olt fp128 [[TMP0]], 0xL0000000000000000C07EFFFFF1000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc fp128 %0 to float + %result = fcmp olt float %trunc, 0xC7EFFFFF00000000 + ret i1 %result +} + + +; negative max representable float to x86_fp80 +define i1 @fcmp_trunc_mn_x86_fp80(x86_fp80 %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_mn_x86_fp80( +; CHECK-SAME: x86_fp80 [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp oge x86_fp80 [[TMP0]], 0xKC07EFFFFF88000000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc x86_fp80 %0 to float + %result = fcmp oge float %trunc, 0xC7EFFFFF00000000 + ret i1 %result +} + + +; negative max representable float to ppc_fp128 +define i1 @fcmp_trunc_mn_ppc_fp128(ppc_fp128 %0) { +; CHECK-LABEL: define i1 @fcmp_trunc_mn_ppc_fp128( +; CHECK-SAME: ppc_fp128 [[TMP0:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = fcmp uge ppc_fp128 [[TMP0]], 0xMC7EFFFFF100000000000000000000000 +; CHECK-NEXT: ret i1 [[RESULT]] +; + %trunc = fptrunc ppc_fp128 %0 to float + %result = fcmp uge float %trunc, 0xC7EFFFFF00000000 + ret i1 %result +} + From b5348e76225a1580530cd4cd362cfa60728693e6 Mon Sep 17 00:00:00 2001 
From: Nikolas Klauser Date: Sat, 19 Jul 2025 11:12:20 +0200 Subject: [PATCH 438/813] [libc++] Diagnose passing null pointers to a bunch of APIs (#148585) --- libcxx/.clang-format | 1 + libcxx/include/__config | 14 +++++ libcxx/include/__memory/construct_at.h | 6 +-- libcxx/include/print | 14 +++-- libcxx/include/string | 45 ++++++++++------ libcxx/include/string_view | 52 ++++++++++-------- .../specialized.algorithms/nonnull.verify.cpp | 28 ++++++++++ .../print.fun/nonnull.verify.cpp | 23 ++++++++ .../strings/basic.string/nonnull.verify.cpp | 25 ++++++++- .../string.view/assert.ctor.pointer.pass.cpp | 3 ++ .../strings/string.view/nonnull.verify.cpp | 53 +++++++++++++++++++ 11 files changed, 219 insertions(+), 45 deletions(-) create mode 100644 libcxx/test/libcxx/algorithms/specialized.algorithms/nonnull.verify.cpp create mode 100644 libcxx/test/libcxx/input.output/iostream.format/print.fun/nonnull.verify.cpp create mode 100644 libcxx/test/libcxx/strings/string.view/nonnull.verify.cpp diff --git a/libcxx/.clang-format b/libcxx/.clang-format index f372ac9619997..9557b955cd72c 100644 --- a/libcxx/.clang-format +++ b/libcxx/.clang-format @@ -33,6 +33,7 @@ AttributeMacros: [ '_LIBCPP_DEPRECATED_IN_CXX20', '_LIBCPP_DEPRECATED_IN_CXX23', '_LIBCPP_DEPRECATED', + '_LIBCPP_DIAGNOSE_NULLPTR_IF', '_LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION', '_LIBCPP_EXPORTED_FROM_ABI', '_LIBCPP_EXTERN_TEMPLATE_TYPE_VIS', diff --git a/libcxx/include/__config b/libcxx/include/__config index e4422298bf971..19398dd276a17 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1081,6 +1081,20 @@ typedef __char32_t char32_t; # define _LIBCPP_DIAGNOSE_WARNING(...) # endif +# if __has_attribute(__diagnose_if__) && !defined(_LIBCPP_APPLE_CLANG_VER) && \ + (!defined(_LIBCPP_CLANG_VER) || _LIBCPP_CLANG_VER >= 2001) +# define _LIBCPP_DIAGNOSE_IF(...) __attribute__((__diagnose_if__(__VA_ARGS__))) +# else +# define _LIBCPP_DIAGNOSE_IF(...) 
+# endif + +# define _LIBCPP_DIAGNOSE_NULLPTR_IF(condition, condition_description) \ + _LIBCPP_DIAGNOSE_IF( \ + condition, \ + "null passed to callee that requires a non-null argument" condition_description, \ + "warning", \ + "nonnull") + # if __has_cpp_attribute(_Clang::__lifetimebound__) # define _LIBCPP_LIFETIMEBOUND [[_Clang::__lifetimebound__]] # else diff --git a/libcxx/include/__memory/construct_at.h b/libcxx/include/__memory/construct_at.h index b64e64b5a29b0..658269158d945 100644 --- a/libcxx/include/__memory/construct_at.h +++ b/libcxx/include/__memory/construct_at.h @@ -33,7 +33,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 template ()) _Tp(std::declval<_Args>()...))> -_LIBCPP_HIDE_FROM_ABI constexpr _Tp* construct_at(_Tp* __location, _Args&&... __args) { +_LIBCPP_HIDE_FROM_ABI constexpr _Tp* construct_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __location, _Args&&... __args) { _LIBCPP_ASSERT_NON_NULL(__location != nullptr, "null pointer given to construct_at"); return ::new (static_cast(__location)) _Tp(std::forward<_Args>(__args)...); } @@ -73,13 +73,13 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __destroy_at(_Tp* __loc) { #if _LIBCPP_STD_VER >= 17 template , int> = 0> -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void destroy_at(_Tp* __loc) { +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) { std::__destroy_at(__loc); } # if _LIBCPP_STD_VER >= 20 template , int> = 0> -_LIBCPP_HIDE_FROM_ABI constexpr void destroy_at(_Tp* __loc) { +_LIBCPP_HIDE_FROM_ABI constexpr void destroy_at(_Tp* _LIBCPP_DIAGNOSE_NULLPTR __loc) { std::__destroy_at(__loc); } # endif diff --git a/libcxx/include/print b/libcxx/include/print index be05d30e0147f..0ff314c22dcd9 100644 --- a/libcxx/include/print +++ b/libcxx/include/print @@ -329,7 +329,8 @@ __vprint_unicode([[maybe_unused]] FILE* __stream, } // namespace __print template -_LIBCPP_HIDE_FROM_ABI void print(FILE* __stream, format_string<_Args...> __fmt, 
_Args&&... __args) { +_LIBCPP_HIDE_FROM_ABI void +print(FILE* _LIBCPP_DIAGNOSE_NULLPTR __stream, format_string<_Args...> __fmt, _Args&&... __args) { # if _LIBCPP_HAS_UNICODE if constexpr (__print::__use_unicode_execution_charset) __print::__vprint_unicode(__stream, __fmt.get(), std::make_format_args(__args...), false); @@ -346,7 +347,8 @@ _LIBCPP_HIDE_FROM_ABI void print(format_string<_Args...> __fmt, _Args&&... __arg } template -_LIBCPP_HIDE_FROM_ABI void println(FILE* __stream, format_string<_Args...> __fmt, _Args&&... __args) { +_LIBCPP_HIDE_FROM_ABI void +println(FILE* _LIBCPP_DIAGNOSE_NULLPTR __stream, format_string<_Args...> __fmt, _Args&&... __args) { # if _LIBCPP_HAS_UNICODE // Note the wording in the Standard is inefficient. The output of // std::format is a std::string which is then copied. This solution @@ -361,7 +363,7 @@ _LIBCPP_HIDE_FROM_ABI void println(FILE* __stream, format_string<_Args...> __fmt } template // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563). -_LIBCPP_HIDE_FROM_ABI inline void println(FILE* __stream) { +_LIBCPP_HIDE_FROM_ABI inline void println(FILE* _LIBCPP_DIAGNOSE_NULLPTR __stream) { std::print(__stream, "\n"); } @@ -377,7 +379,8 @@ _LIBCPP_HIDE_FROM_ABI void println(format_string<_Args...> __fmt, _Args&&... __a # if _LIBCPP_HAS_UNICODE template // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563). -_LIBCPP_HIDE_FROM_ABI inline void vprint_unicode(FILE* __stream, string_view __fmt, format_args __args) { +_LIBCPP_HIDE_FROM_ABI inline void +vprint_unicode(FILE* _LIBCPP_DIAGNOSE_NULLPTR __stream, string_view __fmt, format_args __args) { __print::__vprint_unicode(__stream, __fmt, __args, false); } @@ -389,7 +392,8 @@ _LIBCPP_HIDE_FROM_ABI inline void vprint_unicode(string_view __fmt, format_args # endif // _LIBCPP_HAS_UNICODE template // TODO PRINT template or availability markup fires too eagerly (http://llvm.org/PR61563). 
-_LIBCPP_HIDE_FROM_ABI inline void vprint_nonunicode(FILE* __stream, string_view __fmt, format_args __args) { +_LIBCPP_HIDE_FROM_ABI inline void +vprint_nonunicode(FILE* _LIBCPP_DIAGNOSE_NULLPTR __stream, string_view __fmt, format_args __args) { __print::__vprint_nonunicode(__stream, __fmt, __args, false); } diff --git a/libcxx/include/string b/libcxx/include/string index 788af36d67c58..98297d04d0c61 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -1065,13 +1065,15 @@ public: basic_string(nullptr_t) = delete; # endif - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(const _CharT* __s, size_type __n) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(const _CharT* __s, size_type __n) + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "basic_string(const char*, n) detected nullptr"); __init(__s, __n); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string(const _CharT* __s, size_type __n, const _Allocator& __a) + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") : __alloc_(__a) { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "basic_string(const char*, n, allocator) detected nullptr"); __init(__s, __n); @@ -1394,7 +1396,8 @@ public: return append(__sv.data() + __pos, std::min(__n, __sz - __pos)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const value_type* __s, size_type __n); + _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const value_type* __s, size_type __n) + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero"); _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s); _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& append(size_type __n, value_type __c); @@ -1521,8 +1524,9 @@ public: return assign(__sv.data() + __pos, std::min(__n, __sz - __pos)); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const 
value_type* __s, size_type __n); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const value_type* __s); + _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const value_type* __s, size_type __n) + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero"); + _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s); _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& assign(size_type __n, value_type __c); template ::value, int> = 0> @@ -1593,7 +1597,8 @@ public: _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos1, const basic_string& __str, size_type __pos2, size_type __n = npos); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos, const value_type* __s, size_type __n); + _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos, const value_type* __s, size_type __n) + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero"); _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos, const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s); _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& insert(size_type __pos, size_type __n, value_type __c); _LIBCPP_CONSTEXPR_SINCE_CXX20 iterator insert(const_iterator __pos, value_type __c); @@ -1673,8 +1678,10 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& - replace(size_type __pos, size_type __n1, const value_type* __s, size_type __n2); - _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(size_type __pos, size_type __n1, const value_type* __s); + replace(size_type __pos, size_type __n1, const value_type* __s, size_type __n2) + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n2 != 0 && __s == nullptr, " if n2 is not zero"); + _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& + replace(size_type __pos, size_type __n1, const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s); _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& replace(size_type __pos, size_type __n1, size_type __n2, value_type __c); _LIBCPP_HIDE_FROM_ABI 
_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string& @@ -1783,7 +1790,8 @@ public: return std::__str_find(data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type find(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find(): received nullptr"); return std::__str_find(data(), size(), __s, __pos, __n); } @@ -1814,7 +1822,8 @@ public: return std::__str_rfind(data(), size(), __sv.data(), __pos, __sv.size()); } - _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type rfind(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::rfind(): received nullptr"); return std::__str_rfind(data(), size(), __s, __pos, __n); } @@ -1847,7 +1856,8 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_first_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { + find_first_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_first_of(): received nullptr"); return std::__str_find_first_of(data(), size(), __s, __pos, __n); } @@ -1881,7 +1891,8 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_last_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { + find_last_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n 
is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_last_of(): received nullptr"); return std::__str_find_last_of(data(), size(), __s, __pos, __n); } @@ -1915,7 +1926,8 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_first_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { + find_first_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of(data(), size(), __s, __pos, __n); } @@ -1949,7 +1961,8 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX20 size_type - find_last_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT { + find_last_not_of(const value_type* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of(data(), size(), __s, __pos, __n); } @@ -2026,7 +2039,8 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX20 int - compare(size_type __pos1, size_type __n1, const value_type* __s, size_type __n2) const; + compare(size_type __pos1, size_type __n1, const value_type* __s, size_type __n2) const + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n2 != 0 && __s == nullptr, " if n2 is not zero"); // starts_with @@ -3564,7 +3578,8 @@ operator==(const basic_string<_CharT, _Traits, _Allocator>& __lhs, template inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool -operator==(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const _CharT* __rhs) _NOEXCEPT { +operator==(const basic_string<_CharT, _Traits, _Allocator>& __lhs, + const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __rhs) _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__rhs != nullptr, "operator==(basic_string, 
char*): received nullptr"); using _String = basic_string<_CharT, _Traits, _Allocator>; diff --git a/libcxx/include/string_view b/libcxx/include/string_view index 861187c0640e1..f86b2722aca6c 100644 --- a/libcxx/include/string_view +++ b/libcxx/include/string_view @@ -318,8 +318,8 @@ public: _LIBCPP_HIDE_FROM_ABI basic_string_view& operator=(const basic_string_view&) _NOEXCEPT = default; _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view(const _CharT* __s, size_type __len) _NOEXCEPT - : __data_(__s), - __size_(__len) { + _LIBCPP_DIAGNOSE_NULLPTR_IF(__len != 0 && __s == nullptr, " if len is not zero") + : __data_(__s), __size_(__len) { # if _LIBCPP_STD_VER >= 14 // Allocations must fit in `ptrdiff_t` for pointer arithmetic to work. If `__len` exceeds it, the input // range could not have been valid. Most likely the caller underflowed some arithmetic and inadvertently @@ -352,7 +352,7 @@ public: : __data_(ranges::data(__r)), __size_(ranges::size(__r)) {} # endif // _LIBCPP_STD_VER >= 23 - _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view(const _CharT* __s) + _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI basic_string_view(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s) : __data_(__s), __size_(std::__char_traits_length_checked<_Traits>(__s)) {} # if _LIBCPP_STD_VER >= 23 @@ -483,17 +483,19 @@ public: return substr(__pos1, __n1).compare(__sv.substr(__pos2, __n2)); } - _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int compare(const _CharT* __s) const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int + compare(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s) const _NOEXCEPT { return compare(basic_string_view(__s)); } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI int - compare(size_type __pos1, size_type __n1, const _CharT* __s) const { + compare(size_type __pos1, size_type __n1, const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s) const { return substr(__pos1, __n1).compare(basic_string_view(__s)); } _LIBCPP_CONSTEXPR_SINCE_CXX14 
_LIBCPP_HIDE_FROM_ABI int - compare(size_type __pos1, size_type __n1, const _CharT* __s, size_type __n2) const { + compare(size_type __pos1, size_type __n1, const _CharT* __s, size_type __n2) const + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n2 != 0 && __s == nullptr, " if n2 is not zero") { return substr(__pos1, __n1).compare(basic_string_view(__s, __n2)); } @@ -509,13 +511,14 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + find(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find(): received nullptr"); return std::__str_find(data(), size(), __s, __pos, __n); } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find(const _CharT* __s, size_type __pos = 0) const _NOEXCEPT { + find(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find(): received nullptr"); return std::__str_find( data(), size(), __s, __pos, traits_type::length(__s)); @@ -534,13 +537,14 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - rfind(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + rfind(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::rfind(): received nullptr"); return std::__str_rfind(data(), size(), __s, __pos, __n); } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - rfind(const _CharT* __s, size_type __pos = npos) const _NOEXCEPT { + rfind(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::rfind(): received nullptr"); 
return std::__str_rfind( data(), size(), __s, __pos, traits_type::length(__s)); @@ -560,13 +564,14 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + find_first_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_first_of(): received nullptr"); return std::__str_find_first_of(data(), size(), __s, __pos, __n); } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_of(const _CharT* __s, size_type __pos = 0) const _NOEXCEPT { + find_first_of(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_first_of(): received nullptr"); return std::__str_find_first_of( data(), size(), __s, __pos, traits_type::length(__s)); @@ -586,13 +591,14 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + find_last_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_last_of(): received nullptr"); return std::__str_find_last_of(data(), size(), __s, __pos, __n); } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_of(const _CharT* __s, size_type __pos = npos) const _NOEXCEPT { + find_last_of(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_last_of(): received nullptr"); return std::__str_find_last_of( data(), size(), __s, __pos, traits_type::length(__s)); @@ -613,13 +619,14 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX14 
_LIBCPP_HIDE_FROM_ABI size_type - find_first_not_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + find_first_not_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of(data(), size(), __s, __pos, __n); } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_first_not_of(const _CharT* __s, size_type __pos = 0) const _NOEXCEPT { + find_first_not_of(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = 0) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_first_not_of(): received nullptr"); return std::__str_find_first_not_of( data(), size(), __s, __pos, traits_type::length(__s)); @@ -640,13 +647,14 @@ public: } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_not_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT { + find_last_not_of(const _CharT* __s, size_type __pos, size_type __n) const _NOEXCEPT + _LIBCPP_DIAGNOSE_NULLPTR_IF(__n != 0 && __s == nullptr, " if n is not zero") { _LIBCPP_ASSERT_NON_NULL(__n == 0 || __s != nullptr, "string_view::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of(data(), size(), __s, __pos, __n); } _LIBCPP_CONSTEXPR_SINCE_CXX14 _LIBCPP_HIDE_FROM_ABI size_type - find_last_not_of(const _CharT* __s, size_type __pos = npos) const _NOEXCEPT { + find_last_not_of(const _CharT* _LIBCPP_DIAGNOSE_NULLPTR __s, size_type __pos = npos) const _NOEXCEPT { _LIBCPP_ASSERT_NON_NULL(__s != nullptr, "string_view::find_last_not_of(): received nullptr"); return std::__str_find_last_not_of( data(), size(), __s, __pos, traits_type::length(__s)); @@ -661,7 +669,7 @@ public: return !empty() && _Traits::eq(front(), __c); } - constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(const 
value_type* __s) const noexcept { + constexpr _LIBCPP_HIDE_FROM_ABI bool starts_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { return starts_with(basic_string_view(__s)); } @@ -673,7 +681,7 @@ public: return !empty() && _Traits::eq(back(), __c); } - constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(const value_type* __s) const noexcept { + constexpr _LIBCPP_HIDE_FROM_ABI bool ends_with(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const noexcept { return ends_with(basic_string_view(__s)); } # endif @@ -683,7 +691,9 @@ public: constexpr _LIBCPP_HIDE_FROM_ABI bool contains(value_type __c) const noexcept { return find(__c) != npos; } - constexpr _LIBCPP_HIDE_FROM_ABI bool contains(const value_type* __s) const { return find(__s) != npos; } + constexpr _LIBCPP_HIDE_FROM_ABI bool contains(const value_type* _LIBCPP_DIAGNOSE_NULLPTR __s) const { + return find(__s) != npos; + } # endif private: diff --git a/libcxx/test/libcxx/algorithms/specialized.algorithms/nonnull.verify.cpp b/libcxx/test/libcxx/algorithms/specialized.algorithms/nonnull.verify.cpp new file mode 100644 index 0000000000000..4d720fb0c8459 --- /dev/null +++ b/libcxx/test/libcxx/algorithms/specialized.algorithms/nonnull.verify.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// Ensure that APIs which take a pointer are diagnosing passing a nullptr to them + +#include + +#include "test_macros.h" + +void func() { + using Arr = int[1]; + int* const np = nullptr; + +#if TEST_STD_VER >= 20 + Arr* const np2 = nullptr; + std::construct_at(np); // expected-warning {{null passed}} + std::destroy_at(np2); // expected-warning {{null passed}} +#endif + + std::destroy_at(np); // expected-warning {{null passed}} +} diff --git a/libcxx/test/libcxx/input.output/iostream.format/print.fun/nonnull.verify.cpp b/libcxx/test/libcxx/input.output/iostream.format/print.fun/nonnull.verify.cpp new file mode 100644 index 0000000000000..afa0bca11ca91 --- /dev/null +++ b/libcxx/test/libcxx/input.output/iostream.format/print.fun/nonnull.verify.cpp @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: libcpp-has-no-unicode + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// Ensure that APIs which take a FILE* are diagnosing passing a nullptr to them + +#include + +void func() { + std::print(nullptr, ""); // expected-warning {{null passed}} + std::println(nullptr, ""); // expected-warning {{null passed}} + std::println(nullptr); // expected-warning {{null passed}} + std::vprint_unicode(nullptr, "", std::make_format_args()); // expected-warning {{null passed}} + std::vprint_nonunicode(nullptr, "", std::make_format_args()); // expected-warning {{null passed}} +} diff --git a/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp b/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp index d61896277afd4..f428c49fd05f4 100644 --- a/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp +++ b/libcxx/test/libcxx/strings/basic.string/nonnull.verify.cpp @@ -8,7 +8,10 @@ // UNSUPPORTED: c++03 -// Ensure that APIs which take a CharT* (and no size for it) are diagnosing passing a nullptr to them +// Ensure that APIs which take a CharT* are diagnosing passing a nullptr to them + +// Clang 19 and AppleClang don't have diagnose_if with diagnostic flags +// UNSUPPORTED: clang-19, apple-clang-17 #include @@ -20,6 +23,7 @@ void func() { std::string str2(np, std::allocator{}); // expected-warning {{null passed}} str2 = np; // expected-warning {{null passed}} str2 += np; // expected-warning {{null passed}} + str2.assign(np); // expected-warning {{null passed}} str2.append(np); // expected-warning {{null passed}} str2.insert(0, np); // expected-warning {{null passed}} str2.find(np); // expected-warning {{null passed}} @@ -30,6 +34,8 @@ void func() { str2.find_last_not_of(np); // expected-warning {{null passed}} str2.compare(np); // expected-warning {{null passed}} str2.compare(0, 0, np); // 
expected-warning {{null passed}} + str2.replace(0, 0, np); // expected-warning {{null passed}} + (void)(str2 == np); // expected-warning {{null passed}} #if TEST_STD_VER >= 20 str2.starts_with(np); // expected-warning {{null passed}} @@ -38,4 +44,21 @@ void func() { #if TEST_STD_VER >= 23 str2.contains(np); // expected-warning {{null passed}} #endif + + // clang-format off + // These diagnostics are issued via diagnose_if, so we want to check the full description + std::string str3(nullptr, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + std::string str4(nullptr, 1, std::allocator{}); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str4.find(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str4.rfind(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str4.find_first_of(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str4.find_last_of(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str4.find_first_not_of(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str4.find_last_not_of(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str4.compare(0, 0, nullptr, 1); // expected-warning {{null passed to callee that requires a non-null argument if n2 is not zero}} + str4.assign(nullptr, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str4.append(nullptr, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str4.insert(0, nullptr, 1); // expected-warning {{null passed to callee that 
requires a non-null argument if n is not zero}} + str4.replace(0, 0, nullptr, 1); // expected-warning {{null passed to callee that requires a non-null argument if n2 is not zero}} + // clang-format on } diff --git a/libcxx/test/libcxx/strings/string.view/assert.ctor.pointer.pass.cpp b/libcxx/test/libcxx/strings/string.view/assert.ctor.pointer.pass.cpp index 1810ec1ca8ac9..f358b5efd0df2 100644 --- a/libcxx/test/libcxx/strings/string.view/assert.ctor.pointer.pass.cpp +++ b/libcxx/test/libcxx/strings/string.view/assert.ctor.pointer.pass.cpp @@ -14,6 +14,9 @@ // Construct a string_view from a null pointer // constexpr basic_string_view( const CharT* s ); +// We're testing for assertions here, so let's not diagnose the misuses at compile time +// ADDITIONAL_COMPILE_FLAGS: -Wno-nonnull + #include #include "check_assertion.h" diff --git a/libcxx/test/libcxx/strings/string.view/nonnull.verify.cpp b/libcxx/test/libcxx/strings/string.view/nonnull.verify.cpp new file mode 100644 index 0000000000000..316c9828e0de5 --- /dev/null +++ b/libcxx/test/libcxx/strings/string.view/nonnull.verify.cpp @@ -0,0 +1,53 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03 + +// Ensure that APIs which take a CharT* are diagnosing passing a nullptr to them + +// Clang 19 and AppleClang don't have diagnose_if with diagnostic flags +// UNSUPPORTED: clang-19, apple-clang-17 + +#include + +#include "test_macros.h" + +void func() { + const char* const np = nullptr; + std::string_view str1(np); // expected-warning {{null passed}} + str1 = np; // expected-warning {{null passed}} + str1.find(np); // expected-warning {{null passed}} + str1.rfind(np); // expected-warning {{null passed}} + str1.find_first_of(np); // expected-warning {{null passed}} + str1.find_last_of(np); // expected-warning {{null passed}} + str1.find_first_not_of(np); // expected-warning {{null passed}} + str1.find_last_not_of(np); // expected-warning {{null passed}} + str1.compare(np); // expected-warning {{null passed}} + str1.compare(0, 0, np); // expected-warning {{null passed}} + (void)(str1 == np); // expected-warning {{null passed}} + +#if TEST_STD_VER >= 20 + str1.starts_with(np); // expected-warning {{null passed}} + str1.ends_with(np); // expected-warning {{null passed}} +#endif +#if TEST_STD_VER >= 23 + str1.contains(np); // expected-warning {{null passed}} +#endif + + // clang-format off + // These diagnostics are issued via diagnose_if, so we want to check the full description + std::string_view str2(nullptr, 1); // expected-warning {{null passed to callee that requires a non-null argument if len is not zero}} + str2.find(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str2.rfind(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str2.find_first_of(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + 
str2.find_last_of(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str2.find_first_not_of(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str2.find_last_not_of(nullptr, 0, 1); // expected-warning {{null passed to callee that requires a non-null argument if n is not zero}} + str2.compare(0, 0, nullptr, 1); // expected-warning {{null passed to callee that requires a non-null argument if n2 is not zero}} + // clang-format on +} From 193de1a566aa5a10a6f63f6f7c7fca2e52a7d75b Mon Sep 17 00:00:00 2001 From: Vassil Vassilev Date: Sat, 19 Jul 2025 11:28:26 +0000 Subject: [PATCH 439/813] [clang-repl] Spell out the enum types to appease some bots. This change is a follow-up of llvm/llvm-project#148701 where clang-s390x-linux and clang-s390x-linux-lnt failed. --- clang/test/Interpreter/pretty-print.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/test/Interpreter/pretty-print.cpp b/clang/test/Interpreter/pretty-print.cpp index 0882a3f9e462e..1952cc7428e85 100644 --- a/clang/test/Interpreter/pretty-print.cpp +++ b/clang/test/Interpreter/pretty-print.cpp @@ -28,15 +28,15 @@ S4{} // CHECK-NEXT: (S4) @0x{{[0-9a-f]+}} // TODO-CHECK-NEXT: ~S4() -enum Enum{ e1 = -12, e2, e3=33, e4, e5 = 33}; +enum Enum : int { e1 = -12, e2, e3=33, e4, e5 = 33}; e2 // CHECK-NEXT: (Enum) (e2) : int -11 ::e1 // CHECK-NEXT: (Enum) (e1) : int -12 -enum class Color { R = 0, G, B }; +enum class Color : unsigned int { R = 0, G, B }; Color::R -// CHECK-NEXT: (Color) (Color::R) : int 0 +// CHECK-NEXT: (Color) (Color::R) : unsigned int 0 // Lambdas. 
From 3866e4e7f85aacd0e47978b22084ed00ebcd0531 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Sat, 19 Jul 2025 17:16:21 +0530 Subject: [PATCH 440/813] [NVPTX] Add im2colw/w128 modes support to TMA intrinsics (#148863) This patch adds support for the im2col-w/w128 and scatter/gather modes for TMA Copy and Prefetch intrinsics, completing support for all the available modes. These are lowered through tablegen, building on top of earlier patches. * lit tests are added for all the combinations and verified with a 12.8 ptxas executable. * Documentation is updated in the NVPTXUsage.rst file. Signed-off-by: Durgadoss R --- llvm/docs/NVPTXUsage.rst | 161 +++++- llvm/include/llvm/IR/IntrinsicsNVVM.td | 78 ++- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 1 + llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 171 +++++- .../cp-async-bulk-tensor-g2s-cta-sm100.ll | 195 +++++++ .../cp-async-bulk-tensor-g2s-cta-sm100a.ll | 152 +++++ .../cp-async-bulk-tensor-g2s-cta-sm90.ll | 353 ++++++++++++ .../NVPTX/cp-async-bulk-tensor-g2s-gather4.ll | 174 ++++++ .../NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll | 524 ++++++++++++++++++ .../cp-async-bulk-tensor-g2s-im2colw128.ll | 524 ++++++++++++++++++ .../cp-async-bulk-tensor-prefetch-sm100a.ll | 171 ++++++ .../cp-async-bulk-tensor-s2g-scatter4.ll | 52 ++ 12 files changed, 2527 insertions(+), 29 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll diff --git 
a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 11017fe4e01b4..d28eb6860c33a 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1072,6 +1072,8 @@ Syntax: declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group) + Overview: """"""""" @@ -1082,7 +1084,13 @@ global memory to shared::cluster memory (indicated by the ``g2s`` prefix) in ``tile`` mode. In tile mode, the multi-dimensional layout of the source tensor is preserved at the destination. The dimension of the tensor data ranges from 1d to 5d with the coordinates specified -by the ``i32 %d0 ... i32 %d4`` arguments. +by the ``i32 %d0 ... i32 %d4`` arguments. In ``tile.gather4`` mode, +four rows in a 2D tensor are combined to form a single 2D destination +tensor. The first coordinate ``i32 %x0`` denotes the column index +followed by four coordinates indicating the four row-indices. +So, this mode takes a total of 5 coordinates as input arguments. +For more information on ``gather4`` mode, refer PTX ISA +``_. * The last three arguments to these intrinsics are flags indicating support for multicast, cache_hint and cta_group::1/2 @@ -1116,10 +1124,18 @@ Syntax: .. 
code-block:: llvm - declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) + + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) + Overview: """"""""" @@ -1131,10 +1147,105 @@ in ``im2col`` mode. In im2col mode, some dimensions of the source tensor are unrolled into a single dimensional column at the destination. In this mode, the tensor has to be at least three-dimensional. 
Along with the tensor coordinates, im2col offsets are also specified (denoted by -``i16 im2col0...i16 %im2col2``). The number of im2col offsets is two less -than the number of dimensions of the tensor operation. The last three arguments -to these intrinsics are flags, with the same functionality as described -in the ``tile`` mode intrinsics above. +``i16 im2col0...i16 %im2col2``). For the ``im2col`` mode, the number of offsets +is two less than the number of dimensions of the tensor operation. For the +``im2col.w`` and ``im2col.w.128`` mode, the number of offsets is always 2, +denoted by ``i16 %wHalo`` and ``i16 %wOffset`` arguments. For more information +on ``im2col.w`` and ``im2col.w.128`` modes, refer PTX ISA +``_. + +The last three arguments to these intrinsics are flags, with the same functionality +as described in the ``tile`` mode intrinsics above. + +For more information, refer PTX ISA +``_. + +'``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(..., i32 %d0, i32 %d1, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) 
+ + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch) + +Overview: +""""""""" + +The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.[1-5]d``' intrinsics +correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.*`` +set of PTX instructions. These instructions initiate an asynchronous +copy of tensor data from global memory to shared::cta memory in +``tile`` mode. In tile mode, the multi-dimensional layout of the +source tensor is preserved at the destination. The dimension of the +tensor data ranges from 1d to 5d with the coordinates specified +by the ``i32 %d0 ... i32 %d4`` arguments. In ``tile.gather4`` mode, +four rows in a 2D tensor are combined to form a single 2D destination +tensor. The first coordinate ``i32 %x0`` denotes the column index +followed by four coordinates indicating the four row-indices. +So, this mode takes a total of 5 coordinates as input arguments. +For more information on ``gather4`` mode, refer PTX ISA +``_. + +* The last argument to these intrinsics is a boolean flag + indicating support for cache_hint. This flag argument must + be a compile-time constant. When set, it indicates a valid + cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint`` + variant of the PTX instruction. + +For more information, refer PTX ISA +``_. + +'``llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...) 
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...) + + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) + + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) + +Overview: +""""""""" + +The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.[3-5]d``' intrinsics +correspond to the ``cp.async.bulk.tensor.[1-5]d.shared::cta.global.*`` +set of PTX instructions. These instructions initiate an asynchronous copy +of tensor data from global memory to shared::cta memory in ``im2col`` mode. +In im2col mode, some dimensions of the source tensor are unrolled into a +single dimensional column at the destination. In this mode, the tensor has +to be at least three-dimensional. Along with the tensor coordinates, im2col +offsets are also specified (denoted by ``i16 im2col0...i16 %im2col2``). +For the ``im2col`` mode, the number of offsets is two less than the number +of dimensions of the tensor operation. For the ``im2col.w`` and ``im2col.w.128`` +mode, the number of offsets is always 2, denoted by ``i16 %wHalo`` and +``i16 %wOffset`` arguments. 
For more information on ``im2col.w`` and +``im2col.w.128`` modes, refer PTX ISA +``_. + +* The last argument to these intrinsics is a boolean flag + indicating support for cache_hint. This flag argument must + be a compile-time constant. When set, it indicates a valid + cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint`` + variant of the PTX instruction. For more information, refer PTX ISA ``_. @@ -1153,6 +1264,8 @@ Syntax: declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch) + Overview: """"""""" @@ -1162,6 +1275,12 @@ These instructions initiate an asynchronous copy of tensor data from shared::cta to global memory (indicated by the ``s2g`` prefix) in ``tile`` mode. The dimension of the tensor data ranges from 1d to 5d with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments. +In ``tile.scatter4`` mode, a single 2D source tensor is divided into +four rows in the 2D destination tensor. The first coordinate ``i32 %x0`` +denotes the column index followed by four coordinates indicating the +four row-indices. So, this mode takes a total of 5 coordinates as input arguments. +For more information on ``scatter4`` mode, refer PTX ISA +``_. * The last argument to these intrinsics is a boolean flag indicating support for cache_hint. This flag argument must @@ -1214,6 +1333,8 @@ Syntax: declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) 
+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tensor_map, i32 %x0, i32 %y0, i32 %y1, i32 %y2, i32 %y3, i64 %ch, i1 %flag_ch) + Overview: """"""""" @@ -1225,6 +1346,13 @@ multi-dimensional layout of the source tensor is preserved at the destination. The dimension of the tensor data ranges from 1d to 5d with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments. +In ``tile.gather4`` mode, four rows in the 2-dimnesional source tensor are +fetched to the L2 cache. The first coordinate ``i32 %x0`` denotes the column index +followed by four coordinates indicating the four row-indices. So, this mode takes +a total of 5 coordinates as input arguments. +For more information on ``gather4`` mode, refer PTX ISA +``_. + * The last argument to these intrinsics is a boolean flag indicating support for cache_hint. This flag argument must be a compile-time constant. When set, it indicates a valid @@ -1246,6 +1374,14 @@ Syntax: declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) + + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) 
+ declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...) + Overview: """"""""" @@ -1256,9 +1392,16 @@ of tensor data from global memory to the L2 cache. In im2col mode, some dimensions of the source tensor are unrolled into a single dimensional column at the destination. In this mode, the tensor has to be at least three-dimensional. Along with the tensor coordinates, im2col offsets are -also specified (denoted by ``i16 im2col0...i16 %im2col2``). The number -of im2col offsets is two less than the number of dimensions of the tensor -operation. The last argument to these intrinsics is a boolean flag, with +also specified (denoted by ``i16 im2col0...i16 %im2col2``). For ``im2col`` +mode, the number of offsets is two less than the number of dimensions of +the tensor operation. For the ``im2col.w`` and ``im2col.w.128`` modes, +the number of offsets is always 2, denoted by ``i16 %wHalo`` and +``i16 %wOffset`` arguments. For more information on ``im2col.w`` and +``im2col.w.128`` modes, refer PTX ISA +``_. + + +The last argument to these intrinsics is a boolean flag, with the same functionality as described in the ``tile`` mode intrinsics above. 
For more information, refer PTX ISA diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 0375f29ad8906..5ddc14445908b 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2024,9 +2024,7 @@ foreach dim = 1...5 in { tensor_dim_args, // actual tensor dims [llvm_i64_ty]), // cache_hint [llvm_i1_ty], // Flag for cache_hint - [IntrConvergent, - ReadOnly>, ReadOnly>, - NoCapture>, NoCapture>]>; + [IntrConvergent, ReadOnly>, ReadOnly>]>; // Intrinsics for TMA Copy with reduction foreach red_op = ["add", "min", "max", "inc", "dec", "and", "or", "xor"] in @@ -2037,18 +2035,31 @@ foreach dim = 1...5 in { tensor_dim_args, // actual tensor dims [llvm_i64_ty]), // cache_hint [llvm_i1_ty], // Flag for cache_hint - [IntrConvergent, ReadOnly>, ReadOnly>, - NoCapture>, NoCapture>]>; + [IntrConvergent, ReadOnly>, ReadOnly>]>; } } +// TMA S2G tile::scatter4 +def int_nvvm_cp_async_bulk_tensor_s2g_tile_scatter4_2d + : DefaultAttrsIntrinsicFlags<[], + !listconcat([llvm_shared_ptr_ty, // src_smem_ptr + llvm_ptr_ty], // tensormap_ptr + !listsplat(llvm_i32_ty, 5), // dims + [llvm_i64_ty]), // cache_hint + [llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, ReadOnly>, ReadOnly>]>; + // TMA Tensor Copy Intrinsics: G2S -> From Global to Shared memory variants foreach dim = 1...5 in { defvar tensor_dim_args = !listsplat(llvm_i32_ty, dim); - foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in { + foreach mode = !if(!ge(dim, 3), ["tile", "im2col", "im2col_w", "im2col_w_128"], ["tile"]) in { defvar is_im2col = !eq(mode, "im2col"); - defvar num_im2col_offsets = !if(is_im2col, !add(dim, -2), 0); + defvar is_im2colw = !or(!eq(mode, "im2col_w"), !eq(mode, "im2col_w_128")); + + // For im2col_w/w128 modes, the num_offsets is always 2. + // For im2col mode, the num_offsets is (dim - 2). 
+ defvar num_im2col_offsets = !if(is_im2colw, 2, !if(is_im2col, !add(dim, -2), 0)); defvar im2col_offsets_args = !listsplat(llvm_i16_ty, num_im2col_offsets); defvar g2s_params = !listconcat( @@ -2079,11 +2090,60 @@ foreach dim = 1...5 in { im2col_offsets_args, // im2col offsets [llvm_i64_ty]), // cache_hint [llvm_i1_ty], // Flag for cache_hint - [IntrConvergent, - ReadOnly>, NoCapture>]>; + [IntrConvergent, ReadOnly>]>; + + def int_nvvm_cp_async_bulk_tensor_g2s_cta_ # mode # _ # dim # d : + DefaultAttrsIntrinsicFlags<[], + !listconcat([llvm_shared_ptr_ty, // dst_ptr + llvm_shared_ptr_ty, // mbarrier_ptr + llvm_ptr_ty], // tensormap_ptr + tensor_dim_args, // actual tensor dims + im2col_offsets_args, // im2col offsets + [llvm_i64_ty]), // cache_hint + [llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, WriteOnly>, ReadOnly>]>; } } +// TMA copy for tile::gather4 +def int_nvvm_cp_async_bulk_tensor_g2s_tile_gather4_2d + : DefaultAttrsIntrinsicFlags<[], + !listconcat( + [llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr + llvm_shared_ptr_ty, // mbarrier_ptr + llvm_ptr_ty], // tensormap_ptr + !listsplat(llvm_i32_ty, 5), // co-ordinates + [llvm_i16_ty, // cta_mask + llvm_i64_ty]), // cache_hint + [llvm_i1_ty, // Flag for cta_mask + llvm_i1_ty, // Flag for cache_hint + llvm_i32_ty], // Flag for cta_group + [IntrConvergent, + WriteOnly>, ReadOnly>, + // Allowed values for cta_group are {0,1,2} i.e [0, 3). 
+ Range, 0, 3>]>; + +def int_nvvm_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d + : DefaultAttrsIntrinsicFlags<[], + !listconcat( + [llvm_shared_ptr_ty, // dst_shared_ptr + llvm_shared_ptr_ty, // mbarrier_ptr + llvm_ptr_ty], // tensormap_ptr + !listsplat(llvm_i32_ty, 5), // co-ordinates + [llvm_i64_ty]), // cache_hint + [llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, + WriteOnly>, ReadOnly>]>; + +// TMA prefetch for tile::gather4 +def int_nvvm_cp_async_bulk_tensor_prefetch_tile_gather4_2d + : DefaultAttrsIntrinsicFlags<[], + !listconcat([llvm_ptr_ty], // tensormap_ptr + !listsplat(llvm_i32_ty, 5), // co-ordinates + [llvm_i64_ty]), // cache_hint + [llvm_i1_ty], // Flag for cache_hint + [IntrConvergent, ReadOnly>]>; + // Intrinsics for Prefetch and Prefetchu let IntrProperties = [IntrArgMemOnly, ReadOnly>, NoCapture>] in { foreach level = ["L1", "L2"] in { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index a5bb83dfadb84..b5df4c6de7fd8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -131,6 +131,7 @@ def hasHWROT32 : Predicate<"Subtarget->hasHWROT32()">; def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">; def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; +def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">; def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">; class hasPTX: Predicate<"Subtarget->getPTXVersion() >= " # version>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 70150bdfc8d16..f329f48c1b96b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -600,12 +600,23 @@ defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR; // TMA Async Bulk Tensor Copy Functions 
//------------------------------------- -class TMA_DIMS_UTIL { +class TMA_DIMS_UTIL { // For example, when 'dim' is 3, this generates: // an ins_dag: B32:$d0, B32:$d1, B32:$d2 // with base_str: $d0, $d1, $d2 dag ins_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i)); string base_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); + + // Tile::Gather4/scatter4 actually operate on a 2D tensor, + // though they take 5 co-ordinates. + // + // The scatter-gather happens over 4 rows with a fixed + // column-index. The first co-ordinate represents the + // col-index followed by four row-indices. + int num_dims = !cond( + !eq(mode, "tile_scatter4") : 2, + !eq(mode, "tile_gather4") : 2, + true : dim); // for all other modes } class TMA_IM2COL_UTIL { @@ -692,14 +703,138 @@ foreach dim = [1, 2, 3, 4, 5] in { } } +multiclass TMA_TENSOR_G2S_INTR pred = []> { + defvar dims_dag = TMA_DIMS_UTIL.ins_dag; + defvar dims_str = TMA_DIMS_UTIL.base_str; + defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; + + defvar im2col_dag = TMA_IM2COL_UTIL.ins_dag; + defvar im2col_str = TMA_IM2COL_UTIL.base_str; + defvar asm_str = !if(!empty(im2col_str), + asm_str_base, + asm_str_base # ", {{" # im2col_str # "}}"); + + defvar dim_val = TMA_DIMS_UTIL.num_dims; + defvar inst_name = "cp.async.bulk.tensor" + # "." # dim_val # "d" + # "." # "shared::cluster.global" + # "." # !subst("_", "::", mode) + # "." 
# "mbarrier::complete_tx::bytes"; + defvar intr = !cast( + "int_nvvm_cp_async_bulk_tensor_g2s_" # mode # "_" # dim_val # "d"); + + defvar ins_dag = !con( + (ins ADDR:$dst, ADDR:$mbar, B64:$tmap), + dims_dag, im2col_dag, + (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)); + + defvar intr_dag_base = !con( + (intr addr:$dst, addr:$mbar, B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B16:$mc, B64:$ch)); + defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg)); + defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg)); + defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg)); + defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg)); + + def "" : NVPTXInst<(outs), ins_dag, + inst_name # asm_str # ";", + [intr_dag_no_hints]>, + Requires; + def _MC : NVPTXInst<(outs), ins_dag, + inst_name # ".multicast::cluster" # asm_str # ", $mc;", + [intr_dag_with_mc]>, + Requires; + def _CH : NVPTXInst<(outs), ins_dag, + inst_name # ".L2::cache_hint" # asm_str # ", $ch;", + [intr_dag_with_ch]>, + Requires; + def _MC_CH : NVPTXInst<(outs), ins_dag, + inst_name # ".multicast::cluster.L2::cache_hint" # asm_str # ", $mc, $ch;", + [intr_dag_with_mc_ch]>, + Requires; +} +foreach dim = 3...5 in { + foreach mode = ["im2col_w", "im2col_w_128"] in { + defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D" + : TMA_TENSOR_G2S_INTR; + } +} +defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4", + [hasTMACTAGroupSupport]>; + +multiclass TMA_TENSOR_G2S_CTA_INTR pred = []> { + defvar dims_dag = TMA_DIMS_UTIL.ins_dag; + defvar dims_str = TMA_DIMS_UTIL.base_str; + defvar asm_str_base = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; + + defvar im2col_dag = TMA_IM2COL_UTIL.ins_dag; + defvar im2col_str = TMA_IM2COL_UTIL.base_str; + defvar asm_str = !if(!empty(im2col_str), + asm_str_base, + asm_str_base # ", {{" # im2col_str # "}}"); + + defvar ins_dag = !con( + (ins ADDR:$dst, ADDR:$mbar, B64:$tmap), + 
dims_dag, im2col_dag, + (ins B64:$ch)); + + defvar dim_val = TMA_DIMS_UTIL.num_dims; + defvar intr = !cast( + "int_nvvm_cp_async_bulk_tensor_g2s_cta_" # mode # "_" # dim_val # "d"); + defvar intr_dag = !con( + (intr addr:$dst, addr:$mbar, B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B64:$ch, 0)); + defvar intr_dag_with_ch = !con( + (intr addr:$dst, addr:$mbar, B64:$tmap), + !setdagop(dims_dag, intr), + !setdagop(im2col_dag, intr), + (intr B64:$ch, -1)); + defvar inst_name = "cp.async.bulk.tensor" + # "." # dim_val # "d" + # "." # "shared::cta.global" + # "." # !subst("_", "::", mode) + # "." # "mbarrier::complete_tx::bytes"; + + def "" : NVPTXInst<(outs), ins_dag, + inst_name # asm_str # ";", + [intr_dag]>, + Requires; + def _CH : NVPTXInst<(outs), ins_dag, + inst_name # ".L2::cache_hint" # asm_str # ", $ch;", + [intr_dag_with_ch]>, + Requires; +} +foreach dim = 1...5 in { + defm TMA_G2S_CTA_TILE_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR, hasSM<90>]>; +} +foreach dim = 3...5 in { + defm TMA_G2S_CTA_IM2COL_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR, hasSM<90>]>; + + defm TMA_G2S_CTA_IM2COL_W_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR, hasSM<100>]>; + + defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D" + : TMA_TENSOR_G2S_CTA_INTR; +} +defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4", + [hasPTX<86>, hasSM<100>]>; + multiclass TMA_TENSOR_S2G_INTR pred = [hasPTX<80>, hasSM<90>]> { defvar dims_dag = TMA_DIMS_UTIL.ins_dag; defvar dims_str = TMA_DIMS_UTIL.base_str; defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]"; + defvar dim_val = TMA_DIMS_UTIL.num_dims; defvar intr = !cast( - "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # d); + "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim_val # "d"); + defvar intr_dag = !con((intr addr:$src, B64:$tmap), !setdagop(dims_dag, intr), (intr B64:$ch, 0)); @@ -707,11 +842,13 @@ multiclass TMA_TENSOR_S2G_INTR; } } +defm TMA_S2G_TILE_SCATTER4_2D : 
TMA_TENSOR_S2G_INTR<5, "tile_scatter4", + [hasTMACTAGroupSupport]>; def TMAReductionFlags : Operand { let PrintMethod = "printTmaReductionMode"; @@ -786,13 +925,14 @@ multiclass TMA_TENSOR_PREFETCH_INTR.num_dims; defvar inst_name = "cp.async.bulk.prefetch.tensor" - # "." # dim # "d" + # "." # dim_val # "d" # "." # "L2.global" - # "." # mode; + # "." # !subst("_", "::", mode); defvar intr = !cast( - "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # d); + "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim_val # "d"); defvar ins_dag = !con((ins B64:$tmap), dims_dag, @@ -818,10 +958,19 @@ multiclass TMA_TENSOR_PREFETCH_INTR; } } +foreach dim = 3...5 in { + foreach mode = ["im2col_w", "im2col_w_128"] in { + defvar suffix = !toupper(mode) # "_" # dim # "D"; + defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR; + } +} +defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4", + [hasTMACTAGroupSupport]>; //Prefetch and Prefetchu diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll new file mode 100644 index 0000000000000..843446a658626 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) 
%bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; 
CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.4d(ptr 
addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], 
[%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr 
addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: 
ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_tile_gather4_2d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile::gather4.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.gather4.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0) + ret 
void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll new file mode 100644 index 0000000000000..9b4858036fca6 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm100a.ll @@ -0,0 +1,152 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: 
ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_3]; +; 
CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_1]; +; CHECK-PTX64-NEXT: 
ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_4]; 
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, 
[test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_2]; +; 
CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_w128_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.w.128.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll new file mode 100644 index 0000000000000..432540594c790 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-cta-sm90.ll @@ -0,0 +1,353 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86| %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %f1); + +define void @cp_async_bulk_tensor_g2s_cta_tile_1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_1d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, 
[cp_async_bulk_tensor_g2s_cta_tile_1d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_4]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_1d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_1d_param_4]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i64 %ch, i1 0) + ret void +} + +define void @cp_async_bulk_tensor_g2s_cta_tile_2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 
%d1, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<3>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 0) + ret void +} + +define void @cp_async_bulk_tensor_g2s_cta_tile_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_3d( +; 
CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 0) + ret void +} + +define void @cp_async_bulk_tensor_g2s_cta_tile_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_0]; +; CHECK-PTX64-NEXT: 
ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i64 %ch, i1 0) + ret void +} + +define void @cp_async_bulk_tensor_g2s_cta_tile_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_cta_tile_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], 
[%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_cta_tile_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_cta_tile_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, 
i64 %ch, i1 0) + ret void +} + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 %f1); + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 0) + ret void 
+} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; 
+; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i64 %ch, i1 0) + ret void +} + +define void @test_cp_async_bulk_tensor_g2s_cta_im2col_5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 
%d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_cta_im2col_5d( +; CHECK-PTX-SHARED32: { +; 
CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_cta_im2col_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cta.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 
%im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.cta.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i64 %ch, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll new file mode 100644 index 0000000000000..ef4a8fb6ca72f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll @@ -0,0 +1,174 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d +define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, 
[test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: 
+; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 
%mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1 +define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_7]; +; 
CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: 
ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, 
i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2 +define void @test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, 
%r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_gather4_2d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, 
%r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile::gather4.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.gather4.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll new file mode 100644 index 0000000000000..112dab1964065 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw.ll @@ -0,0 +1,524 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; 
RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d +define void @cp_async_bulk_tensor_g2s_im2colw_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_4]; +; 
CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: 
ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 
%d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_3d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_3d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, 
[cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_3d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; 
CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_3d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, 
i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d +define void @cp_async_bulk_tensor_g2s_im2colw_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, 
[cp_async_bulk_tensor_g2s_im2colw_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: 
ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: 
cp_async_bulk_tensor_g2s_im2colw_4d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_4d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_4d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; 
CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_4d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 
%rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_4d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d +define void @cp_async_bulk_tensor_g2s_im2colw_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, 
[cp_async_bulk_tensor_g2s_im2colw_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, 
[cp_async_bulk_tensor_g2s_im2colw_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_5d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, 
[cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: 
ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg1_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + 
tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_5d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_10]; +; CHECK-PTX64-NEXT: 
ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_5d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, 
[cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_5d_cg2_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 
0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll new file mode 100644 index 0000000000000..54e861eca30cc --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-im2colw128.ll @@ -0,0 +1,524 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d +define void @cp_async_bulk_tensor_g2s_im2colw_128_3d(ptr addrspace(7) %d, ptr 
addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], 
[%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, 
[cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_2]; +; 
CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 
1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_9]; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, 
[cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_3d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d +define void 
@cp_async_bulk_tensor_g2s_im2colw_128_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, 
%r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, 
%r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, 
[cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 
%rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, 
{%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_4]; +; CHECK-PTX64-NEXT: 
ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, 
[cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_4d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d +define void @cp_async_bulk_tensor_g2s_im2colw_128_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_8]; +; 
CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_5]; +; 
CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr 
addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 0) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1 +define void @cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, 
[cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_11]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, 
[cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg1_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 
%d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + + ret void +} + +; CHECK-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2 +define void @cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_11]; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_7]; +; CHECK-PTX-SHARED32-NEXT: 
ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2colw_128_5d_cg2_param_11]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col::w::128.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.w.128.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll new file mode 100644 index 0000000000000..6bf8f03f99ee1 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch-sm100a.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void 
@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 %f1); +declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %f1); + +define void @test_cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; 
CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + 
ret void +} + +define void @test_cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: 
ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.4d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + ret void +} + +define void 
@test_cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_prefetch_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_prefetch_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_prefetch_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; 
CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_prefetch_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_prefetch_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_prefetch_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_prefetch_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_prefetch_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_prefetch_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_prefetch_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_prefetch_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_prefetch_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128 [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col::w::128.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 
%wHalo, i16 %wOffset, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.w.128.5d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %wHalo, i16 %wOffset, i64 %ch, i1 1) + ret void +} + +define void @test_cp_async_bulk_tensor_prefetch_tile_gather4_2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.gather4.2d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll new file mode 100644 index 0000000000000..2ef44ff643bfe --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g-scatter4.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %s, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 %flag); + +; CHECK-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d +define void @cp_async_bulk_tensor_s2g_tile_scatter4_2d(i32 %flag, ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch) { +; CHECK-PTX64-LABEL: 
cp_async_bulk_tensor_s2g_tile_scatter4_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_tensor_s2g_tile_scatter4_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_5]; +; 
CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_scatter4_2d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile::scatter4.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.scatter4.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i64 %ch, i1 1) + + ret void +} From 802ea0eb78f7c974d4097c38587f4c207451d7ee Mon Sep 17 00:00:00 2001 From: Guy David <49722543+guy-david@users.noreply.github.com> Date: Sat, 19 Jul 2025 15:00:21 +0300 Subject: [PATCH 441/813] [Support] System include SipHash.h (#149499) A regular include may not search the system include path. 
--- llvm/lib/Support/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index a579eaf7d953d..10b6101d73277 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -380,7 +380,7 @@ if(LLVM_WITH_Z3) ) endif() -target_include_directories(LLVMSupport SYSTEM +target_include_directories(LLVMSupport PRIVATE ${LLVM_THIRD_PARTY_DIR}/siphash/include - ) +) From 65bde89c9a081eab1e8102b06a46d445e1320bd8 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Sat, 19 Jul 2025 15:13:36 +0200 Subject: [PATCH 442/813] [CIR] Upstream CompoundLiteralExpr for Scalar (#148943) Upstream CompoundLiteralExpr for Scalar as a prerequisite for CompoundLiteralExpr for ComplexType --- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 61 +++++++++++++ clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 14 +++ clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 4 + clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 2 + clang/lib/CIR/CodeGen/CIRGenFunction.h | 8 ++ clang/test/CIR/CodeGen/compound_literal.cpp | 99 +++++++++++++++++++++ 6 files changed, 188 insertions(+) create mode 100644 clang/test/CIR/CodeGen/compound_literal.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index d63c18fc5056b..2a998cc39dba2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1054,6 +1054,67 @@ LValue CIRGenFunction::emitMemberExpr(const MemberExpr *e) { llvm_unreachable("Unhandled member declaration!"); } +/// Evaluate an expression into a given memory location. +void CIRGenFunction::emitAnyExprToMem(const Expr *e, Address location, + Qualifiers quals, bool isInit) { + // FIXME: This function should take an LValue as an argument. 
+ switch (getEvaluationKind(e->getType())) { + case cir::TEK_Complex: { + LValue lv = makeAddrLValue(location, e->getType()); + emitComplexExprIntoLValue(e, lv, isInit); + return; + } + + case cir::TEK_Aggregate: { + emitAggExpr(e, AggValueSlot::forAddr(location, quals, + AggValueSlot::IsDestructed_t(isInit), + AggValueSlot::IsAliased_t(!isInit), + AggValueSlot::MayOverlap)); + return; + } + + case cir::TEK_Scalar: { + RValue rv = RValue::get(emitScalarExpr(e)); + LValue lv = makeAddrLValue(location, e->getType()); + emitStoreThroughLValue(rv, lv); + return; + } + } + + llvm_unreachable("bad evaluation kind"); +} + +LValue CIRGenFunction::emitCompoundLiteralLValue(const CompoundLiteralExpr *e) { + if (e->isFileScope()) { + cgm.errorNYI(e->getSourceRange(), "emitCompoundLiteralLValue: FileScope"); + return {}; + } + + if (e->getType()->isVariablyModifiedType()) { + cgm.errorNYI(e->getSourceRange(), + "emitCompoundLiteralLValue: VariablyModifiedType"); + return {}; + } + + Address declPtr = createMemTemp(e->getType(), getLoc(e->getSourceRange()), + ".compoundliteral"); + const Expr *initExpr = e->getInitializer(); + LValue result = makeAddrLValue(declPtr, e->getType(), AlignmentSource::Decl); + + emitAnyExprToMem(initExpr, declPtr, e->getType().getQualifiers(), + /*Init*/ true); + + // Block-scope compound literals are destroyed at the end of the enclosing + // scope in C. 
+ if (!getLangOpts().CPlusPlus && e->getType().isDestructedType()) { + cgm.errorNYI(e->getSourceRange(), + "emitCompoundLiteralLValue: non C++ DestructedType"); + return {}; + } + + return result; +} + LValue CIRGenFunction::emitCallExprLValue(const CallExpr *e) { RValue rv = emitCallExpr(e); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 0a22771378ff1..81cb7f9cf77cb 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -52,6 +52,11 @@ class ComplexExprEmitter : public StmtVisitor { mlir::Value VisitGenericSelectionExpr(GenericSelectionExpr *e); mlir::Value VisitImplicitCastExpr(ImplicitCastExpr *e); mlir::Value VisitInitListExpr(const InitListExpr *e); + + mlir::Value VisitCompoundLiteralExpr(CompoundLiteralExpr *e) { + return emitLoadOfLValue(e); + } + mlir::Value VisitImaginaryLiteral(const ImaginaryLiteral *il); mlir::Value VisitParenExpr(ParenExpr *e); mlir::Value @@ -467,6 +472,15 @@ mlir::Value CIRGenFunction::emitComplexPrePostIncDec(const UnaryOperator *e, return isPre ? 
incVal : inVal; } +void CIRGenFunction::emitComplexExprIntoLValue(const Expr *e, LValue dest, + bool isInit) { + assert(e && getComplexType(e->getType()) && + "Invalid complex expression to emit"); + ComplexExprEmitter emitter(*this); + mlir::Value value = emitter.Visit(const_cast(e)); + emitter.emitStoreOfComplex(getLoc(e->getExprLoc()), value, dest, isInit); +} + mlir::Value CIRGenFunction::emitLoadOfComplex(LValue src, SourceLocation loc) { return ComplexExprEmitter(*this).emitLoadOfLValue(src, loc); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 9e13b4c83e3a8..23112be6bf3e7 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -233,6 +233,10 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value VisitMemberExpr(MemberExpr *e); + mlir::Value VisitCompoundLiteralExpr(CompoundLiteralExpr *e) { + return emitLoadOfLValue(e); + } + mlir::Value VisitInitListExpr(InitListExpr *e); mlir::Value VisitExplicitCastExpr(ExplicitCastExpr *e) { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index e532b9d855843..7e1a44ce602d4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -698,6 +698,8 @@ LValue CIRGenFunction::emitLValue(const Expr *e) { return emitStringLiteralLValue(cast(e)); case Expr::MemberExprClass: return emitMemberExpr(cast(e)); + case Expr::CompoundLiteralExprClass: + return emitCompoundLiteralLValue(cast(e)); case Expr::BinaryOperatorClass: return emitBinaryOperatorLValue(cast(e)); case Expr::CompoundAssignOperatorClass: { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 9541f4f0725eb..e2fa03d03b81e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -757,6 +757,11 @@ class CIRGenFunction : public CIRGenTypeCache { RValue emitAnyExpr(const 
clang::Expr *e, AggValueSlot aggSlot = AggValueSlot::ignored()); + /// Emits the code necessary to evaluate an arbitrary expression into the + /// given memory location. + void emitAnyExprToMem(const Expr *e, Address location, Qualifiers quals, + bool isInitializer); + /// Similarly to emitAnyExpr(), however, the result will always be accessible /// even if no aggregate location is provided. RValue emitAnyExprToTemp(const clang::Expr *e); @@ -828,6 +833,7 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::Value emitCheckedArgForAssume(const Expr *e); LValue emitCompoundAssignmentLValue(const clang::CompoundAssignOperator *e); + LValue emitCompoundLiteralLValue(const CompoundLiteralExpr *e); void emitConstructorBody(FunctionArgList &args); void emitDestructorBody(FunctionArgList &args); @@ -930,6 +936,8 @@ class CIRGenFunction : public CIRGenTypeCache { /// returning the result. mlir::Value emitComplexExpr(const Expr *e); + void emitComplexExprIntoLValue(const Expr *e, LValue dest, bool isInit); + mlir::Value emitComplexPrePostIncDec(const UnaryOperator *e, LValue lv, bool isInc, bool isPre); diff --git a/clang/test/CIR/CodeGen/compound_literal.cpp b/clang/test/CIR/CodeGen/compound_literal.cpp new file mode 100644 index 0000000000000..a92af95c62a1b --- /dev/null +++ b/clang/test/CIR/CodeGen/compound_literal.cpp @@ -0,0 +1,99 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +int foo() { + int e = (int){1}; + return e; +} + +// CIR: %[[RET:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] +// CIR: %[[INIT:.*]] = cir.alloca !s32i, !cir.ptr, 
["e", init] +// CIR: %[[COMPOUND:.*]] = cir.alloca !s32i, !cir.ptr, [".compoundliteral", init] +// CIR: %[[VALUE:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store{{.*}} %[[VALUE]], %[[COMPOUND]] : !s32i, !cir.ptr +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[COMPOUND]] : !cir.ptr, !s32i +// CIR: cir.store{{.*}} %[[TMP]], %[[INIT]] : !s32i, !cir.ptr +// CIR: %[[TMP_2:.*]] = cir.load{{.*}} %[[INIT]] : !cir.ptr, !s32i +// CIR: cir.store %[[TMP_2]], %[[RET]] : !s32i, !cir.ptr +// CIR: %[[TMP_3:.*]] = cir.load %[[RET]] : !cir.ptr, !s32i +// CIR: cir.return %[[TMP_3]] : !s32i + +// LLVM: %[[RET:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[INIT:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[COMPOUND:.*]] = alloca i32, i64 1, align 4 +// LLVM: store i32 1, ptr %[[COMPOUND]], align 4 +// LLVM: %[[TMP:.*]] = load i32, ptr %[[COMPOUND]], align 4 +// LLVM: store i32 %[[TMP]], ptr %[[INIT]], align 4 +// LLVM: %[[TMP_2:.*]] = load i32, ptr %[[INIT]], align 4 +// LLVM: store i32 %[[TMP_2]], ptr %[[RET]], align 4 +// LLVM: %[[TMP_3:.*]] = load i32, ptr %[[RET]], align 4 +// LLVM: ret i32 %[[TMP_3]] + +// OGCG: %[[INIT:.*]] = alloca i32, align 4 +// OGCG: %[[COMPOUND:.*]] = alloca i32, align 4 +// OGCG: store i32 1, ptr %[[COMPOUND]], align 4 +// OGCG: %[[TMP:.*]] = load i32, ptr %[[COMPOUND]], align 4 +// OGCG: store i32 %[[TMP]], ptr %[[INIT]], align 4 +// OGCG: %[[TMP_2:.*]] = load i32, ptr %[[INIT]], align 4 +// OGCG: ret i32 %[[TMP_2]] + +void foo2() { + int _Complex a = (int _Complex) { 1, 2}; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a", init] +// CIR: %[[CL_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, [".compoundliteral"] +// CIR: %[[COMPLEX:.*]] = cir.const #cir.const_complex<#cir.int<1> : !s32i, #cir.int<2> : !s32i> : !cir.complex +// CIR: cir.store{{.*}} %[[COMPLEX]], %[[CL_ADDR]] : !cir.complex, !cir.ptr> +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[CL_ADDR]] : !cir.ptr>, !cir.complex +// CIR: cir.store{{.*}} %[[TMP]], %[[A_ADDR]] : 
!cir.complex, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: %[[CL_ADDR:.*]] = alloca { i32, i32 }, i64 1, align 4 +// LLVM: store { i32, i32 } { i32 1, i32 2 }, ptr %[[CL_ADDR]], align 4 +// LLVM: %[[TMP:.*]] = load { i32, i32 }, ptr %[[CL_ADDR]], align 4 +// LLVM: store { i32, i32 } %[[TMP]], ptr %[[A_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[CL_ADDR:.*]] = alloca { i32, i32 }, align 4 +// OGCG: %[[CL_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CL_ADDR]], i32 0, i32 0 +// OGCG: %[[CL_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CL_ADDR]], i32 0, i32 1 +// OGCG: store i32 1, ptr %[[CL_REAL_PTR]], align 4 +// OGCG: store i32 2, ptr %[[CL_IMAG_PTR]], align 4 +// OGCG: %[[CL_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CL_ADDR]], i32 0, i32 0 +// OGCG: %[[CL_REAL:.*]] = load i32, ptr %[[CL_REAL_PTR]], align 4 +// OGCG: %[[CL_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[CL_ADDR]], i32 0, i32 1 +// OGCG: %[[CL_IMAG:.*]] = load i32, ptr %[[CL_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { i32, i32 }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store i32 %[[CL_REAL]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store i32 %[[CL_IMAG]], ptr %[[A_IMAG_PTR]], align 4 + +void foo3() { + typedef int vi4 __attribute__((vector_size(16))); + auto a = (vi4){10, 20, 30, 40}; +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] +// CIR: %[[CL_ADDR:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, [".compoundliteral", init] +// CIR: %[[VEC:.*]] = cir.const #cir.const_vector<[#cir.int<10> : !s32i, #cir.int<20> : !s32i, #cir.int<30> : !s32i, #cir.int<40> : !s32i]> : !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[VEC]], %[[CL_ADDR]] : !cir.vector<4 x !s32i>, 
!cir.ptr> +// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[CL_ADDR]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[TMP]], %[[A_ADDR]] : !cir.vector<4 x !s32i>, !cir.ptr> + +// LLVM: %[[A_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[CL_ADDR:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: store <4 x i32> , ptr %[[CL_ADDR]], align 16 +// LLVM: %[[TMP:.*]] = load <4 x i32>, ptr %[[CL_ADDR]], align 16 +// LLVM: store <4 x i32> %[[TMP]], ptr %[[A_ADDR]], align 16 + +// OGCG: %[[A_ADDR:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[CL_ADDR:.*]] = alloca <4 x i32>, align 16 +// OGCG: store <4 x i32> , ptr %[[CL_ADDR]], align 16 +// OGCG: %[[TMP:.*]] = load <4 x i32>, ptr %[[CL_ADDR]], align 16 +// OGCG: store <4 x i32> %[[TMP]], ptr %[[A_ADDR]], align 16 + From 0aff1b6cdda5f6f3ce31fe30a4aaa6c8f947b64b Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Sat, 19 Jul 2025 15:19:42 +0200 Subject: [PATCH 443/813] [CIR][NFC] Replace bool by cir::UnaryOpKind in emitComplexPrePostIncDec (#149566) Replace bool by cir::UnaryOpKind in emitComplexPrePostIncDec --- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 5 ++-- clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 24 +++++++++------- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 31 +++++++++++---------- clang/lib/CIR/CodeGen/CIRGenFunction.h | 4 +-- 4 files changed, 35 insertions(+), 29 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 2a998cc39dba2..1f64801926887 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -663,7 +663,8 @@ LValue CIRGenFunction::emitUnaryOpLValue(const UnaryOperator *e) { } case UO_PreInc: case UO_PreDec: { - bool isInc = e->isIncrementOp(); + cir::UnaryOpKind kind = + e->isIncrementOp() ? 
cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; LValue lv = emitLValue(e->getSubExpr()); assert(e->isPrefix() && "Prefix operator in unexpected state!"); @@ -672,7 +673,7 @@ LValue CIRGenFunction::emitUnaryOpLValue(const UnaryOperator *e) { cgm.errorNYI(e->getSourceRange(), "UnaryOp complex inc/dec"); lv = LValue(); } else { - emitScalarPrePostIncDec(e, lv, isInc, /*isPre=*/true); + emitScalarPrePostIncDec(e, lv, kind, /*isPre=*/true); } return lv; diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index 81cb7f9cf77cb..6756a7ce067a5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -62,23 +62,23 @@ class ComplexExprEmitter : public StmtVisitor { mlir::Value VisitSubstNonTypeTemplateParmExpr(SubstNonTypeTemplateParmExpr *e); - mlir::Value VisitPrePostIncDec(const UnaryOperator *e, bool isInc, + mlir::Value VisitPrePostIncDec(const UnaryOperator *e, cir::UnaryOpKind op, bool isPre); mlir::Value VisitUnaryPostDec(const UnaryOperator *e) { - return VisitPrePostIncDec(e, false, false); + return VisitPrePostIncDec(e, cir::UnaryOpKind::Dec, false); } mlir::Value VisitUnaryPostInc(const UnaryOperator *e) { - return VisitPrePostIncDec(e, true, false); + return VisitPrePostIncDec(e, cir::UnaryOpKind::Inc, false); } mlir::Value VisitUnaryPreDec(const UnaryOperator *e) { - return VisitPrePostIncDec(e, false, true); + return VisitPrePostIncDec(e, cir::UnaryOpKind::Dec, true); } mlir::Value VisitUnaryPreInc(const UnaryOperator *e) { - return VisitPrePostIncDec(e, true, true); + return VisitPrePostIncDec(e, cir::UnaryOpKind::Inc, true); } mlir::Value VisitUnaryDeref(const Expr *e); @@ -360,9 +360,10 @@ mlir::Value ComplexExprEmitter::VisitSubstNonTypeTemplateParmExpr( } mlir::Value ComplexExprEmitter::VisitPrePostIncDec(const UnaryOperator *e, - bool isInc, bool isPre) { + cir::UnaryOpKind op, + bool isPre) { LValue lv = cgf.emitLValue(e->getSubExpr()); - return 
cgf.emitComplexPrePostIncDec(e, lv, isInc, isPre); + return cgf.emitComplexPrePostIncDec(e, lv, op, isPre); } mlir::Value ComplexExprEmitter::VisitUnaryDeref(const Expr *e) { @@ -454,12 +455,15 @@ mlir::Value CIRGenFunction::emitComplexExpr(const Expr *e) { } mlir::Value CIRGenFunction::emitComplexPrePostIncDec(const UnaryOperator *e, - LValue lv, bool isInc, + LValue lv, + cir::UnaryOpKind op, bool isPre) { + assert(op == cir::UnaryOpKind::Inc || + op == cir::UnaryOpKind::Dec && "Invalid UnaryOp kind for ComplexType"); + mlir::Value inVal = emitLoadOfComplex(lv, e->getExprLoc()); mlir::Location loc = getLoc(e->getExprLoc()); - auto opKind = isInc ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; - mlir::Value incVal = builder.createUnaryOp(loc, opKind, inVal); + mlir::Value incVal = builder.createUnaryOp(loc, op, inVal); // Store the updated result through the lvalue. emitStoreOfComplex(loc, incVal, lv, /*isInit=*/false); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 23112be6bf3e7..eba6bffbf2927 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -387,22 +387,22 @@ class ScalarExprEmitter : public StmtVisitor { // Unary Operators. 
mlir::Value VisitUnaryPostDec(const UnaryOperator *e) { LValue lv = cgf.emitLValue(e->getSubExpr()); - return emitScalarPrePostIncDec(e, lv, false, false); + return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Dec, false); } mlir::Value VisitUnaryPostInc(const UnaryOperator *e) { LValue lv = cgf.emitLValue(e->getSubExpr()); - return emitScalarPrePostIncDec(e, lv, true, false); + return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Inc, false); } mlir::Value VisitUnaryPreDec(const UnaryOperator *e) { LValue lv = cgf.emitLValue(e->getSubExpr()); - return emitScalarPrePostIncDec(e, lv, false, true); + return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Dec, true); } mlir::Value VisitUnaryPreInc(const UnaryOperator *e) { LValue lv = cgf.emitLValue(e->getSubExpr()); - return emitScalarPrePostIncDec(e, lv, true, true); + return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Inc, true); } mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv, - bool isInc, bool isPre) { + cir::UnaryOpKind kind, bool isPre) { if (cgf.getLangOpts().OpenMP) cgf.cgm.errorNYI(e->getSourceRange(), "inc/dec OpenMP"); @@ -431,7 +431,7 @@ class ScalarExprEmitter : public StmtVisitor { // -> bool = ((int)bool + 1 != 0) // An interesting aspect of this is that increment is always true. // Decrement does not have this property. - if (isInc && type->isBooleanType()) { + if (kind == cir::UnaryOpKind::Inc && type->isBooleanType()) { value = builder.getTrue(cgf.getLoc(e->getExprLoc())); } else if (type->isIntegerType()) { QualType promotedType; @@ -462,7 +462,7 @@ class ScalarExprEmitter : public StmtVisitor { assert(!cir::MissingFeatures::sanitizers()); if (e->canOverflow() && type->isSignedIntegerOrEnumerationType()) { - value = emitIncDecConsiderOverflowBehavior(e, value, isInc); + value = emitIncDecConsiderOverflowBehavior(e, value, kind); } else { cir::UnaryOpKind kind = e->isIncrementOp() ? 
cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; @@ -484,7 +484,7 @@ class ScalarExprEmitter : public StmtVisitor { // For everything else, we can just do a simple increment. mlir::Location loc = cgf.getLoc(e->getSourceRange()); CIRGenBuilderTy &builder = cgf.getBuilder(); - int amount = (isInc ? 1 : -1); + int amount = kind == cir::UnaryOpKind::Inc ? 1 : -1; mlir::Value amt = builder.getSInt32(amount, loc); assert(!cir::MissingFeatures::sanitizers()); value = builder.createPtrStride(loc, value, amt); @@ -504,8 +504,8 @@ class ScalarExprEmitter : public StmtVisitor { if (mlir::isa(value.getType())) { // Create the inc/dec operation. // NOTE(CIR): clang calls CreateAdd but folds this to a unary op - cir::UnaryOpKind kind = - (isInc ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec); + assert(kind == cir::UnaryOpKind::Inc || + kind == cir::UnaryOpKind::Dec && "Invalid UnaryOp kind"); value = emitUnaryOp(e, kind, value); } else { cgf.cgm.errorNYI(e->getSourceRange(), "Unary inc/dec other fp type"); @@ -536,9 +536,9 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value emitIncDecConsiderOverflowBehavior(const UnaryOperator *e, mlir::Value inVal, - bool isInc) { - cir::UnaryOpKind kind = - e->isIncrementOp() ? 
cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec; + cir::UnaryOpKind kind) { + assert(kind == cir::UnaryOpKind::Inc || + kind == cir::UnaryOpKind::Dec && "Invalid UnaryOp kind"); switch (cgf.getLangOpts().getSignedOverflowBehavior()) { case LangOptions::SOB_Defined: return emitUnaryOp(e, kind, inVal, /*nsw=*/false); @@ -2151,8 +2151,9 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( } mlir::Value CIRGenFunction::emitScalarPrePostIncDec(const UnaryOperator *e, - LValue lv, bool isInc, + LValue lv, + cir::UnaryOpKind kind, bool isPre) { return ScalarExprEmitter(*this, builder) - .emitScalarPrePostIncDec(e, lv, isInc, isPre); + .emitScalarPrePostIncDec(e, lv, kind, isPre); } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index e2fa03d03b81e..12484196cbef2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -917,7 +917,7 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::Value emitScalarExpr(const clang::Expr *e); mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv, - bool isInc, bool isPre); + cir::UnaryOpKind kind, bool isPre); /// Build a debug stoppoint if we are emitting debug info. 
void emitStopPoint(const Stmt *s); @@ -939,7 +939,7 @@ class CIRGenFunction : public CIRGenTypeCache { void emitComplexExprIntoLValue(const Expr *e, LValue dest, bool isInit); mlir::Value emitComplexPrePostIncDec(const UnaryOperator *e, LValue lv, - bool isInc, bool isPre); + cir::UnaryOpKind op, bool isPre); LValue emitComplexAssignmentLValue(const BinaryOperator *e); From 167c695cece8e4fa78b7e9c5fc94bae3821ade52 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Sat, 19 Jul 2025 09:32:54 -0400 Subject: [PATCH 444/813] [libc++] Add and empty skeleton for LLVM 22 release notes (#149535) --- libcxx/docs/ReleaseNotes.rst | 3 +- libcxx/docs/ReleaseNotes/22.rst | 58 +++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 libcxx/docs/ReleaseNotes/22.rst diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst index 9feea5fffc26c..acfcd367de505 100644 --- a/libcxx/docs/ReleaseNotes.rst +++ b/libcxx/docs/ReleaseNotes.rst @@ -1,10 +1,11 @@ -.. include:: ReleaseNotes/21.rst +.. include:: ReleaseNotes/22.rst .. Make sure to reference the non-live release notes in a toctree to avoid Sphinx errors. .. toctree:: :hidden: ReleaseNotes/20 + ReleaseNotes/21 .. The release notes are in versioned files, but we make sure to keep publishing .. them in an unversioned ReleaseNotes.html page for external sites to reference. diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst new file mode 100644 index 0000000000000..cade192260f6d --- /dev/null +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -0,0 +1,58 @@ +=========================================== +Libc++ 22.0.0 (In-Progress) Release Notes +=========================================== + +.. contents:: + :local: + :depth: 2 + +Written by the `Libc++ Team `_ + +.. warning:: + + These are in-progress notes for the upcoming libc++ 22.0.0 release. + Release notes for previous releases can be found on + `the Download Page `_. 
+ +Introduction +============ + +This document contains the release notes for the libc++ C++ Standard Library, +part of the LLVM Compiler Infrastructure, release 22.0.0. Here we describe the +status of libc++ in some detail, including major improvements from the previous +release and new feature work. For the general LLVM release notes, see `the LLVM +documentation `_. All LLVM releases may +be downloaded from the `LLVM releases web site `_. + +For more information about libc++, please see the `Libc++ Web Site +`_ or the `LLVM Web Site `_. + +Note that if you are reading this file from a Git checkout or the +main Libc++ web page, this document applies to the *next* release, not +the current one. To see the release notes for a specific release, please +see the `releases page `_. + +What's New in Libc++ 22.0.0? +============================== + +Implemented Papers +------------------ + + +Improvements and New Features +----------------------------- + +Deprecations and Removals +------------------------- + +Potentially breaking changes +---------------------------- + +Announcements About Future Releases +----------------------------------- + +ABI Affecting Changes +--------------------- + +Build System Changes +-------------------- From 50408eeff6020061ceb6685448e214f36c75f71b Mon Sep 17 00:00:00 2001 From: Vassil Vassilev Date: Sat, 19 Jul 2025 14:29:59 +0000 Subject: [PATCH 445/813] Revert "[clang-repl] Spell out the enum types to appease some bots." This did not help the bots. Add another check that might help me figure out the issue. This reverts commit 193de1a566aa5a10a6f63f6f7c7fca2e52a7d75b. 
--- clang/test/Interpreter/pretty-print.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/clang/test/Interpreter/pretty-print.cpp b/clang/test/Interpreter/pretty-print.cpp index 1952cc7428e85..fd79d315e48ba 100644 --- a/clang/test/Interpreter/pretty-print.cpp +++ b/clang/test/Interpreter/pretty-print.cpp @@ -8,6 +8,9 @@ extern "C" int printf(const char*,...); "ab" // CHECK: (const char[3]) "ab" +123456 +// CHECK-NEXT: (int) 123456 + char ch[2] = {'1','a'}; ch // CHECK-NEXT: (char[2]) { '1', 'a' } @@ -28,15 +31,15 @@ S4{} // CHECK-NEXT: (S4) @0x{{[0-9a-f]+}} // TODO-CHECK-NEXT: ~S4() -enum Enum : int { e1 = -12, e2, e3=33, e4, e5 = 33}; +enum Enum{ e1 = -12, e2, e3=33, e4, e5 = 33}; e2 // CHECK-NEXT: (Enum) (e2) : int -11 ::e1 // CHECK-NEXT: (Enum) (e1) : int -12 -enum class Color : unsigned int { R = 0, G, B }; +enum class Color { R = 0, G, B }; Color::R -// CHECK-NEXT: (Color) (Color::R) : unsigned int 0 +// CHECK-NEXT: (Color) (Color::R) : int 0 // Lambdas. 
From 64220357b45b2c262eece817e797a29b8daabdd5 Mon Sep 17 00:00:00 2001 From: Hui Date: Sat, 19 Jul 2025 17:36:29 +0100 Subject: [PATCH 446/813] [libc++] constexpr flat_multimap (#148417) Fixes #128674 --- libcxx/include/__flat_map/flat_multimap.h | 355 +++++++++++------- .../flat.multimap.capacity/empty.pass.cpp | 18 +- .../flat.multimap.capacity/max_size.pass.cpp | 12 +- .../flat.multimap.capacity/size.pass.cpp | 21 +- .../flat.multimap.cons/alloc.pass.cpp | 48 ++- .../assign_initializer_list.pass.cpp | 41 +- .../flat.multimap.cons/compare.pass.cpp | 108 ++++-- .../flat.multimap.cons/containers.pass.cpp | 207 ++++++---- .../flat.multimap.cons/copy.pass.cpp | 36 +- .../flat.multimap.cons/copy_alloc.pass.cpp | 58 ++- .../flat.multimap.cons/copy_assign.pass.cpp | 56 ++- .../flat.multimap.cons/default.pass.cpp | 40 +- .../default_noexcept.pass.cpp | 15 +- .../flat.multimap.cons/dtor_noexcept.pass.cpp | 43 ++- .../initializer_list.pass.cpp | 160 ++++---- .../flat.multimap.cons/iter_iter.pass.cpp | 231 +++++++++--- .../flat.multimap.cons/move.pass.cpp | 34 +- .../flat.multimap.cons/move_alloc.pass.cpp | 76 ++-- .../flat.multimap.cons/move_assign.pass.cpp | 35 +- .../move_assign_clears.pass.cpp | 65 +++- ... 
=> move_assign_noexcept.compile.pass.cpp} | 4 +- .../flat.multimap.cons/range.pass.cpp | 287 ++++++++++---- .../sorted_container.pass.cpp | 139 ++++--- .../sorted_initializer_list.pass.cpp | 164 ++++---- .../sorted_iter_iter.pass.cpp | 165 ++++---- .../flat.multimap.erasure/erase_if.pass.cpp | 28 +- .../flat.multimap.iterators/iterator.pass.cpp | 18 +- .../iterator_comparison.pass.cpp | 18 +- .../reverse_iterator.pass.cpp | 98 +++-- .../flat.multimap.modifiers/clear.pass.cpp | 17 +- .../flat.multimap.modifiers/emplace.pass.cpp | 27 +- .../emplace_hint.pass.cpp | 29 +- .../erase_iter.pass.cpp | 59 ++- .../erase_iter_iter.pass.cpp | 34 +- .../erase_key.pass.cpp | 19 +- .../erase_key_transparent.pass.cpp | 42 ++- .../flat.multimap.modifiers/extract.pass.cpp | 27 +- .../insert_cv.pass.cpp | 20 +- .../insert_initializer_list.pass.cpp | 24 +- .../insert_iter_cv.pass.cpp | 20 +- .../insert_iter_iter.pass.cpp | 27 +- .../insert_iter_rv.pass.cpp | 25 +- .../insert_range.pass.cpp | 22 +- .../insert_rv.pass.cpp | 21 +- .../insert_sorted_initializer_list.pass.cpp | 23 +- .../insert_sorted_iter_iter.pass.cpp | 19 +- .../insert_transparent.pass.cpp | 120 +++--- .../flat.multimap.modifiers/replace.pass.cpp | 21 +- .../swap_free.pass.cpp | 17 +- .../swap_member.pass.cpp | 18 +- .../flat.multimap.observers/comp.pass.cpp | 16 +- .../keys_values.pass.cpp | 31 +- .../contains.pass.cpp | 17 +- .../contains_transparent.pass.cpp | 19 +- .../flat.multimap.operations/count.pass.cpp | 18 +- .../count_transparent.pass.cpp | 18 +- .../equal_range.pass.cpp | 17 +- .../equal_range_transparent.pass.cpp | 20 +- .../flat.multimap.operations/find.pass.cpp | 17 +- .../find_transparent.pass.cpp | 17 +- .../lower_bound.pass.cpp | 17 +- .../lower_bound_transparent.pass.cpp | 17 +- .../upper_bound.pass.cpp | 17 +- .../upper_bound_transparent.pass.cpp | 17 +- .../flat.multimap/helpers.h | 11 +- 65 files changed, 2416 insertions(+), 1064 deletions(-) rename 
libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/{move_assign_noexcept.pass.cpp => move_assign_noexcept.compile.pass.cpp} (99%) diff --git a/libcxx/include/__flat_map/flat_multimap.h b/libcxx/include/__flat_map/flat_multimap.h index 0af6aac00c383..260d93ed25785 100644 --- a/libcxx/include/__flat_map/flat_multimap.h +++ b/libcxx/include/__flat_map/flat_multimap.h @@ -114,11 +114,12 @@ class flat_multimap { class value_compare { private: _LIBCPP_NO_UNIQUE_ADDRESS key_compare __comp_; - _LIBCPP_HIDE_FROM_ABI value_compare(key_compare __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare(key_compare __c) : __comp_(__c) {} friend flat_multimap; public: - _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator()(const_reference __x, const_reference __y) const { return __comp_(__x.first, __y.first); } }; @@ -137,17 +138,17 @@ class flat_multimap { public: // [flat.map.cons], construct/copy/destroy - _LIBCPP_HIDE_FROM_ABI flat_multimap() noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap() noexcept( is_nothrow_default_constructible_v<_KeyContainer> && is_nothrow_default_constructible_v<_MappedContainer> && is_nothrow_default_constructible_v<_Compare>) : __containers_(), __compare_() {} - _LIBCPP_HIDE_FROM_ABI flat_multimap(const flat_multimap&) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(const flat_multimap&) = default; // The copy/move constructors are not specified in the spec, which means they should be defaulted. // However, the move constructor can potentially leave a moved-from object in an inconsistent // state if an exception is thrown. 
- _LIBCPP_HIDE_FROM_ABI flat_multimap(flat_multimap&& __other) noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(flat_multimap&& __other) noexcept( is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_MappedContainer> && is_nothrow_move_constructible_v<_Compare>) # if _LIBCPP_HAS_EXCEPTIONS @@ -168,7 +169,8 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(const flat_multimap& __other, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(const flat_multimap& __other, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __other.__containers_.keys, @@ -177,7 +179,7 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(flat_multimap&& __other, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(flat_multimap&& __other, const _Allocator& __alloc) # if _LIBCPP_HAS_EXCEPTIONS try # endif // _LIBCPP_HAS_EXCEPTIONS @@ -194,7 +196,7 @@ class flat_multimap { # endif // _LIBCPP_HAS_EXCEPTIONS } - _LIBCPP_HIDE_FROM_ABI flat_multimap( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( key_container_type __key_cont, mapped_container_type __mapped_cont, const key_compare& __comp = key_compare()) : __containers_{.keys = std::move(__key_cont), .values = std::move(__mapped_cont)}, __compare_(__comp) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), @@ -204,7 +206,7 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( const key_container_type& __key_cont, const mapped_container_type& __mapped_cont, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, 
__key_cont, __mapped_cont) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), @@ -214,22 +216,22 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI - flat_multimap(const key_container_type& __key_cont, - const mapped_container_type& __mapped_cont, - const key_compare& __comp, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const key_compare& __comp, + const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont, __comp) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), "flat_multimap keys and mapped containers have different size"); __sort(); } - _LIBCPP_HIDE_FROM_ABI - flat_multimap(sorted_equivalent_t, - key_container_type __key_cont, - mapped_container_type __mapped_cont, - const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + sorted_equivalent_t, + key_container_type __key_cont, + mapped_container_type __mapped_cont, + const key_compare& __comp = key_compare()) : __containers_{.keys = std::move(__key_cont), .values = std::move(__mapped_cont)}, __compare_(__comp) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), "flat_multimap keys and mapped containers have different size"); @@ -238,11 +240,11 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI - flat_multimap(sorted_equivalent_t, - const key_container_type& __key_cont, - const mapped_container_type& __mapped_cont, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + sorted_equivalent_t, + const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const _Allocator& 
__alloc) : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), "flat_multimap keys and mapped containers have different size"); @@ -251,33 +253,35 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI - flat_multimap(sorted_equivalent_t, - const key_container_type& __key_cont, - const mapped_container_type& __mapped_cont, - const key_compare& __comp, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + sorted_equivalent_t, + const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const key_compare& __comp, + const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont, __comp) { _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), "flat_multimap keys and mapped containers have different size"); _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(__is_sorted(__containers_.keys), "Key container is not sorted"); } - _LIBCPP_HIDE_FROM_ABI explicit flat_multimap(const key_compare& __comp) : __containers_(), __compare_(__comp) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multimap(const key_compare& __comp) + : __containers_(), __compare_(__comp) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(const key_compare& __comp, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(const key_compare& __comp, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI explicit flat_multimap(const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 explicit flat_multimap(const _Allocator& __alloc) : 
flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) {} template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) : __containers_(), __compare_(__comp) { insert(__first, __last); @@ -285,7 +289,7 @@ class flat_multimap { template requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { insert(__first, __last); @@ -293,91 +297,99 @@ class flat_multimap { template requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) - _LIBCPP_HIDE_FROM_ABI flat_multimap(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { insert(__first, __last); } template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t __fr, _Range&& __rg) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(from_range_t __fr, _Range&& __rg) : flat_multimap(__fr, std::forward<_Range>(__rg), key_compare()) {} template <_ContainerCompatibleRange _Range, class _Allocator> requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(from_range_t, _Range&& __rg, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, 
__alloc) { insert_range(std::forward<_Range>(__rg)); } template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_multimap(__comp) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp) + : flat_multimap(__comp) { insert_range(std::forward<_Range>(__rg)); } template <_ContainerCompatibleRange _Range, class _Allocator> requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { insert_range(std::forward<_Range>(__rg)); } template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI flat_multimap( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) : __containers_(), __compare_(__comp) { insert(sorted_equivalent, __first, __last); } template requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) - _LIBCPP_HIDE_FROM_ABI - flat_multimap(sorted_equivalent_t, - _InputIterator __first, - _InputIterator __last, - const key_compare& __comp, - const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + sorted_equivalent_t, + _InputIterator __first, + _InputIterator __last, + const key_compare& __comp, + const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { insert(sorted_equivalent, __first, __last); } template requires(__has_input_iterator_category<_InputIterator>::value && 
__allocator_ctor_constraint<_Allocator>) - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc) : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { insert(sorted_equivalent, __first, __last); } - _LIBCPP_HIDE_FROM_ABI flat_multimap(initializer_list __il, const key_compare& __comp = key_compare()) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(initializer_list __il, const key_compare& __comp = key_compare()) : flat_multimap(__il.begin(), __il.end(), __comp) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) : flat_multimap(__il.begin(), __il.end(), __comp, __alloc) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(initializer_list __il, const _Allocator& __alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(initializer_list __il, const _Allocator& __alloc) : flat_multimap(__il.begin(), __il.end(), __alloc) {} - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap(sorted_equivalent_t, initializer_list __il, const key_compare& __comp = key_compare()) : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __comp) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( sorted_equivalent_t, initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __comp, __alloc) {} template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(sorted_equivalent_t, initializer_list __il, const _Allocator& 
__alloc) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(sorted_equivalent_t, initializer_list __il, const _Allocator& __alloc) : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __alloc) {} - _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(initializer_list __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap& operator=(initializer_list __il) { clear(); insert(__il); return *this; @@ -386,9 +398,9 @@ class flat_multimap { // copy/move assignment are not specified in the spec (defaulted) // but move assignment can potentially leave moved from object in an inconsistent // state if an exception is thrown - _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(const flat_multimap&) = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap& operator=(const flat_multimap&) = default; - _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(flat_multimap&& __other) noexcept( + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap& operator=(flat_multimap&& __other) noexcept( is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_MappedContainer> && is_nothrow_move_assignable_v<_Compare>) { auto __clear_other_guard = std::__make_scope_guard([&]() noexcept { __other.clear() /* noexcept */; }); @@ -400,38 +412,54 @@ class flat_multimap { } // iterators - _LIBCPP_HIDE_FROM_ABI iterator begin() noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator begin() noexcept { return iterator(__containers_.keys.begin(), __containers_.values.begin()); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator begin() const noexcept { return const_iterator(__containers_.keys.begin(), __containers_.values.begin()); } - _LIBCPP_HIDE_FROM_ABI iterator end() noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator end() noexcept { return iterator(__containers_.keys.end(), 
__containers_.values.end()); } - _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator end() const noexcept { return const_iterator(__containers_.keys.end(), __containers_.values.end()); } - _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rbegin() noexcept { + return reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rbegin() const noexcept { + return const_reverse_iterator(end()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 reverse_iterator rend() noexcept { + return reverse_iterator(begin()); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator rend() const noexcept { + return const_reverse_iterator(begin()); + } - _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); } - _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); } - _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cbegin() const noexcept { return begin(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator cend() const noexcept { return end(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crbegin() const noexcept { + return const_reverse_iterator(end()); + } + 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_reverse_iterator crend() const noexcept { + return const_reverse_iterator(begin()); + } // [flat.map.capacity], capacity - [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __containers_.keys.empty(); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool empty() const noexcept { + return __containers_.keys.empty(); + } - _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __containers_.keys.size(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type size() const noexcept { + return __containers_.keys.size(); + } - _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type max_size() const noexcept { return std::min(__containers_.keys.max_size(), __containers_.values.max_size()); } @@ -439,7 +467,7 @@ class flat_multimap { template requires is_constructible_v, _Args...> && is_move_constructible_v && is_move_constructible_v - _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace(_Args&&... __args) { std::pair __pair(std::forward<_Args>(__args)...); auto __key_it = std::upper_bound(__containers_.keys.begin(), __containers_.keys.end(), __pair.first, __compare_); auto __mapped_it = __corresponding_mapped_it(*this, __key_it); @@ -450,7 +478,7 @@ class flat_multimap { template requires is_constructible_v, _Args...> - _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... __args) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator emplace_hint(const_iterator __hint, _Args&&... 
__args) { std::pair __pair(std::forward<_Args>(__args)...); auto __prev_larger = __hint != cbegin() && __compare_(__pair.first, (__hint - 1)->first); @@ -490,33 +518,35 @@ class flat_multimap { *this, __key_iter, __mapped_iter, std::move(__pair.first), std::move(__pair.second)); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return emplace(__x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const value_type& __x) { return emplace(__x); } - _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __x) { return emplace(std::move(__x)); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(value_type&& __x) { + return emplace(std::move(__x)); + } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, const value_type& __x) { return emplace_hint(__hint, __x); } - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, value_type&& __x) { return emplace_hint(__hint, std::move(__x)); } template requires is_constructible_v, _PairLike> - _LIBCPP_HIDE_FROM_ABI iterator insert(_PairLike&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(_PairLike&& __x) { return emplace(std::forward<_PairLike>(__x)); } template requires is_constructible_v, _PairLike> - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, _PairLike&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator insert(const_iterator __hint, _PairLike&& __x) { return emplace_hint(__hint, std::forward<_PairLike>(__x)); } template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(_InputIterator __first, 
_InputIterator __last) { if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { __reserve(__last - __first); } @@ -525,7 +555,8 @@ class flat_multimap { template requires __has_input_iterator_category<_InputIterator>::value - _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { __reserve(__last - __first); } @@ -534,7 +565,7 @@ class flat_multimap { } template <_ContainerCompatibleRange _Range> - _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert_range(_Range&& __range) { if constexpr (ranges::sized_range<_Range>) { __reserve(ranges::size(__range)); } @@ -542,19 +573,23 @@ class flat_multimap { __append_sort_merge(ranges::begin(__range), ranges::end(__range)); } - _LIBCPP_HIDE_FROM_ABI void insert(initializer_list __il) { insert(__il.begin(), __il.end()); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void insert(initializer_list __il) { + insert(__il.begin(), __il.end()); + } - _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, initializer_list __il) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + insert(sorted_equivalent_t, initializer_list __il) { insert(sorted_equivalent, __il.begin(), __il.end()); } - _LIBCPP_HIDE_FROM_ABI containers extract() && { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 containers extract() && { auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; }); auto __ret = std::move(__containers_); return __ret; } - _LIBCPP_HIDE_FROM_ABI void replace(key_container_type&& __key_cont, mapped_container_type&& __mapped_cont) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + replace(key_container_type&& __key_cont, 
mapped_container_type&& __mapped_cont) { _LIBCPP_ASSERT_VALID_INPUT_RANGE( __key_cont.size() == __mapped_cont.size(), "flat_multimap keys and mapped containers have different size"); @@ -565,15 +600,15 @@ class flat_multimap { __guard.__complete(); } - _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(iterator __position) { return __erase(__position.__key_iter_, __position.__mapped_iter_); } - _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __position) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __position) { return __erase(__position.__key_iter_, __position.__mapped_iter_); } - _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(const key_type& __x) { auto [__first, __last] = equal_range(__x); auto __res = __last - __first; erase(__first, __last); @@ -583,14 +618,14 @@ class flat_multimap { template requires(__is_compare_transparent && !is_convertible_v<_Kp &&, iterator> && !is_convertible_v<_Kp &&, const_iterator>) - _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type erase(_Kp&& __x) { auto [__first, __last] = equal_range(__x); auto __res = __last - __first; erase(__first, __last); return __res; } - _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator erase(const_iterator __first, const_iterator __last) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); auto __key_it = __containers_.keys.erase(__first.__key_iter_, __last.__key_iter_); auto __mapped_it = __containers_.values.erase(__first.__mapped_iter_, __last.__mapped_iter_); @@ -598,7 +633,7 @@ class flat_multimap { return iterator(std::move(__key_it), std::move(__mapped_it)); } - _LIBCPP_HIDE_FROM_ABI void 
swap(flat_multimap& __y) noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void swap(flat_multimap& __y) noexcept { // warning: The spec has unconditional noexcept, which means that // if any of the following functions throw an exception, // std::terminate will be called @@ -607,137 +642,160 @@ class flat_multimap { ranges::swap(__containers_.values, __y.__containers_.values); } - _LIBCPP_HIDE_FROM_ABI void clear() noexcept { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void clear() noexcept { __containers_.keys.clear(); __containers_.values.clear(); } // observers - _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; } - _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return value_compare(__compare_); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 key_compare key_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 value_compare value_comp() const { + return value_compare(__compare_); + } - _LIBCPP_HIDE_FROM_ABI const key_container_type& keys() const noexcept { return __containers_.keys; } - _LIBCPP_HIDE_FROM_ABI const mapped_container_type& values() const noexcept { return __containers_.values; } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const key_container_type& keys() const noexcept { + return __containers_.keys; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const mapped_container_type& values() const noexcept { + return __containers_.values; + } // map operations - _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const key_type& __x) { + return __find_impl(*this, __x); + } - _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const key_type& __x) const { + return __find_impl(*this, __x); + } template 
requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator find(const _Kp& __x) { return __find_impl(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator find(const _Kp& __x) const { return __find_impl(*this, __x); } - _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const key_type& __x) const { auto [__first, __last] = equal_range(__x); return __last - __first; } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 size_type count(const _Kp& __x) const { auto [__first, __last] = equal_range(__x); return __last - __first; } - _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const key_type& __x) const { + return find(__x) != end(); + } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool contains(const _Kp& __x) const { return find(__x) != end(); } - _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) { return __lower_bound(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const key_type& __x) { + return __lower_bound(*this, __x); + } - _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const key_type& __x) const { return __lower_bound(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) { + 
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator lower_bound(const _Kp& __x) { return __lower_bound(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator lower_bound(const _Kp& __x) const { return __lower_bound(*this, __x); } - _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) { return __upper_bound(*this, __x); } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const key_type& __x) { + return __upper_bound(*this, __x); + } - _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const key_type& __x) const { return __upper_bound(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator upper_bound(const _Kp& __x) { return __upper_bound(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 const_iterator upper_bound(const _Kp& __x) const { return __upper_bound(*this, __x); } - _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair equal_range(const key_type& __x) { return __equal_range_impl(*this, __x); } - _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair + equal_range(const key_type& __x) const { return __equal_range_impl(*this, __x); } template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair equal_range(const _Kp& __x) { return __equal_range_impl(*this, __x); } 
template requires __is_compare_transparent - _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 pair + equal_range(const _Kp& __x) const { return __equal_range_impl(*this, __x); } - friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_multimap& __x, const flat_multimap& __y) { + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator==(const flat_multimap& __x, const flat_multimap& __y) { return ranges::equal(__x, __y); } - friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_multimap& __x, const flat_multimap& __y) { + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 auto + operator<=>(const flat_multimap& __x, const flat_multimap& __y) { return std::lexicographical_compare_three_way( __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } - friend _LIBCPP_HIDE_FROM_ABI void swap(flat_multimap& __x, flat_multimap& __y) noexcept { __x.swap(__y); } + friend _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + swap(flat_multimap& __x, flat_multimap& __y) noexcept { + __x.swap(__y); + } private: struct __ctor_uses_allocator_tag { - explicit _LIBCPP_HIDE_FROM_ABI __ctor_uses_allocator_tag() = default; + explicit _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __ctor_uses_allocator_tag() = default; }; struct __ctor_uses_allocator_empty_tag { - explicit _LIBCPP_HIDE_FROM_ABI __ctor_uses_allocator_empty_tag() = default; + explicit _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __ctor_uses_allocator_empty_tag() = default; }; template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI - flat_multimap(__ctor_uses_allocator_tag, - const _Allocator& __alloc, - _KeyCont&& __key_cont, - _MappedCont&& __mapped_cont, - _CompArg&&... 
__comp) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 flat_multimap( + __ctor_uses_allocator_tag, + const _Allocator& __alloc, + _KeyCont&& __key_cont, + _MappedCont&& __mapped_cont, + _CompArg&&... __comp) : __containers_{.keys = std::make_obj_using_allocator( __alloc, std::forward<_KeyCont>(__key_cont)), .values = std::make_obj_using_allocator( @@ -746,29 +804,32 @@ class flat_multimap { template requires __allocator_ctor_constraint<_Allocator> - _LIBCPP_HIDE_FROM_ABI flat_multimap(__ctor_uses_allocator_empty_tag, const _Allocator& __alloc, _CompArg&&... __comp) + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + flat_multimap(__ctor_uses_allocator_empty_tag, const _Allocator& __alloc, _CompArg&&... __comp) : __containers_{.keys = std::make_obj_using_allocator(__alloc), .values = std::make_obj_using_allocator(__alloc)}, __compare_(std::forward<_CompArg>(__comp)...) {} - _LIBCPP_HIDE_FROM_ABI bool __is_sorted(auto&& __key_container) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool __is_sorted(auto&& __key_container) const { return ranges::is_sorted(__key_container, __compare_); } - _LIBCPP_HIDE_FROM_ABI void __sort() { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __sort() { auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); ranges::sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); }); } template - _LIBCPP_HIDE_FROM_ABI static auto __corresponding_mapped_it(_Self&& __self, _KeyIter&& __key_iter) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto + __corresponding_mapped_it(_Self&& __self, _KeyIter&& __key_iter) { return __self.__containers_.values.begin() + static_cast>( ranges::distance(__self.__containers_.keys.begin(), __key_iter)); } template - _LIBCPP_HIDE_FROM_ABI void __append_sort_merge(_InputIterator __first, _Sentinel __last) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void + __append_sort_merge(_InputIterator __first, 
_Sentinel __last) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); size_t __num_appended = __flat_map_utils::__append(*this, std::move(__first), std::move(__last)); if (__num_appended != 0) { @@ -791,7 +852,7 @@ class flat_multimap { } template - _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __find_impl(_Self&& __self, const _Kp& __key) { auto __it = __self.lower_bound(__key); auto __last = __self.end(); if (__it == __last || __self.__compare_(__key, __it->first)) { @@ -801,7 +862,7 @@ class flat_multimap { } template - _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { auto [__key_first, __key_last] = std::equal_range(__self.__containers_.keys.begin(), __self.__containers_.keys.end(), __key, __self.__compare_); @@ -811,7 +872,7 @@ class flat_multimap { } template - _LIBCPP_HIDE_FROM_ABI static _Res __lower_bound(_Self&& __self, _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static _Res __lower_bound(_Self&& __self, _Kp& __x) { auto __key_iter = std::lower_bound(__self.__containers_.keys.begin(), __self.__containers_.keys.end(), __x, __self.__compare_); auto __mapped_iter = __corresponding_mapped_it(__self, __key_iter); @@ -819,14 +880,14 @@ class flat_multimap { } template - _LIBCPP_HIDE_FROM_ABI static _Res __upper_bound(_Self&& __self, _Kp& __x) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 static _Res __upper_bound(_Self&& __self, _Kp& __x) { auto __key_iter = std::upper_bound(__self.__containers_.keys.begin(), __self.__containers_.keys.end(), __x, __self.__compare_); auto __mapped_iter = __corresponding_mapped_it(__self, __key_iter); return _Res(std::move(__key_iter), std::move(__mapped_iter)); } - _LIBCPP_HIDE_FROM_ABI void 
__reserve(size_t __size) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 void __reserve(size_t __size) { if constexpr (__container_traits<_KeyContainer>::__reservable) { __containers_.keys.reserve(__size); } @@ -837,7 +898,8 @@ class flat_multimap { } template - _LIBCPP_HIDE_FROM_ABI iterator __erase(_KIter __key_iter_to_remove, _MIter __mapped_iter_to_remove) { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 iterator + __erase(_KIter __key_iter_to_remove, _MIter __mapped_iter_to_remove) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); auto __key_iter = __containers_.keys.erase(__key_iter_to_remove); auto __mapped_iter = __containers_.values.erase(__mapped_iter_to_remove); @@ -847,7 +909,8 @@ class flat_multimap { template friend typename flat_multimap<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>::size_type - erase_if(flat_multimap<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>&, _Predicate); + _LIBCPP_CONSTEXPR_SINCE_CXX26 + erase_if(flat_multimap<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>&, _Predicate); friend __flat_map_utils; @@ -855,8 +918,9 @@ class flat_multimap { _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_; struct __key_equiv { - _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {} - _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 __key_equiv(key_compare __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 bool + operator()(const_reference __x, const_reference __y) const { return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x)); } key_compare __comp_; @@ -980,8 +1044,9 @@ struct uses_allocator && uses_allocator_v<_MappedContainer, _Allocator>> {}; template -_LIBCPP_HIDE_FROM_ABI typename flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>::size_type 
-erase_if(flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>& __flat_multimap, _Predicate __pred) { +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX26 + typename flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>::size_type + erase_if(flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>& __flat_multimap, _Predicate __pred) { auto __zv = ranges::views::zip(__flat_multimap.__containers_.keys, __flat_multimap.__containers_.values); auto __first = __zv.begin(); auto __last = __zv.end(); diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp index 4fa4fd6a69b94..a09bb8c1aa514 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp @@ -26,7 +26,7 @@ #include "min_allocator.h" template -void test() { +constexpr void test() { using Key = typename KeyContainer::value_type; using Value = typename ValueContainer::value_type; using M = std::flat_multimap, KeyContainer, ValueContainer>; @@ -41,11 +41,23 @@ void test() { assert(m.empty()); } -int main(int, char**) { +constexpr bool test() { test, std::vector>(); - test, std::vector>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) +#endif + test, std::vector>(); test, MinSequenceContainer>(); test>, std::vector>>(); + return true; +} + +int main(int, char**) { + test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp index 0960c43c5a90a..fc35fec10cd95 100644 --- 
a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp @@ -26,7 +26,7 @@ #include "test_allocator.h" #include "test_macros.h" -int main(int, char**) { +constexpr bool test() { { using A1 = limited_allocator; using A2 = limited_allocator; @@ -74,5 +74,15 @@ int main(int, char**) { assert(c.max_size() <= max_dist); assert(c.max_size() <= alloc_max_size(std::allocator())); } + + return true; +} + +int main(int, char**) { + test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp index 533f8da631fc8..3a99e20235135 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=200000000 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=800000000 // @@ -25,7 +27,7 @@ #include "min_allocator.h" template -void test() { +constexpr void test() { using M = std::flat_multimap, KeyContainer, ValueContainer>; { const M m = {{1, 'a'}, {1, 'b'}, {4, 'd'}, {5, 'e'}, {5, 'h'}}; @@ -47,7 +49,7 @@ void test() { } { M m; - std::size_t s = 1000; + std::size_t s = 500; for (auto i = 0u; i < s; ++i) { m.emplace(i, 'a'); } @@ -60,11 +62,22 @@ void test() { } } -int main(int, char**) { +constexpr bool test() { test, std::vector>(); - test, std::vector>(); +#ifndef __cpp_lib_constexpr_deque + if (!TEST_IS_CONSTANT_EVALUATED) 
+#endif + test, std::vector>(); test, MinSequenceContainer>(); test>, std::vector>>(); + return true; +} + +int main(int, char**) { + test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp index 3e155eb2a1075..596da81f6e940 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp @@ -8,12 +8,13 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 -// +// // template // explicit flat_multimap(const Allocator& a); #include +#include #include #include #include @@ -22,7 +23,23 @@ #include "test_allocator.h" #include "../../../test_compare.h" -int main(int, char**) { +template